<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychiatry</journal-id>
<journal-title>Frontiers in Psychiatry</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychiatry</abbrev-journal-title>
<issn pub-type="epub">1664-0640</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyt.2025.1508772</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Psychiatry</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Diagnosis of depression based on facial multimodal data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Jin</surname>
<given-names>Nani</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2860092"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ye</surname>
<given-names>Renjia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Peng</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Materdicine Lab, School of Life Sciences, Shanghai University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Research Department, Third Xiangya Hospital of Central South University</institution>, <addr-line>Changsha</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Steven Fernandes, Creighton University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Alwin Poulose, Indian Institute of Science Education and Research, India</p>
<p>Kuldeep Singh, Guru Nanak Dev University, India</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Peng Li, <email xlink:href="mailto:3308260670@qq.com">3308260670@qq.com</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>28</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1508772</elocation-id>
<history>
<date date-type="received">
<day>13</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>01</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Jin, Ye and Li</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Jin, Ye and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Depression is a serious mental health disease. Traditional scale-based depression diagnosis methods often have problems of strong subjectivity and high misdiagnosis rate, so it is particularly important to develop automatic diagnostic tools based on objective indicators.</p>
</sec>
<sec>
<title>Methods</title>
<p>This study proposes a deep learning method that fuses multimodal data to automatically diagnose depression using facial video and audio data. We use a spatiotemporal attention module to enhance the extraction of visual features and combine a Graph Convolutional Network (GCN) with a Long Short-Term Memory (LSTM) network to analyze the audio features. Through multi-modal feature fusion, the model can effectively capture different feature patterns related to depression.</p>
</sec>
<sec>
<title>Results</title>
<p>We conduct extensive experiments on the publicly available clinical dataset, the Extended Distress Analysis Interview Corpus (E-DAIC). The experimental results show that we achieve robust accuracy on the E-DAIC dataset, with a Mean Absolute Error (MAE) of 3.51 in estimating PHQ-8 scores from recorded interviews.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Compared with existing methods, our model shows excellent performance in multi-modal information fusion, which is suitable for early evaluation of depression.</p>
</sec>
</abstract>
<kwd-group>
<kwd>depression</kwd>
<kwd>multi-modal data</kwd>
<kwd>feature fusion</kwd>
<kwd>spatial-temporal attention</kwd>
<kwd>artificial intelligence</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="6"/>
<equation-count count="9"/>
<ref-count count="32"/>
<page-count count="15"/>
<word-count count="6224"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Computational Psychiatry</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Depression, also known as clinical depression or depressive disorder, is a prevalent and serious mental health condition that manifests through persistent low mood, lack of energy, and other symptoms that significantly impact an individual&#x2019;s thoughts, emotions, behaviors, and overall health (<xref ref-type="bibr" rid="B1">1</xref>). According to the World Health Organization, approximately 280 million people worldwide suffer from depression, with 15% of those affected eventually dying by suicide (<xref ref-type="bibr" rid="B2">2</xref>). The multifaceted nature of depression, influenced by social, psychological, and biological factors, underscores the necessity for a comprehensive approach to its treatment (<xref ref-type="bibr" rid="B3">3</xref>). Long-term stress, genetic predispositions, and adverse social environments are key contributors to the onset of depression, necessitating multifaceted treatment strategies to help patients regain a healthy life.</p>
<p>Traditional methods for diagnosing depression often rely on clinical evaluations by doctors and self-reports from patients. These scale-based methods are fraught with challenges such as high subjectivity, potential misdiagnosis, regional disparities, and a general lack of medical awareness. Moreover, the subtle nature of depressive symptoms means many individuals fail to recognize their condition promptly, leading to delayed treatment and worsening symptoms. Therefore, developing auxiliary diagnostic tools based on objective indicators is crucial for improving early diagnosis and treatment outcomes.</p>
<p>Recent advancements in artificial intelligence (AI) and deep learning have introduced new possibilities for assisting in the diagnosis of depression. These technologies have shown promise in identifying patterns and features indicative of depression through various data modalities (<xref ref-type="bibr" rid="B4">4</xref>). However, there are still some limitations in the research aimed at automatic diagnosis of depression. Some studies only consider global features and ignore local features in facial video data, which may lead to insufficient capture of subtle facial changes related to depression. Other studies only consider video data without combining audio information, ignoring the importance of multimodal information. In addition, the design of some models is too complex, which leads to the poor interpretability of the model and the difficulty in understanding its inner mechanism. We address the limitations of previous related work and propose a novel multimodal deep convolutional network, aiming to overcome these problems and provide a more efficient solution for the automatic diagnosis of depression.</p>
<p>In this study, we propose a novel deep learning approach that leverages multimodal data fusion to automatically diagnose depression using facial video and audio data. Our method enhances the extraction of visual features through a spatiotemporal attention module and combines Graph Convolutional Networks (GCN) and Long Short-Term Memory (LSTM) networks to analyze audio features. By integrating these multimodal features, our model effectively captures diverse patterns associated with depression. Our experimental results demonstrate that the proposed method outperforms existing approaches in terms of performance metrics, making it a promising tool for the early evaluation and diagnosis of depression. The main contributions of our study are as follows:</p>
<list list-type="order">
<list-item>
<p>We introduce a novel multimodal network architecture that comprehensively integrates video and audio information, significantly enhancing the reliability of depression diagnosis.</p>
</list-item>
<list-item>
<p>We design a feature fusion model that effectively combines&#xa0;temporal and spatial features, providing a more comprehensive representation of video data and facilitating a deeper analysis of the patient&#x2019;s psychological state.</p>
</list-item>
<list-item>
<p>We employ a combined GCN and LSTM model to process audio data, constructing a graph structure to analyze Mel-Frequency Cepstral Coefficients (MFCC), thereby improving the interpretability and accuracy of the diagnostic process.</p>
</list-item>
</list>
<p>This paper is organized into five sections. The first section presents the research background, discusses the status and challenges of depression identification, and introduces the objectives and significance of the study. The second section reviews recent methods for depression evaluation using both single-modal and multi-modal data. The third section details the proposed method, including the overall network architecture and its components. Section four includes the experimental environment, training process, dataset details, results and discussions. Finally, section five summarizes the contributions of this study, evaluates the advantages and limitations of the proposed method, and outlines directions for future research.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Studies have shown that depression state is closely related to patients&#x2019; head and face activities (<xref ref-type="bibr" rid="B5">5</xref>). Currently, some studies have tried multi-modal fusion of facial video information and other types of data, such as voice features and text information, to improve the accuracy of depression diagnosis. By utilizing multiple sources of information, the emotional state and psychological characteristics of patients can be captured more comprehensively, leading to more accurate assessment.</p>
<p>Al Jazaery and Guo (<xref ref-type="bibr" rid="B6">6</xref>) used 3D convolutional neural networks to extract deep spatio-temporal features of closely cropped aligned facial regions and relatively large head regions respectively, and then used recurrent neural networks to continue learning spatio-temporal information for final prediction. It is the first application of 3D convolutional neural networks to depression level analysis and shows great promise. But it focuses more on visual-based non-verbal data and does not take audio into account. Sun Haohao et&#xa0;al. (<xref ref-type="bibr" rid="B7">7</xref>) performed face detection, alignment and cropping on video frames in AVEC2013 (<xref ref-type="bibr" rid="B8">8</xref>) and AVEC2014 (<xref ref-type="bibr" rid="B9">9</xref>) depression databases to obtain the whole face image and the local eye and mouth region. Then, the deep convolutional neural network that fuses the attention mechanism of the channel layer is used to extract the corresponding global features and local features. The multiple visual features learned are more discriminative than the global features alone. But this study does not consider the influence of the audio. Yuchen Pan et&#xa0;al. (<xref ref-type="bibr" rid="B10">10</xref>) proposed the Spatio-Temporal Attention Depression Recognition Network (STA-DRN), which mainly uses the spatio-temporal attention (STA) mechanism to generate spatial and temporal attention vectors, so as to capture the global and local spatio-temporal relationships of features. In the STA module, there is also an attention vector fusion strategy that fuses spatial and temporal domain information. This model can capture the dynamic change process of facial expression and enhance the feature correlation in the process of depression recognition. JH Kim et&#xa0;al. 
(<xref ref-type="bibr" rid="B11">11</xref>) introduced the customized VGG-19 (CVGG-19) architecture, which integrates designs from VGG, Inception-v1, ResNet, and Xception to enhance facial emotion recognition (FER). The CVGG-19 significantly improves performance by 59.29% and reduces computational cost by 89.5% compared to the original VGG-19. Additionally, CVGG-19 achieves an average F1-score that is 3.86% higher than the Inception-v1, ResNet50, and Xception architectures. Constantino &#xc1;lvarez Casado et&#xa0;al. (<xref ref-type="bibr" rid="B12">12</xref>) extracted remote photoplethysmography (rPPG) signals directly from facial videos and computed a variety of statistical, geometric and physiological features including heart rate. These features were fed into machine learning regression models to identify different levels of depression. The results of this approach are comparable to those of other audiovisual models based on voice or facial expression.</p>
<p>Some studies only focus on audio information for depression recognition. Momoko Ishimaru et&#xa0;al. (<xref ref-type="bibr" rid="B13">13</xref>) fed the feature vector converted from audio data into a graph convolution layer and a dense layer in turn, and finally obtained the prediction score. This new regression model uses the generated graph-structured data to express correlations between audio features, which can be exploited to assess the severity of depression. Li et&#xa0;al. (<xref ref-type="bibr" rid="B14">14</xref>) built speech signals into speech graphs based on feature similarity and fed them into a Graph-LSTM neural network for classification. The network is a new graph neural network structure combining an LSTM aggregator and weighted pool, which enhances the interpretability of the model and can effectively identify speech emotional features. However, the model also has the shortcomings of redundant speech graph features and a lack of visual features.</p>
<p>Some advancements have been made in bimodal speech emotion recognition (SER) using both acoustic and text data, focusing on the significance of attention mechanisms and fusion strategies in combination with traditional deep learning techniques. However, challenges remain, such as limited datasets and difficulties in data acquisition (<xref ref-type="bibr" rid="B15">15</xref>).</p>
<p>Uddin et&#xa0;al. (<xref ref-type="bibr" rid="B16">16</xref>) input the preprocessed audio clips and video clips into the spatio-temporal network based on audio and video. The dynamic feature descriptor Volume Local Directional Structural Pattern is introduced to encode the structure, so as to extract the dynamic facial features. Then, Temporal Attentive Pooling is used to summarize the segmentation features, and Multi-modal factorized bilinear pooling is used to fuse the multimodal features. Finally, the corresponding BDI-II scores were obtained by regression to estimate the severity of depression. This method has strong feature extraction ability and considers multi-modal data but ignores the association between high-level semantic features and channels. Ming Fang et&#xa0;al. (<xref ref-type="bibr" rid="B17">17</xref>) comprehensively considered video, audio and text information, and designed a multi-modal fusion model with multi-level attention mechanism (MFM-Att) for depression detection. The model uses two LSTMs to learn video and audio features, and a Bi-LSTM with attention mechanism to learn text features, and then inputs these three outputs into the MFM-Att for feature fusion. This design can make information complementary between different modalities. However, the complexity of the model needs to be improved.</p>
<p>Improving the interpretability of diagnostic models for depression is crucial for clinical practice. David Gimeno-Gomez et&#xa0;al. (<xref ref-type="bibr" rid="B18">18</xref>) present a simple and flexible multimodal temporal model capable of recognizing nonverbal cues to depression from noisy real-world videos. They visualize the level to which these features contribute to the results through integrated gradients (<xref ref-type="bibr" rid="B19">19</xref>) based on audio-speech embeddings, facial emotion embeddings, facial, body and hand signatures, as well as gaze and blink information.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<p>
<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> shows the framework of the proposed method for diagnosing depression based on multimodal data. Firstly, visual information and audio information are extracted from the recorded videos of the participants, and the two kinds of information are preprocessed separately. Then, feature extraction is performed on the preprocessed data and the multimodal feature set is obtained by feature fusion. After that, the processed features are classified and the respective classification results are output. This framework allows the model to synthesize visual and audio information, which helps to deeply mine the hidden information in the data. In the process of facial behavior feature extraction, we use the spatio-temporal attention module to strengthen the correlation between features and video frames. For audio features, GCN and LSTM are mainly used.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Framework for depression diagnosis.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g001.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>Visual feature extraction</title>
<p>In order to effectively extract information from the facial behavior features, we propose Temporal-Spatial Network for Depression Diagnosis (TSNet-DD). The proposed model incorporates a temporal attention module and a spatial attention module to capture global and local features at the temporal and spatial levels from video frames. The core of TSNet-DD is that it can use the Temporal-Spatial Attention Module (TSAM) to enhance the correlation between pixels and frames.</p>
<p>The overall architecture of TSNet-DD network is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>. The initial layer of the network uses a 7&#xd7;7&#xd7;7 convolution kernel to perform downsampling with a step size of 1&#xd7;2&#xd7;2 to extract low-level features of the input image. Next, a 3&#xd7;3&#xd7;3 pooling operation with a step size of 1&#xd7;2&#xd7;2 is performed in the second layer, and the resulting features are denoted as <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The subsequent module1, module2, module3 and module4 correspond to different convolutional layers in the ResNet, and each module consists of a different number of residual blocks. TSNet32-DD corresponds to the ResNet18, that is, each module contains two residual blocks, and two sub-modules are also contained within each residual block. In ResNet18, these submodules are 3&#xd7;3 convolutional layers, while in our network, we introduce TSAM. Therefore, a total of 32 TSAMs are used in the ResNet18-based network, and we refer to this network as TSNet32-DD.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Architecture of TSNet-DD.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g002.tif"/>
</fig>
<p>Similarly, in the ResNet34-based network, there are 3,4,6 and 3 residual blocks in each module (sections marked red in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>). Each residual block still contains two TSAMs, resulting in a total of 64 TSAMs in the final network; thus, this network is called TSNet64-DD. After all residual modules, a pooling operation is performed on the feature map to resample the features into fixed shapes, and finally a fully connected layer is used to classify the subjects.</p>
<p>The feature extraction module TSAM in TSNet-DD contains a temporal attention module and a spatial attention module. These two modules are used to generate the temporal attention weight vector <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and spatial attention weight vector <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the input <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, so as to obtain the corresponding temporal feature <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and spatial feature <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Then, these two kinds of features are fused to capture the intrinsic relationship between spatial-temporal features, assigning adaptive weights to the features with spatio-temporal information. The structure of TSAM is shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Temporal-spatial attention mechanism.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g003.tif"/>
</fig>
<p>The fusion process of temporal attention module and spatial attention module could be expressed by the following formula:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<sec id="s3_1_1">
<label>3.1.1</label>
<title>Temporal attention module</title>
<p>For video data of patients with depression, inter-frame temporal changes are crucial for depression recognition. Such temporal changes can be short-term or long-term dynamics spanning several seconds. Although short-term features can capture dynamic information between several frames, their ability to extract long-term dynamic features is limited. To address this problem, we introduce a Temporal Attention Module (TAM) for enhancing temporal information. The specific structure of this module is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Temporal attention module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g004.tif"/>
</fig>
<p>In the TSNet-DD model, after the 3&#xd7;3&#xd7;3 pooling operation in the second layer, the feature <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is obtained, and its size is assumed to be H&#xd7;W&#xd7;C. The feature <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is sent to the TAM. Firstly, the global average pooling and max pooling operations are performed respectively to obtain two 1&#xd7;1&#xd7;C channel descriptions. Subsequently, these two descriptions are fed into a two-layer neural network with shared weights for processing. Then the resulting two features are added and the weight coefficient <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is obtained through the sigmoid activation function. Finally, <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is multiplied with the original input feature <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to obtain the new scaled feature <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The process of TAM can be summarized as follows:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>To further visualize the architecture and data transfer process of TAM, we show its pseudo-code
in <xref ref-type="boxed-text" rid="algo1">
<bold>Algorithm 1</bold>
</xref>.</p>
<boxed-text id="algo1" position="float">
<label>Algorithm 1</label>
<title>Pseudocode of temporal attention module.</title>
<p><graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g014.tif"/></p>
</boxed-text>
</sec>
<sec id="s3_1_2">
<label>3.1.2</label>
<title>Spatial attention module</title>
<p>In our collection of videos about people with depression, some useful features usually appear in a sequence of consecutive video frames. Therefore, whether features can identify spatial order information is crucial in depression diagnosis. In addition, different locations of the face have their own unique features, and there are subtle relationships between these location features that cannot be captured by our naked eyes (<xref ref-type="bibr" rid="B20">20</xref>). With this in mind, we employ a Spatial Attention Module (SAM) to generate spatial vectors to capture the spatial information. The structure of SAM is shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Spatial Attention Module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g005.tif"/>
</fig>
<p>In SAM, the input feature <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is used to determine where the features are meaningful. Firstly, average pooling and maximum pooling along the channel dimension are performed on <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> respectively to obtain two channel descriptions of size H&#xd7;W&#xd7;1, then the two descriptions are concatenated together in the channel dimension. Next, through a 7&#xd7;7 convolutional layer and the activation function sigmoid, the weight coefficient <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is obtained. Finally, <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is multiplied with the input feature <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to obtain the final spatial attention vector <inline-formula>
<mml:math display="inline" id="im28">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. This process can be expressed as follows:</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>*</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The flow of the Spatial Attention Module is shown in <xref ref-type="boxed-text" rid="algo2">
<bold>Algorithm 2</bold>
</xref>.</p>
<boxed-text id="algo2" position="float">
<label>Algorithm 2</label>
<title>Pseudocode of spatial attention module.</title>
<p><graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g015.tif"/></p>
</boxed-text>
</sec>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Audio feature extraction</title>
<p>Researchers have found that people with depression tend to speak in a more monotonous and lower tone than healthy controls (<xref ref-type="bibr" rid="B21">21</xref>). Therefore, in addition to the analysis of visual features, it is particularly important to mine the key features hidden in speech signals for the diagnosis of depression. See <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref> for the audio feature processing method used in this paper.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Audio feature extraction.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g006.tif"/>
</fig>
<p>We use Mel-frequency cepstral coefficients (MFCC) of audio data as effective features for
depression recognition (<xref ref-type="bibr" rid="B22">22</xref>). MFCC takes into account the
auditory properties of the human ear and can well capture the features in speech. The calculation
process of MFCC is as follows: Firstly, the input audio signal is pre-emphasized to enhance the
high-frequency components. Then, the pre-emphasized signal is divided into multiple short-time frames,
and a window is applied to each frame to reduce the spectral leakage. Next, the fast Fourier
transform is performed on each windowed frame to convert the time domain signal to the frequency
domain. Then the power spectrum of each frame is calculated. Finally, the power spectrum is passed
through a bank of Mel filters, and its output is log transformed and discrete cosine transformed. At
this point, we have the MFCC feature vector for each frame of the input audio. After that, we
consider the MFCC feature vector of each frame as a node, and calculate the feature similarity of
each node based on the Euclidean distance between its feature vectors, so as to construct the edges
between nodes. In this process, we set a threshold of 0.5 to limit the addition of edges, that is,
only adding edges between nodes with high enough feature similarity and small enough distance.
Finally, we assign a corresponding weight to each edge based on the inverse of the distance to
better capture the local and global features in the audio signal. The specific algorithmic
architecture of MFCC Calculation is shown in <xref ref-type="boxed-text" rid="algo3">
<bold>Algorithm 3</bold>
</xref>.</p>
<boxed-text id="algo3" position="float">
<label>Algorithm 3</label>
<title>Pseudocode of MFCC calculation.</title>
<p><graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g016.tif"/></p>
</boxed-text>
<p>To explore the complex patterns and temporal features of audio data, we input the constructed
graph structure data into GCN and LSTM for processing. GCN aggregates the features of nodes and
their neighbors through convolution operations. LSTM captures long- and short-term dependencies
in the sequence. <xref ref-type="boxed-text" rid="algo4">
<bold>Algorithm 4</bold>
</xref> further details the process of combining GCN and LSTM. By combining GCN and LSTM, we can capture the high-level graph features of each node and the dynamics and dependencies in the time series. Finally, by subsampling and classifying the complex features, we can obtain a diagnosis of whether the speaker in the audio has depression or not.</p>
<boxed-text id="algo4" position="float">
<label>Algorithm 4</label>
<title>Pseudocode of GCN-LSTM model.</title>
<p><graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g017.tif"/></p>
</boxed-text>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Video-audio fusion</title>
<p>Assuming that the final extracted visual feature is <inline-formula>
<mml:math display="inline" id="im50">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the obtained audio feature is <inline-formula>
<mml:math display="inline" id="im51">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we now discuss how to fuse these two features. In view of the fact that not all modality features play a positive role in the severity assessment of depression, we propose a Video-Audio Fusion Network (VAFN) to fuse the feature information of the two modalities. The structure of VAFN is shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>. The input of VAFN is the multi-modal feature <inline-formula>
<mml:math display="inline" id="im52">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the output feature is the fused <inline-formula>
<mml:math display="inline" id="im53">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Video and audio data differ in nature, which leads to different feature vector dimensions. Therefore, in the actual processing, we first perform zero-padding on the smaller of <inline-formula>
<mml:math display="inline" id="im54">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im55">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to ensure that the resulting dimensions of <inline-formula>
<mml:math display="inline" id="im56">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im57">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are consistent. Then, <inline-formula>
<mml:math display="inline" id="im58">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im59">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are superimposed in the horizontal and vertical directions respectively to obtain <inline-formula>
<mml:math display="inline" id="im60">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im61">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. A fully connected layer is used to reduce the dimension of <inline-formula>
<mml:math display="inline" id="im62">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the attention weight vector <inline-formula>
<mml:math display="inline" id="im63">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is obtained. Finally, <inline-formula>
<mml:math display="inline" id="im64">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im65">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are multiplied to obtain the final multi-modal fusion feature <inline-formula>
<mml:math display="inline" id="im66">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The obtained fusion features are max-pooled and classified to obtain the
depression prediction results. The entire fusion process described above is summarized in <xref
ref-type="boxed-text" rid="algo5">
<bold>Algorithm 5</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Video-audio fusion network.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g007.tif"/>
</fig>
<boxed-text id="algo5" position="float">
<label>Algorithm 5</label>
<title>Pseudocode of VAFN.</title>
<p><graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g018.tif"/></p>
</boxed-text>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental settings</title>
<p>The GPU used in this paper is an NVIDIA RTX 3090. The development and testing are carried out in a Python 3.9 environment, and the integrated development tool is PyCharm. We use PyTorch v1.12.0 as the deep learning framework and use CUDA 11.6 in the model training process. The operating system is Windows 10. In order to alleviate the over-fitting problem, we use the AdamW optimizer for training, and add a Dropout layer to the network backbone. The dropout rate is set within the range [0.4, 0.6]. As for optimization, the learning rate is set to be 10e&#x2212;4 for modality feature extraction and 5&#xd7;10e&#x2212;5 for modality fusion with a linear schedule strategy. During training, the number of iterations is consistently set to 5000.</p>
<p>In this study, we used the Extended Distress Analysis Interview Corpus (E-DAIC) dataset (<xref ref-type="bibr" rid="B23">23</xref>, <xref ref-type="bibr" rid="B24">24</xref>) to validate our proposed method. The E-DAIC dataset is an extended version of the DAIC-WOZ dataset (<xref ref-type="bibr" rid="B23">23</xref>) and consists of semi-structured clinical interviews designed to identify psychological distress conditions such as anxiety, depression, and PTSD. In the E-DAIC dataset, OpenSMILE (<xref ref-type="bibr" rid="B25">25</xref>) was used to extract the acoustic features of subjects, including Mel-frequency cepstral coefficients (MFCC) (<xref ref-type="bibr" rid="B26">26</xref>), and OpenFace (<xref ref-type="bibr" rid="B27">27</xref>) was used to extract the corresponding visual features. These include facial features, eye gaze, head pose, and facial action units. To protect the privacy of participants, the dataset provides these extracted features directly instead of raw video recordings. The E-DAIC dataset consists of clinical interview transcripts from 219 participants, along with corresponding assessments of depression and PTSD severity. To ensure a representative distribution of the data, the training set contains 163 samples, the validation set contains 56 samples, and the test set contains 10 samples. Each participant in the E-DAIC dataset was annotated according to their Patient Health Questionnaire-8 (PHQ-8) score (<xref ref-type="bibr" rid="B28">28</xref>), with scores of 10 or higher classified as 1 (indicating the presence of depression) and scores lower than 10 labeled as 0 (indicating the absence of depression).</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Evaluation metrics</title>
<p>The measures used to evaluate the depression diagnostic model included the F1 score (<xref ref-type="disp-formula" rid="eq7">Equation 7</xref>) (<xref ref-type="bibr" rid="B29">29</xref>), root-mean-square error (RMSE, <xref ref-type="disp-formula" rid="eq8">Equation 8</xref>) and mean absolute error (MAE, <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>) (<xref ref-type="bibr" rid="B30">30</xref>). The F1 score is the harmonic mean of precision and recall (<xref ref-type="disp-formula" rid="eq6">Equation 6</xref>) and is used to comprehensively measure the performance of the depression diagnostic model. RMSE can reveal how the model performs in extreme cases, such as severely overestimating or underestimating a patient&#x2019;s depression, which can have a significant impact on clinical decision making. MAE gives the average absolute difference between the model prediction and the actual value.</p>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>N</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
<sec id="s4_3" sec-type="results">
<label>4.3</label>
<title>Results</title>
<sec id="s4_3_1">
<label>4.3.1</label>
<title>Comparison with other methods</title>
<p>In this study, we preliminarily use video data and audio data separately for depression recognition based on single-modal features, and the results are shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. Specifically, for video features, we compare the proposed TSNet-DD with the models in references (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B10">10</xref>, <xref ref-type="bibr" rid="B12">12</xref>). The results show that TSNet-DD consistently outperforms the other three models, and TSNet64-DD outperforms TSNet32-DD. The TSNet64-DD model achieved the highest F1 score of 0.853, demonstrating its ability to capture both spatial and temporal features effectively. This improvement over previous models (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B10">10</xref>, <xref ref-type="bibr" rid="B12">12</xref>) suggests that our temporal-spatial attention mechanism significantly enhances feature extraction. Although the RMSE value of TSNet64-DD (5.23) is slightly higher than that of TSNet32-DD (5.11), the small difference in RMSE here is negligible compared to the advantages of its F1 value and MAE value. For audio data, the GCN-LSTM model achieved an F1 score of 0.827, outperforming previous models (<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>). This indicates that combining GCN and LSTM can effectively capture the complex patterns in audio features related to depression. Although the MAE value of GCN-LSTM is not the lowest, considering the characteristic that MAE is insensitive to outliers and its excellent performance on F1, we believe that GCN-LSTM has a unique advantage in processing audio features of depression.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Results of depression recognition under single-modal features.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">feature</th>
<th valign="top" align="center">model</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">RMSE</th>
<th valign="top" align="center">MAE</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="5" align="center">video</td>
<td valign="top" align="center">RNN-C3D (<xref ref-type="bibr" rid="B6">6</xref>)</td>
<td valign="top" align="center">0.723</td>
<td valign="top" align="center">8.07</td>
<td valign="top" align="center">5.78</td>
</tr>
<tr>
<td valign="top" align="center">STA-DRN (<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="center">0.702</td>
<td valign="top" align="center">8.94</td>
<td valign="top" align="center">6.77</td>
</tr>
<tr>
<td valign="top" align="center">RFR (<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="center">0.710</td>
<td valign="top" align="center">8.49</td>
<td valign="top" align="center">6.57</td>
</tr>
<tr>
<td valign="top" align="center">TSNet32-DD</td>
<td valign="top" align="center">0.800</td>
<td valign="top" align="center">5.11</td>
<td valign="top" align="center">5.03</td>
</tr>
<tr>
<td valign="top" align="center">TSNet64-DD</td>
<td valign="top" align="center">0.853</td>
<td valign="top" align="center">5.23</td>
<td valign="top" align="center">4.45</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="center">audio</td>
<td valign="top" align="center">GCNN (<xref ref-type="bibr" rid="B13">13</xref>)</td>
<td valign="top" align="center">0.690</td>
<td valign="top" align="center">9.28</td>
<td valign="top" align="center">6.65</td>
</tr>
<tr>
<td valign="top" align="center">GLNN (<xref ref-type="bibr" rid="B14">14</xref>)</td>
<td valign="top" align="center">0.788</td>
<td valign="top" align="center">8.43</td>
<td valign="top" align="center">5.04</td>
</tr>
<tr>
<td valign="top" align="center">GCN-LSTM</td>
<td valign="top" align="center">0.827</td>
<td valign="top" align="center">6.67</td>
<td valign="top" align="center">6.28</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Subsequently, we fuse facial video features and audio features to evaluate the performance of multimodal data in depression diagnosis. The results are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. We find that the model based on multi-modal data consistently outperforms the models using only single-modal data in terms of F1 value. The F1 value of our proposed method finally reaches 0.922, which is not only better than the performance of all single-modal data, but also the highest in all experiments based on multi-modal data. This may be due to the diversity of the input data, and it indicates that there are clear differences in facial visual features and voice features between patients with depression and healthy participants. It turns out that the multimodality-based assistive method has its unique significance in depression diagnosis when the privacy of the participants is protected as much as possible.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Results of depression recognition under multi-modal features.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">model</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">RMSE</th>
<th valign="top" align="center">MAE</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">MFM-Att (<xref ref-type="bibr" rid="B17">17</xref>)</td>
<td valign="top" align="center">0.895</td>
<td valign="top" align="center">7.29</td>
<td valign="top" align="center">4.03</td>
</tr>
<tr>
<td valign="top" align="center">GCN</td>
<td valign="top" align="center">0.918</td>
<td valign="top" align="center">6.24</td>
<td valign="top" align="center">3.88</td>
</tr>
<tr>
<td valign="top" align="center">VAFN(TSNet32+GL)</td>
<td valign="top" align="center">0.903</td>
<td valign="top" align="center">5.77</td>
<td valign="top" align="center">4.00</td>
</tr>
<tr>
<td valign="top" align="center">VAFN(TSNet64+GL)</td>
<td valign="top" align="center">0.922</td>
<td valign="top" align="center">6.06</td>
<td valign="top" align="center">3.51</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To provide a comprehensive evaluation of our model&#x2019;s performance, we also included the Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) values. The ROC curve is a graphical representation that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied. The AUC provides an aggregate measure of performance across all possible classification thresholds. Our model&#x2019;s ROC curves for both single-modal and multi-modal data are shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. The AUC values for TSNet64-DD, GCN-LSTM, and VAFN(TSNet64+GL) are summarized in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. The results indicate that our multi-modal fusion model achieves the highest AUC value, further confirming its superior performance in distinguishing between depressed and non-depressed individuals.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Comparison of different ROCs.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g008.tif"/>
</fig>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>AUC values and FLOPs of the different models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">model</th>
<th valign="top" align="center">AUC</th>
<th valign="top" align="center">FLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">TSNet64-DD</td>
<td valign="top" align="center">0.912</td>
<td valign="top" align="center">453,787,648</td>
</tr>
<tr>
<td valign="top" align="center">GCN-LSTM</td>
<td valign="top" align="center">0.880</td>
<td valign="top" align="center">4,915,200</td>
</tr>
<tr>
<td valign="top" align="center">VAFN(TSNet64+GL)</td>
<td valign="top" align="center">0.950</td>
<td valign="top" align="center">463,424,512</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In addition, our proposed models strike a balance between computational complexity and performance. The computational complexity of TSNet-DD is primarily determined by the number of convolutional layers and the attention modules. The attention modules add a small overhead due to the additional operations for attention weight calculation. Despite the added complexity of the attention modules, TSNet-DD is designed to be non-redundant, ensuring efficient processing without unnecessary computational overhead. This balance between complexity and efficiency allows TSNet-DD to achieve high performance while maintaining reasonable computational requirements. GCN-LSTM combines the strengths of GCN and LSTM to process audio features. GCN is used to aggregate features from neighboring nodes, while LSTM captures temporal dependencies. GCN-LSTM is designed to handle the variability and noise in audio data effectively. While the combination of GCN and LSTM increases the computational complexity, the model&#x2019;s ability to capture complex patterns and temporal features justifies the additional computational cost. VAFN fuses the feature information from both video and audio modalities. It uses zero-padding to handle different feature vector dimensions and attention mechanisms to assign adaptive weights to the features. The overall complexity of VAFN depends on the individual complexities of TSNet-DD and GCN-LSTM, along with the additional operations for feature fusion. The fusion process involves concatenation, fully connected layers, and attention weight calculation, which add to the computational load. VAFN is designed to leverage the complementary nature&#xa0;of&#xa0;visual and audio features, resulting in improved diagnostic&#xa0;performance. The fusion process, while adding some computational overhead, is optimized to ensure that the model remains efficient and scalable.</p>
</sec>
<sec id="s4_3_2">
<label>4.3.2</label>
<title>Ablation study</title>
<p>To better understand the contributions of various components in our proposed model, we conducted an ablation study. This study evaluates the impact of the Temporal-Spatial Attention Module (TSAM), the combination of Graph Convolutional Network (GCN) and Long Short-Term Memory (LSTM) for audio features, and the Video-Audio Fusion Network (VAFN). We performed experiments by systematically removing or modifying these components and observing the resulting changes in performance metrics.</p>
<p>We first assess the effect of the TSAM by comparing the full TSNet64-DD model with a variant that does not include the TSAM. The results are shown in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>. It demonstrates that the TSAM significantly improves the performance of the model, highlighting its importance in capturing temporal and spatial features. Next, we evaluate the impact of using GCN and LSTM for audio feature extraction. We compare the full GCN-LSTM model with variants that use only GCN or only LSTM. The results are shown in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>. The results indicate that the combination of GCN and LSTM outperforms the individual models, demonstrating the effectiveness of integrating both graph-based and temporal features for audio data. Finally, we assess the impact of the VAFN by comparing the full VAFN(TSNet64+GL) model with variants that use only video features (TSNet64-DD) or only audio features (GCN-LSTM). The results are shown in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>. The results clearly show that the fusion of video and audio features significantly enhances the performance, confirming the complementary nature of these modalities. The ablation study confirms that each component of our proposed model contributes to its overall performance. The TSAM enhances the extraction of temporal and spatial features from video data, the combination of GCN and LSTM effectively captures complex audio patterns, and the VAFN successfully integrates multimodal features to improve diagnostic accuracy. These findings validate the design choices and highlight the importance of multimodal data fusion in the automatic diagnosis of depression.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Impact of Temporal-Spatial Attention Module (TSAM) on video feature extraction.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">RMSE</th>
<th valign="top" align="center">MAE</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">TSNet64-DD</td>
<td valign="top" align="center">0.853</td>
<td valign="top" align="center">5.23</td>
<td valign="top" align="center">4.45</td>
</tr>
<tr>
<td valign="top" align="center">TSNet64-DD w/o TSAM</td>
<td valign="top" align="center">0.789</td>
<td valign="top" align="center">6.12</td>
<td valign="top" align="center">5.37</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Impact of GCN and LSTM on audio feature extraction.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">RMSE</th>
<th valign="top" align="center">MAE</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">GCN-LSTM</td>
<td valign="top" align="center">0.827</td>
<td valign="top" align="center">6.67</td>
<td valign="top" align="center">6.28</td>
</tr>
<tr>
<td valign="top" align="center">GCN only</td>
<td valign="top" align="center">0.742</td>
<td valign="top" align="center">7.89</td>
<td valign="top" align="center">7.12</td>
</tr>
<tr>
<td valign="top" align="center">LSTM only</td>
<td valign="top" align="center">0.756</td>
<td valign="top" align="center">7.65</td>
<td valign="top" align="center">6.87</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Impact of Video-Audio Fusion Network (VAFN) on multimodal feature fusion.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Model</th>
<th valign="top" align="center">F1</th>
<th valign="top" align="center">RMSE</th>
<th valign="top" align="center">MAE</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">VAFN(TSNet64+GL)</td>
<td valign="top" align="center">0.922</td>
<td valign="top" align="center">6.06</td>
<td valign="top" align="center">3.51</td>
</tr>
<tr>
<td valign="top" align="center">TSNet64-DD</td>
<td valign="top" align="center">0.853</td>
<td valign="top" align="center">5.23</td>
<td valign="top" align="center">4.45</td>
</tr>
<tr>
<td valign="top" align="center">GCN-LSTM</td>
<td valign="top" align="center">0.827</td>
<td valign="top" align="center">6.67</td>
<td valign="top" align="center">6.28</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_3_3">
<label>4.3.3</label>
<title>Effects of different subject groupings</title>
<p>We divide the dataset into three groups by sex: a male group, a female group, and a mixed-gender group. For each subset, we conducted experiments on single-modal and multimodal features, respectively. In the video-modality experiments, we adopt TSNet64, which has better performance. The experimental results are shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>, <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref> and <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>. We find that for each modality, the F1 values of the mixed-gender group are consistently lower than those obtained on either the male or the female group alone. In addition, the female group almost always outperformed the male group, which may be due to a class imbalance between the samples. It also suggests that men and women differ in the information conveyed in facial behavior and speech during the diagnosis of depression. Moreover, for each subject group, the experimental results based on the multimodal feature set are generally better than those based on the single-modal feature set, mainly because the more diverse training data provide richer features.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Comparison of different subject groups on video modal features.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g009.tif"/>
</fig>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Comparison of different subject groups on audio modal features.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g010.tif"/>
</fig>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Comparison of different subject groups on multimodal features.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g011.tif"/>
</fig>
</sec>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Interpretability analysis</title>
<p>Deep learning models are usually &#x2018;black boxes&#x2019;, but in clinical practice we need more transparent models to increase their credibility and interpretability. Therefore, we perform an interpretability analysis of our model. We show the attribution scores for audio, gaze, action unit (AU), and pose, where higher values indicate stronger attribution to positive predictions. The E-DAIC dataset has more than 19,000 frames of facial images for each sample, and we aggregate every 100 frames into one segment to explain depression detection. As shown in <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref>, AU contributes most to the model&#x2019;s diagnostic results, followed by pose and audio, while gaze contributes least. We further visualize the degree of influence of each AU feature on the diagnostic results of different frames through the contribution matrix. As can be seen in <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref>, AU04, AU05, AU14, AU15, AU17, AU23, AU26, and AU45 have a higher degree of influence, indicating a stronger correlation with depression. These units thus play a more significant role in the diagnostic process for depression.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Contribution of different indicators over frames.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g012.tif"/>
</fig>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>Contribution matrix of AU and frames.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyt-16-1508772-g013.tif"/>
</fig>
</sec>
<sec id="s4_5" sec-type="discussion">
<label>4.5</label>
<title>Discussions</title>
<p>This study introduces a novel multi-modal deep convolutional network that leverages multi-source data fusion to provide a more effective solution for the automatic diagnosis of depression. We utilize a feature fusion module to effectively integrate temporal and spatial features, thereby extracting a more comprehensive representation that is conducive to analyzing the psychological state of patients. Our model&#x2019;s complexity is balanced by its non-redundant design, ensuring efficient processing without unnecessary computational overhead. From our comprehensive test results, several noteworthy insights can be gathered. The proposed TSNet-DD model for video data demonstrates significant advantages in capturing both spatial and temporal features. For audio data, the GCN-LSTM model effectively captures complex patterns related to depression. The fusion of video and audio features further improves diagnostic performance, demonstrating the complementary nature of visual and audio features.</p>
<p>However, it is important to acknowledge some limitations inherent in our study. First, due to the lack of publicly available high-quality datasets in this field, our research focuses specifically on the E-DAIC dataset. Future research should aim to extend this approach to different datasets covering various psychiatric disorders. Additionally, it is important to categorize different levels of depression, which we plan to address in future studies. Furthermore, inspired by ACFun (<xref ref-type="bibr" rid="B31">31</xref>) and LMAC-ZS (<xref ref-type="bibr" rid="B32">32</xref>), future research could explore the integration of additional data types, such as textual information, into the model to enhance classification performance. This integration could provide the model with a more comprehensive contextual understanding, thereby improving its ability to recognize emotional states. Considering the inherent black-box limitation of deep learning, we can draw on the methodologies proposed in LMAC-ZS to enhance our model&#x2019;s interpretability. This kind of interpretability mechanism not only contributes to transparency in clinical applications but also provides significant directions for future research.</p>
<p>Overall, our results underscore the importance of multi-modal data in improving the accuracy and reliability of depression diagnosis. Future work should focus on expanding the dataset to include more diverse populations and exploring additional modalities such as text or physiological signals to further enhance diagnostic capabilities.</p>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>On the premise of protecting patient privacy, this paper discusses a method for achieving high-precision diagnosis of depression. We design the TSNet-DD architecture for video data, which comprehensively considers the temporal and spatial features of video frames through the temporal-spatial attention module (TSAM). For audio data, we use a combination of GCN and LSTM to capture high-level graph features and temporal dynamics. Finally, multi-modal feature fusion is realized through the video-audio fusion network. The experimental results show that our method has certain potential in the automatic diagnosis of depression. In the future, researchers can further explore the sensitive features for automatic recognition of depression through larger data sets and more diverse modalities, so as to improve the recognition accuracy and provide more powerful diagnostic and treatment support for clinicians. In addition, it is crucial to classify different levels of depression, which we plan to address in future studies.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://dcapswoz.ict.usc.edu/extended-daic-database-download/">https://dcapswoz.ict.usc.edu/extended-daic-database-download/</uri> (USC).</p>
</sec>
<sec id="s8" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>Ethical review and approval was not required for the study on human participants in accordance with the local legislation and institutional requirements. Written informed consent from the participants or their legal guardian/next of kin was not required to participate in this study in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>NJ: Conceptualization, Data curation, Investigation, Methodology, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. RY: Formal analysis, Methodology, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. PL: Conceptualization, Formal analysis, Methodology, Project administration, Resources, Software, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s10" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research, authorship, and/or publication of this article.</p>
</sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fried</surname> <given-names>EI</given-names>
</name>
<name>
<surname>Nesse</surname> <given-names>RM</given-names>
</name>
</person-group>. <article-title>Depression sum-scores don&#x2019;t add up: why analyzing specific depression symptoms is essential</article-title>. <source>BMC Med</source>. (<year>2015</year>) <volume>13</volume>:<fpage>72</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s12916-015-0325-4</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2</label>
<citation citation-type="web">
<person-group person-group-type="author">
<collab>WHO</collab>
</person-group>. <article-title>Depressive disorder fact sheet</article-title> (<year>2023</year>). <publisher-name>World Health Organization</publisher-name>. Available online at: <uri xlink:href="https://www.who.int/news-room/fact-sheets/detail/depression/">https://www.who.int/news-room/fact-sheets/detail/depression/</uri> (Accessed <access-date>22 February 2024</access-date>).</citation>
</ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schaakxs</surname> <given-names>R</given-names>
</name>
<name>
<surname>Comijs</surname> <given-names>HC</given-names>
</name>
<name>
<surname>van der Mast</surname> <given-names>RC</given-names>
</name>
<name>
<surname>Schoevers</surname> <given-names>RA</given-names>
</name>
<name>
<surname>Beekman</surname> <given-names>ATF</given-names>
</name>
<name>
<surname>Penninx</surname> <given-names>BWJH</given-names>
</name>
</person-group>. <article-title>Risk factors for depression: differential across age?</article-title> <source>Am J Geriatric Psychiatry</source>. (<year>2017</year>) <volume>25</volume>:<page-range>966&#x2013;77</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jagp.2017.04.004</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname> <given-names>S</given-names>
</name>
<name>
<surname>Calhoun</surname> <given-names>VD</given-names>
</name>
<name>
<surname>Sui</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>Machine learning in major depression: From classification to treatment outcome prediction</article-title>. <source>CNS Neurosci Ther</source>. (<year>2018</year>) <volume>24</volume>:<page-range>1037&#x2013;52</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/cns.13048</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jan</surname> <given-names>A</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>H</given-names>
</name>
<name>
<surname>Gaus</surname> <given-names>YFA</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>F</given-names>
</name>
<name>
<surname>Turabzadeh</surname> <given-names>S</given-names>
</name>
</person-group>. (<year>2014</year>). <article-title>Automatic depression scale prediction using facial expression dynamics and regression</article-title>. In <conf-name>Proceedings of the 4th International Workshop on Audio/Visual Emotion Challenge</conf-name>. pp. <fpage>73</fpage>&#x2013;<lpage>80</lpage>.</citation>
</ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al Jazaery</surname> <given-names>M</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>G</given-names>
</name>
</person-group>. <article-title>Video-based depression level analysis by encoding deep spatiotemporal features</article-title>. <source>IEEE Trans Affect Computing</source>. (<year>2018</year>) <volume>12</volume>:<page-range>262&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2018.2870884</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Shang</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Channel-wise attention mechanism-relevant multi-branches convolutional network-based depressive disorder recognition</article-title>. <source>J Image Graphics</source>. (<year>2022</year>) <volume>27</volume>:<page-range>3292&#x2013;302</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.11834/jig.210397</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Valstar</surname> <given-names>M</given-names>
</name>
<name>
<surname>Schuller</surname> <given-names>B</given-names>
</name>
<name>
<surname>Smith</surname> <given-names>K</given-names>
</name>
<name>
<surname>Eyben</surname> <given-names>F</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>B</given-names>
</name>
<name>
<surname>Bilakhia</surname> <given-names>S</given-names>
</name>
<etal/>
</person-group>. (<year>2013</year>). <article-title>Avec 2013: the continuous audio/visual emotion and depression recognition challenge</article-title>. In <conf-name>Proceedings of the 3rd ACM international workshop on Audio/visual emotion challenge</conf-name>. pp. <fpage>3</fpage>&#x2013;<lpage>10</lpage>.</citation>
</ref>
<ref id="B9">
<label>9</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Valstar</surname> <given-names>M</given-names>
</name>
<name>
<surname>Schuller</surname> <given-names>B</given-names>
</name>
<name>
<surname>Smith</surname> <given-names>K</given-names>
</name>
<name>
<surname>Almaev</surname> <given-names>T</given-names>
</name>
<name>
<surname>Eyben</surname> <given-names>F</given-names>
</name>
<name>
<surname>Krajewski</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. (<year>2014</year>). <article-title>Avec 2014: 3d dimensional affect and depression recognition challenge</article-title>. In <conf-name>Proceedings of the 4th international workshop on audio/visual emotion challenge</conf-name>. pp. <fpage>3</fpage>&#x2013;<lpage>10</lpage>.</citation>
</ref>
<ref id="B10">
<label>10</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pan</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Shang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>T</given-names>
</name>
<name>
<surname>Shao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>G</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>H</given-names>
</name>
<etal/>
</person-group>. <article-title>Spatial&#x2013;temporal attention network for depression recognition from facial videos</article-title>. <source>Expert Syst Appl</source>. (<year>2024</year>) <volume>237</volume>:<fpage>121410</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.eswa.2023.121410</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname> <given-names>JH</given-names>
</name>
<name>
<surname>Poulose</surname> <given-names>A</given-names>
</name>
<name>
<surname>Ha</surname> <given-names>DS</given-names>
</name>
</person-group>. <article-title>CVGG-19: customized visual geometry group deep learning architecture for facial emotion recognition</article-title>. <source>IEEE Access</source>. (<year>2024</year>) <volume>12</volume>:<page-range>41557&#x2013;78</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2024.3377235</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>&#xc1;lvarez Casado</surname> <given-names>C</given-names>
</name>
<name>
<surname>Ca&#xf1;ellas</surname> <given-names>ML</given-names>
</name>
<name>
<surname>L&#xf3;pez</surname> <given-names>MB</given-names>
</name>
</person-group>. <article-title>Depression recognition using remote photoplethysmography from facial videos</article-title>. <source>IEEE Trans Affect Computing</source>. (<year>2023</year>) <volume>14</volume>:<page-range>3305&#x2013;16</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2023.3238641</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ishimaru</surname> <given-names>M</given-names>
</name>
<name>
<surname>Okada</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Uchiyama</surname> <given-names>R</given-names>
</name>
<name>
<surname>Horiguchi</surname> <given-names>R</given-names>
</name>
<name>
<surname>Toyoshima</surname> <given-names>I</given-names>
</name>
</person-group>. <article-title>A new regression model for depression severity prediction based on correlation among audio features using a graph convolutional neural network</article-title>. <source>Diagnostics</source>. (<year>2023</year>) <volume>13</volume>:<fpage>727</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/diagnostics13040727</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Im</surname> <given-names>S-K</given-names>
</name>
</person-group>. <article-title>Speech emotion recognition based on Graph-LSTM neural network</article-title>. <source>EURASIP J Audio Speech Music Process</source>. (<year>2023</year>) <volume>2023</volume>:<fpage>40</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13636-023-00303-9</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kakuba</surname> <given-names>S</given-names>
</name>
<name>
<surname>Poulose</surname> <given-names>A</given-names>
</name>
<name>
<surname>Han</surname> <given-names>DS</given-names>
</name>
</person-group>. <article-title>Deep learning approaches for bimodal speech emotion recognition: advancements, challenges, and a multi-learning model</article-title>. <source>IEEE Access</source>. (<year>2023</year>) <volume>11</volume>:<page-range>113769&#x2013;89</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2023.3325037</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Uddin</surname> <given-names>MdA</given-names>
</name>
<name>
<surname>Joolee</surname> <given-names>JB</given-names>
</name>
<name>
<surname>Sohn</surname> <given-names>K-A</given-names>
</name>
</person-group>. <article-title>Deep multi-modal network based automated depression severity estimation</article-title>. <source>IEEE Trans Affect Computing</source>. (<year>2022</year>) <volume>14</volume>:<page-range>2153&#x2013;67</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TAFFC.2022.3179478</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname> <given-names>M</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>S</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Hung</surname> <given-names>C-C</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>S</given-names>
</name>
</person-group>. <article-title>A multimodal fusion model with multi-level attention mechanism for depression detection</article-title>. <source>Biomed Signal Process Control</source>. (<year>2023</year>) <volume>82</volume>:<fpage>104561</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.bspc.2022.104561</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gimeno-G&#xf3;mez</surname> <given-names>D</given-names>
</name>
<name>
<surname>Bucur</surname> <given-names>A-M</given-names>
</name>
<name>
<surname>Cosma</surname> <given-names>A</given-names>
</name>
<name>
<surname>Mart&#xed;nez-Hinarejos</surname> <given-names>C-D</given-names>
</name>
<name>
<surname>Rosso</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>Reading Between the Frames: Multi-modal Depression Detection in Videos from Non-verbal Cues</article-title>. In: <source>European Conference on Information Retrieval</source>. <publisher-name>Springer Nature Switzerland</publisher-name>, <publisher-loc>Cham</publisher-loc> (<year>2024</year>). p. <fpage>191</fpage>&#x2013;<lpage>209</lpage>.</citation>
</ref>
<ref id="B19">
<label>19</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sundararajan</surname> <given-names>M</given-names>
</name>
<name>
<surname>Taly</surname> <given-names>A</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>Q</given-names>
</name>
</person-group>. (<year>2017</year>). <article-title>Axiomatic attribution for deep networks</article-title>. In: <conf-name>International conference on machine learning</conf-name>. (pp. <page-range>3319&#x2013;28</page-range>). <publisher-name>PMLR</publisher-name>.</citation>
</ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>J</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Z</given-names>
</name>
</person-group>. <article-title>Recognition of major depressive disorder based on facial behavior and speech fusion features</article-title>. <source>J Beijing Univ Posts Telecommunications</source>. (<year>2023</year>) <volume>46</volume>(<issue>1</issue>):<fpage>32</fpage>.</citation>
</ref>
<ref id="B21">
<label>21</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silva</surname> <given-names>WJ</given-names>
</name>
<name>
<surname>Lopes</surname> <given-names>L</given-names>
</name>
<name>
<surname>Galdino</surname> <given-names>MKC</given-names>
</name>
<name>
<surname>Almeida</surname> <given-names>AA</given-names>
</name>
</person-group>. <article-title>Voice acoustic parameters as predictors of depression</article-title>. <source>J Voice</source>. (<year>2024</year>) <volume>38</volume>(<issue>1</issue>):<page-range>77&#x2013;85</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jvoice.2021.06.018</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ittichaichareon</surname> <given-names>C</given-names>
</name>
<name>
<surname>Suksri</surname> <given-names>S</given-names>
</name>
<name>
<surname>Yingthawornsuk</surname> <given-names>T</given-names>
</name>
</person-group>. (<year>2012</year>). <article-title>Speech recognition using MFCC</article-title>. In: <conf-name>International conference on computer graphics, simulation and modeling</conf-name>. Vol. <volume>9</volume>.</citation>
</ref>
<ref id="B23">
<label>23</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gratch</surname> <given-names>J</given-names>
</name>
<name>
<surname>Artstein</surname> <given-names>R</given-names>
</name>
<name>
<surname>Lucas</surname> <given-names>GM</given-names>
</name>
<name>
<surname>Stratou</surname> <given-names>G</given-names>
</name>
<name>
<surname>Scherer</surname> <given-names>S</given-names>
</name>
<name>
<surname>Nazarian</surname> <given-names>A</given-names>
</name>
<etal/>
</person-group>. <article-title>The distress analysis interview corpus of human and computer interviews</article-title>. <source>LREC</source>. (<year>2014</year>), <page-range>3123&#x2013;8</page-range>.</citation>
</ref>
<ref id="B24">
<label>24</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ringeval</surname> <given-names>F</given-names>
</name>
<name>
<surname>Schuller</surname> <given-names>B</given-names>
</name>
<name>
<surname>Valstar</surname> <given-names>M</given-names>
</name>
<name>
<surname>Cummins</surname> <given-names>N</given-names>
</name>
<name>
<surname>Cowie</surname> <given-names>R</given-names>
</name>
<name>
<surname>Tavabi</surname> <given-names>L</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>AVEC 2019 workshop and challenge: state-of-mind, detecting depression with AI, and cross-cultural affect recognition</article-title>. In: <conf-name>Proceedings of the 9th International on Audio/visual Emotion Challenge and Workshop</conf-name>. pp. <fpage>3</fpage>&#x2013;<lpage>12</lpage>.</citation>
</ref>
<ref id="B25">
<label>25</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Eyben</surname> <given-names>F</given-names>
</name>
<name>
<surname>Weninger</surname> <given-names>F</given-names>
</name>
<name>
<surname>Gross</surname> <given-names>F</given-names>
</name>
<name>
<surname>Schuller</surname> <given-names>B</given-names>
</name>
</person-group>. (<year>2013</year>). <article-title>Recent developments in openSMILE, the Munich open-source multimedia feature extractor</article-title>. In: <conf-name>Proceedings of the 21st ACM international conference on Multimedia</conf-name>. pp. <fpage>835</fpage>&#x2013;<lpage>838</lpage>.</citation>
</ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tiwari</surname> <given-names>V</given-names>
</name>
</person-group>. <article-title>MFCC and its applications in speaker recognition</article-title>. <source>Int J Emerging Technol</source>. (<year>2010</year>) <volume>1</volume>(<issue>1</issue>):<fpage>19</fpage>&#x2013;<lpage>22</lpage>.</citation>
</ref>
<ref id="B27">
<label>27</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Baltru&#x161;aitis</surname> <given-names>T</given-names>
</name>
<name>
<surname>Robinson</surname> <given-names>P</given-names>
</name>
<name>
<surname>Morency</surname> <given-names>L-P</given-names>
</name>
</person-group>. (<year>2016</year>). <article-title>OpenFace: an open source facial behavior analysis toolkit</article-title>. In: <conf-name>2016 IEEE winter conference on applications of computer vision (WACV)</conf-name>. <publisher-name>IEEE</publisher-name>, pp. <fpage>1</fpage>&#x2013;<lpage>10</lpage>.</citation>
</ref>
<ref id="B28">
<label>28</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kroenke</surname> <given-names>K</given-names>
</name>
<name>
<surname>Strine</surname> <given-names>TW</given-names>
</name>
<name>
<surname>Spitzer</surname> <given-names>RL</given-names>
</name>
<name>
<surname>Williams</surname> <given-names>JB</given-names>
</name>
<name>
<surname>Berry</surname> <given-names>JT</given-names>
</name>
<name>
<surname>Mokdad</surname> <given-names>AH</given-names>
</name>
</person-group>. <article-title>The PHQ-8 as a measure of current depression in the general population</article-title>. <source>J Affect Disord</source>. (<year>2009</year>) <volume>114</volume>:<page-range>163&#x2013;73</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jad.2008.06.026</pub-id>
</citation>
</ref>
<ref id="B29">
<label>29</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Powers</surname> <given-names>DM</given-names>
</name>
</person-group>. <article-title>Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation</article-title>. <source>arXiv preprint arXiv:2010.16061</source>. (<year>2020</year>).</citation>
</ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chai</surname> <given-names>T</given-names>
</name>
<name>
<surname>Draxler</surname> <given-names>RR</given-names>
</name>
</person-group>. <article-title>Root mean square error (RMSE) or mean absolute error (MAE)?&#x2013;Arguments against avoiding RMSE in the literature</article-title>. <source>Geoscientific Model Dev</source>. (<year>2014</year>) <volume>7</volume>:<page-range>1247&#x2013;50</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.5194/gmd-7-1247-2014</pub-id>
</citation>
</ref>
<ref id="B31">
<label>31</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ji</surname> <given-names>J</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>K</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>C</given-names>
</name>
</person-group>. (<year>2024</year>). <article-title>ACFun: abstract-concrete fusion facial stylization</article-title>. In: <conf-name>The Thirty-eighth Annual Conference on Neural Information Processing Systems</conf-name>.</citation>
</ref>
<ref id="B32">
<label>32</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paissan</surname> <given-names>F</given-names>
</name>
<name>
<surname>Libera</surname> <given-names>LD</given-names>
</name>
<name>
<surname>Ravanelli</surname> <given-names>M</given-names>
</name>
<name>
<surname>Subakan</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>Listenable maps for zero-shot audio classifiers</article-title>. <source>arXiv preprint arXiv:2405.17615</source>. (<year>2024</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2405.17615</pub-id></citation>
</ref>
</ref-list>
</back>
</article>