<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychiatry</journal-id>
<journal-title>Frontiers in Psychiatry</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychiatry</abbrev-journal-title>
<issn pub-type="epub">1664-0640</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyt.2023.1195276</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Psychiatry</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Fast and accurate assessment of depression based on voice acoustic features: a cross-sectional and longitudinal study</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author"><name><surname>Wang</surname> <given-names>Yang</given-names></name><xref rid="aff1" ref-type="aff"><sup>1</sup></xref><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><xref rid="fn0001" ref-type="author-notes"><sup>&#x2020;</sup></xref></contrib>
<contrib contrib-type="author"><name><surname>Liang</surname> <given-names>Lijuan</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><xref rid="aff4" ref-type="aff"><sup>4</sup></xref><xref rid="fn0001" ref-type="author-notes"><sup>&#x2020;</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/659447/overview"/>
</contrib>
<contrib contrib-type="author"><name><surname>Zhang</surname> <given-names>Zhongguo</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><xref rid="aff5" ref-type="aff"><sup>5</sup></xref><xref rid="fn0001" ref-type="author-notes"><sup>&#x2020;</sup></xref></contrib>
<contrib contrib-type="author"><name><surname>Xu</surname> <given-names>Xiao</given-names></name><xref rid="aff6" ref-type="aff"><sup>6</sup></xref></contrib>
<contrib contrib-type="author"><name><surname>Liu</surname> <given-names>Rongxun</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><xref rid="aff7" ref-type="aff"><sup>7</sup></xref></contrib>
<contrib contrib-type="author"><name><surname>Fang</surname> <given-names>Hanzheng</given-names></name><xref rid="aff8" ref-type="aff"><sup>8</sup></xref></contrib>
<contrib contrib-type="author"><name><surname>Zhang</surname> <given-names>Ran</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1182124/overview"/>
</contrib>
<contrib contrib-type="author"><name><surname>Wei</surname> <given-names>Yange</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref></contrib>
<contrib contrib-type="author"><name><surname>Liu</surname> <given-names>Zhongchun</given-names></name><xref rid="aff9" ref-type="aff"><sup>9</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/283088/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Zhu</surname> <given-names>Rongxin</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><xref rid="c003" ref-type="corresp"><sup>&#x002A;</sup></xref></contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Zhang</surname> <given-names>Xizhe</given-names></name><xref rid="aff6" ref-type="aff"><sup>6</sup></xref><xref rid="c002" ref-type="corresp"><sup>&#x002A;</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/1170016/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes"><name><surname>Wang</surname> <given-names>Fei</given-names></name><xref rid="aff2" ref-type="aff"><sup>2</sup></xref><xref rid="aff3" ref-type="aff"><sup>3</sup></xref><xref rid="c001" ref-type="corresp"><sup>&#x002A;</sup></xref><uri xlink:href="https://loop.frontiersin.org/people/105087/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Psychology Institute, Inner Mongolia Normal University</institution>, <addr-line>Hohhot, Inner Mongolia</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Early Intervention Unit, Department of Psychiatry, The Affiliated Brain Hospital of Nanjing Medical University</institution>, <addr-line>Nanjing</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Functional Brain Imaging Institute, Nanjing Medical University</institution>, <addr-line>Nanjing</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Laboratory of Psychology, The First Affiliated Hospital of Hainan Medical University</institution>, <addr-line>Haikou, Hainan</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>The Fourth People&#x2019;s Hospital of Yancheng</institution>, <addr-line>Yancheng, Jiangsu</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>School of Biomedical Engineering and Informatics, Nanjing Medical University</institution>, <addr-line>Nanjing</addr-line>, <country>China</country></aff>
<aff id="aff7"><sup>7</sup><institution>College of Medical Engineering, Xinxiang Medical University</institution>, <addr-line>Xinxiang, Henan</addr-line>, <country>China</country></aff>
<aff id="aff8"><sup>8</sup><institution>School of Computer Science and Engineering, Northeastern University</institution>, <addr-line>Shenyang, Liaoning</addr-line>, <country>China</country></aff>
<aff id="aff9"><sup>9</sup><institution>Department of Psychiatry, Renmin Hospital of Wuhan University</institution>, <addr-line>Wuhan, Hubei</addr-line>, <country>China</country></aff>
<author-notes>
<fn id="fn0003" fn-type="edited-by">
<p>Edited by: Zhi Yang, Shanghai Mental Health Center, China</p>
</fn>
<fn id="fn0004" fn-type="edited-by">
<p>Reviewed by: Runsen Chen, Tsinghua University, China; Tingshao Zhu, Chinese Academy of Sciences (CAS), China</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Fei Wang, <email>fei.wang@yale.edu</email></corresp>
<corresp id="c002">Xizhe Zhang, <email>zhangxizhe@njmu.edu.cn</email></corresp>
<corresp id="c003">Rongxin Zhu, <email>zhurx2000@njmu.edu.cn</email></corresp>
<fn id="fn0001" fn-type="equal">
<p><sup>&#x2020;</sup>These authors have contributed equally to this work and share first authorship</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>21</day>
<month>06</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1195276</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>03</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>06</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2023 Wang, Liang, Zhang, Xu, Liu, Fang, Zhang, Wei, Liu, Zhu, Zhang and Wang.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Wang, Liang, Zhang, Xu, Liu, Fang, Zhang, Wei, Liu, Zhu, Zhang and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>Depression is a widespread mental disorder that affects a significant portion of the population. However, the assessment of depression is often subjective, relying on standard questions or interviews. Acoustic features have been suggested as a reliable and objective alternative for depression assessment. Therefore, in this study, we aim to identify and explore voice acoustic features that can effectively and rapidly predict the severity of depression, as well as investigate the potential correlation between specific treatment options and voice acoustic features.</p>
</sec>
<sec>
<title>Methods</title>
<p>We utilized voice acoustic features correlated with depression scores to train a prediction model based on artificial neural network. Leave-one-out cross-validation was performed to evaluate the performance of the model. We also conducted a longitudinal study to analyze the correlation between the improvement of depression and changes in voice acoustic features after an Internet-based cognitive-behavioral therapy (ICBT) program consisting of 12 sessions.</p>
</sec>
<sec>
<title>Results</title>
<p>Our study showed that the neural network model trained on the 30 voice acoustic features significantly correlated with HAMD scores can accurately predict the severity of depression with a mean absolute error of 3.137 and a correlation coefficient of 0.684. Furthermore, four out of the 30 features significantly decreased after ICBT, indicating their potential correlation with specific treatment options and significant improvement in depression (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.05).</p>
</sec>
<sec>
<title>Conclusion</title>
<p>Voice acoustic features can effectively and rapidly predict the severity of depression, providing a low-cost and efficient method for screening patients with depression on a large scale. Our study also identified potential acoustic features that may be significantly related to specific treatment options for depression.</p>
</sec>
</abstract>
<kwd-group>
<kwd>depression</kwd>
<kwd>voice acoustic features</kwd>
<kwd>deep learning</kwd>
<kwd>Internet-based cognitive-behavioral therapy</kwd>
<kwd>cross-sectional</kwd>
<kwd>longitudinal</kwd>
</kwd-group>
<contract-sponsor id="cn1">National Science Fund for Distinguished Young Scholars</contract-sponsor>
<contract-sponsor id="cn2">NSFC-Guangdong Joint Fund</contract-sponsor>
<contract-sponsor id="cn3">Jiangsu Provincial Key Research and Development Program<named-content content-type="fundref-id">10.13039/501100013058</named-content></contract-sponsor>
<contract-sponsor id="cn4">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content></contract-sponsor>
<contract-sponsor id="cn5">National Key Research and Development Program of the Ministry of Science and Technology of China</contract-sponsor>
<contract-sponsor id="cn6">Medical Science and Technology Development Foundation, Jiangsu Commission of Health</contract-sponsor>
<contract-sponsor id="cn7">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content></contract-sponsor>
<contract-sponsor id="cn8">Jiangsu Provincial Key Research and Development Program<named-content content-type="fundref-id">10.13039/501100013058</named-content></contract-sponsor>
<contract-sponsor id="cn9">Inner Mongolia Autonomous Region Postgraduate Education Innovation Program</contract-sponsor>
<contract-sponsor id="cn10">Hainan Provincial Natural Science Foundation of China</contract-sponsor>
<contract-sponsor id="cn11">National Key R&#x0026;D Program of China</contract-sponsor>
<contract-sponsor id="cn12">Henan Province Higher Education Teaching Reform Research and Practice Project R</contract-sponsor>
<counts>
<fig-count count="5"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="49"/>
<page-count count="11"/>
<word-count count="6811"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Digital Mental Health</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="sec5" sec-type="intro">
<title>Introduction</title>
<p>Depression is typically diagnosed using self-report scales, which rely on patients&#x2019; responses to standardized questions (<xref ref-type="bibr" rid="ref1">1</xref>). However, the accuracy of this method can be limited by factors such as patients&#x2019; self-awareness and truthfulness, as well as social stigmas surrounding mental illness. Lengthy questionnaires may also induce fatigue or impatience. As such, more objective biomarkers of depression are needed to improve diagnosis and assessment (<xref ref-type="bibr" rid="ref2">2</xref>). Acoustic features have emerged as an important and objective measure of emotion, particularly for depression, as depressed patients often exhibit specific acoustic features that provide important cues for clinical identification and diagnosis (<xref ref-type="bibr" rid="ref3">3</xref>). In the present research on the acoustic features of depression, the majority of studies have focused on the differences in the acoustic features of healthy and depressed people (<xref ref-type="bibr" rid="ref4">4</xref>, <xref ref-type="bibr" rid="ref5">5</xref>), with few studies on the changes in the acoustic features of depressed patients over the course of psychological treatment, which will be the focus of our study.</p>
<p>Emotional states can significantly impact the function and structure of the vocal system, as expressed through the rhythm and prosody of voice (<xref ref-type="bibr" rid="ref6">6</xref>, <xref ref-type="bibr" rid="ref7">7</xref>). According to previous research, positive emotions tend to result in a higher-pitched, louder, and faster voice, whereas negative emotions are characterized by lower volume, slower voice, and longer pauses (<xref ref-type="bibr" rid="ref8">8</xref>). In addition to prosodic features, there are also significant differences in frequency spectrum features between positive and negative emotions (<xref ref-type="bibr" rid="ref9">9</xref>). Specifically, negative emotions tend to exhibit more high frequency sounds, an increase in fundamental frequency (F0) rises, and a decrease in formants compared to positive emotions (<xref ref-type="bibr" rid="ref10">10</xref>). Furthermore, Mel Frequency Cepstral Coefficients (MFCC) have been identified as potential biomarkers for major depression severity and recovery process (<xref ref-type="bibr" rid="ref11">11</xref>). Studies have shown that various prosodic features, including pause time and total voice volume, exhibit a significant negative correlation with depression severity, and that objective acoustic parameters show a decrease in average weighted variance (AWV) (<xref ref-type="bibr" rid="ref12">12</xref>). As depression severity increases, the range of voice acoustic variation narrows and the acoustic track becomes smoother. Additionally, other voice acoustic features such as the coefficient of variation (COV) of second formants have been found to be significantly correlated with depression severity (<xref ref-type="bibr" rid="ref13">13</xref>).</p>
<p>However, previous studies on voice acoustic biomarkers for depression have several limitations, such as use of self-reported symptoms, unstructured voice design, and relatively simple data analysis, which may limit the robustness of the findings (<xref ref-type="bibr" rid="ref14 ref15 ref16">14&#x2013;16</xref>). Furthermore, voice overlap induced by interactive interviews can be an obstacle for standard data processing. To address these issues, the DAIC (Distress Analysis Interview Corpus) data by AVEC (Audio-Visual Emotion Recognition Challenge) performed interactive interviews between virtual agents and patients (<xref ref-type="bibr" rid="ref17">17</xref>). While this model can exclude emotional interaction between psychotherapists and patients, it may also exhibit other voice issues, such as confusion of voice, changes in the virtual agent&#x2019;s behavior, and long voice (<xref ref-type="bibr" rid="ref18">18</xref>, <xref ref-type="bibr" rid="ref19">19</xref>). In this study, we did not use interactive interviews for voice capture; instead, we used text reading. A standard text reading ensures that each test participant reads the same content, which reduces interference due to different readers and problems with overlapping voice, thus increasing the reproducibility and comparability of the experiment.</p>
<p>Voice acoustic features of depression patients may change with the relief of depression, according to clinical impressions. However, it is unclear whether medication, physical therapy, or psychotherapy can affect voice acoustic biomarkers and their sensitivity to treatment response. For example, men and women with depression may have differential responsivity and tolerability to sertraline and imipramine treatment (<xref ref-type="bibr" rid="ref20">20</xref>). Recent studies have shown that depressed patients demonstrate less voice pause time and more fluent acoustic expression after treatment with drugs or psychotherapy, indicating potential changes in voice acoustic features associated with improvement of depression (<xref ref-type="bibr" rid="ref21">21</xref>, <xref ref-type="bibr" rid="ref22">22</xref>). Additionally, changes in spectrum characteristics may also be associated with depression improvement (<xref ref-type="bibr" rid="ref23">23</xref>).</p>
<p>The aim of this study was to investigate the relationship between voice acoustic features and depression using machine learning, and to evaluate the validity of these features in predicting the severity of depression. The study also aimed to explore the correlation between changes in voice acoustic features and improvement of depression before and after Internet-based cognitive-behavioral therapy (ICBT). To achieve these goals, objective voice acoustic features related to depression and those sensitive to psychotherapy were identified. The use of a brief standardized reading instead of a nonstandard long interview facilitated post-data processing, improved standardization of data analysis, and increased the accuracy of the training model. In addition, a predictive model was constructed using machine learning to explore the complex, nonlinear correlation between acoustic features and depression and assess the validity of the model. Finally, the longitudinal design of the study allowed for the investigation of specific and effective voice acoustic features for treatment response to ICBT. The findings of this study have important implications for the early detection of depression based on voice acoustic features.</p>
<sec id="sec6">
<title>Ethics statement</title>
<p>This study was conducted in compliance with ethical standards and was approved by the Ethics Committee of Hainan Medical University (HYLL2020005). Informed consent was obtained from all participants before their participation in the study. Participants had the choice to opt-out of the study if they wished to do so. All participants who completed the assessments were provided with their individual psychometric results. In addition, participants who voluntarily chose to participate in the longitudinal study were offered free ICBT.</p>
</sec>
</sec>
<sec id="sec7">
<title>Study one: a cross-sectional study of depression and voice acoustic features</title>
<sec id="sec8">
<title>Methods and material</title>
<sec id="sec9">
<title>Participants</title>
<p>A total of 47 college students with depression (42 females and 5 males) from Hainan Medical University were recruited for this study through online advertisements. The mean age of the participants was 20.51&#x2009;&#x00B1;&#x2009;1.50&#x2009;years, and their ages ranged from 18 to 24&#x2009;years. The inclusion criteria for depression were based on the self-rated Patient Health Questionnaire-9 (PHQ-9), with a total score of 5 or higher for initial screening of depressive symptoms (<xref ref-type="bibr" rid="ref24">24</xref>). Participants who met the inclusion criteria were then assessed by a standardized HAMD-17 telephone interview conducted by psychiatrists from China Medical University and Hainan Medical College with consistent training. A score of &#x2265;7 on the Hamilton Depression (HAMD) Scale was used to determine depressive symptoms. The measurement of depression in this study was based on recent depressive symptoms rather than individuals who have been clinically diagnosed with severe depression. Participants with a score of &#x2265;3 on item 9 of the PHQ-9 indicating suicidal ideation or behavior, severe or potential mental illness such as schizophrenia or drug abuse, acute respiratory diseases or those receiving antidepressant treatment and psychological therapy were excluded from the study.</p>
</sec>
</sec>
<sec id="sec10">
<title>Voice data set</title>
<p>For the voice data set, neutral readings such as &#x201C;Life like a summer flower&#x201D; were used for acoustic sampling in accordance with a previous study&#x2019;s recommendation. The original audio recorder of an Android mobile phone was used to record mp3 and m4a format recording files, which were pretested to ensure uniform format and parameters. The collected audio files were transcoded into wav format using FFmpeg and the sampling frequency was converted to 16&#x2009;kHz. After the data collection, we utilized endpoint detection and normalization for pretreatment to reduce confounding factors. Endpoint detection was used to identify the beginning and end of each voice sample, and normalization was applied to adjust the volume of each sample to a standardized level.</p>
</sec>
<sec id="sec11">
<title>Features extraction</title>
<p>Acoustic features were extracted for each voice frame with a duration of 10&#x2009;ms. A total of 120 features were calculated for each frame, including 74 COVAREP features, 20 MFCC-deltas, 20 MFCC-delta-deltas (<xref ref-type="bibr" rid="ref25">25</xref>), 5 formants (<xref ref-type="bibr" rid="ref26">26</xref>), and peak-to-RMS (<xref ref-type="bibr" rid="ref27">27</xref>). These features were referred to as Low-Level-Descriptors (LLDs). COVAREP features, which include prosodic, voice quality, and spectral features, were calculated by the COVAREP toolbox at a frequency of 100&#x2009;Hz (<xref ref-type="bibr" rid="ref28">28</xref>). A detailed list of COVAREP features is provided in <xref rid="tab1" ref-type="table">Table 1</xref>. Peak-to-RMS, a gross indicator of loudness linked to waveform shapes, was calculated on a segmental level and reflected a local loudness metric related to waveform shape across a few pitch periods (with a frame length of 20&#x2009;ms and a frame shift of 10&#x2009;ms). The first 5 formants of the frame (with a frame length of 20&#x2009;ms and a frame shift of 10&#x2009;ms) were predicted by linear predictive coding. MFCC-deltas and MFCC-delta-deltas of the first 20 Mel cepstral coefficients in each frame (with a frame length of 20&#x2009;ms and a frame shift of 10&#x2009;ms) were calculated using the librosa Library (<xref ref-type="bibr" rid="ref29">29</xref>). These features contain the dynamic information of the spectrum envelope on a frame of voice signal.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Summaries of features.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Feature Name</th>
<th/>
<th align="center" valign="top">Num</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">F0</td>
<td align="left" valign="middle">Vocal cord vibration cycle</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">VUV</td>
<td align="left" valign="middle">Vector containing the binary voicing decisions</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">NAQ</td>
<td align="left" valign="middle">Normalized amplitude quotient is presented as a method to parametrize the glottal closing phase</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">QOQ</td>
<td align="left" valign="middle">The quasiopen period describes the duration of the glottal flow above 50% of the peak amplitude</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">H1H2</td>
<td align="left" valign="middle">Difference in glottal harmonic amplitude</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">PSP</td>
<td align="left" valign="middle">Parabolic spectral parameter fitting a parabolic function to the low-frequency part of the estimated glottal flow</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">MDQ</td>
<td align="left" valign="middle">The Maxima Dispersion Quotient, is proposed for discriminating breathy to tense voice</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">Peak Slope</td>
<td align="left" valign="middle">Slope coefficient of a regression line fit to local peak by using wavelet analysis.</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">Rd</td>
<td align="left" valign="middle" rowspan="2">The Rd shape parameter of the Liljencrants-Fant (LF) glottal model using the Mean Squared Phase (MSP) method based on MSPD2</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">Rd-conf</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">Creak</td>
<td align="left" valign="middle">Detect creaky voice using acoustic features by an artificial neural network</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">MCEP</td>
<td align="left" valign="middle">Transform the spectrogram into a Mel spectrum through the Mel scale filter bank, and then perform cepstrum analysis</td>
<td align="center" valign="middle">25</td>
</tr>
<tr>
<td align="left" valign="middle">HMPDM</td>
<td align="left" valign="middle" rowspan="2">Harmonic Model Phase Distortion Mean and Harmonic Model Phase Distortion Deviation are flexible representation of the glottal source based on the short-term statistics of the phase distortion</td>
<td align="center" valign="middle">25</td>
</tr>
<tr>
<td align="left" valign="middle">HMPDD</td>
<td align="center" valign="middle">13</td>
</tr>
<tr>
<td align="left" valign="middle">Peak-to-RMS</td>
<td align="left" valign="middle">Peak-to-RMS measure reflecting a local loudness metric related to waveform shape across a few pitch periods</td>
<td align="center" valign="middle">1</td>
</tr>
<tr>
<td align="left" valign="middle">Formant</td>
<td align="left" valign="middle">Formants refer to areas where energy is relatively concentrated in the sound spectrum</td>
<td align="center" valign="middle">5</td>
</tr>
<tr>
<td align="left" valign="middle">MFCC-deltas</td>
<td align="left" valign="middle" rowspan="2">Reflecting the dynamic information of the spectrum envelope on a frame of voice signal</td>
<td align="center" valign="middle">20</td>
</tr>
<tr>
<td align="left" valign="middle">MFCC-delta-deltas</td>
<td align="center" valign="middle">20</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Following the calculation of the 120 LLDs for each frame, we obtained a total of 1,200 high-level statistics functions (HSFs) for each recording, by calculating 10 statistics (maximum, minimum, median, mean, variance, kurtosis, skewness, regression slope, regression intercept, regression <italic>R</italic><sup>2</sup>) for each LLD, in order to integrate the multi-frame LLD information and describe the distribution of each LLD in the time dimension (<xref ref-type="bibr" rid="ref30">30</xref>). The final features were standardized using the Standard Scaler of the scikit-learn library (<xref ref-type="bibr" rid="ref31">31</xref>).</p>
<p>To prevent overfitting of the neural network model, we aimed to reduce the dimensionality of the features. However, the original feature set contained 1,200 dimensions for each of the 47 samples. To address this issue, we performed Pearson correlation analyses between the 1,200 high-level statistics functions (HSFs) and the HAMD scores while controlling for sex and age as covariates. We selected only the HSFs that were significantly (<italic>P</italic>&#x003C;0.01) related to the HAMD score and assigned the remaining features a value of 0. This approach helped to reduce the dimensionality of the feature set while retaining relevant information for predicting depression severity.</p>
</sec>
<sec id="sec12">
<title>Prediction model based on neural network</title>
<p>In recent years, machine learning techniques such as artificial neural networks (ANNs) have shown great promise in tasks such as prediction and classification using large amounts of data. However, the performance of an ANN heavily depends on its architecture, including the number of neurons, layers, and activation functions, which are usually chosen manually by the user. In this study, we employed a method called Neural Architecture Search (NAS) to automatically discover the optimal ANN architecture for the HAMD prediction task. This approach can effectively reduce the manual effort required to find the best ANN architecture and potentially improve the prediction performance.</p>
<p>In this study, we designed a base neural network using Keras Library (<xref ref-type="bibr" rid="ref32">32</xref>), which consisted of several fully connected layer networks. To optimize the architecture and parameters of the neural network, we performed a grid search over a set of hyperparameters, including the number of layers L<inline-formula>
<mml:math id="M1">
<mml:mo>&#x2208;</mml:mo>
</mml:math>
</inline-formula>{1, 2, 3, 4}, number of hidden nodes in each layer N<inline-formula>
<mml:math id="M2">
<mml:mo>&#x2208;</mml:mo>
</mml:math>
</inline-formula>{16, 32, 64}, activation function F<inline-formula>
<mml:math id="M3">
<mml:mo>&#x2208;</mml:mo>
</mml:math>
</inline-formula>{&#x2018;relu&#x2019;, &#x2018;softmax&#x2019;, &#x2018;elu&#x2019;, &#x2018;selu&#x2019;, &#x2018;softplus&#x2019;, &#x2018;tanh&#x2019;, &#x2018;sigmoid&#x2019;}, batch size B<inline-formula>
<mml:math id="M4">
<mml:mo>&#x2208;</mml:mo>
</mml:math>
</inline-formula>{2, 4, 8}, the optimizer O<inline-formula>
<mml:math id="M5">
<mml:mo>&#x2208;</mml:mo>
</mml:math>
</inline-formula>{&#x2018;mse&#x2019;, &#x2018;sgd&#x2019;, &#x2018;RMSprop&#x2019;, &#x2018;Adam&#x2019;} and the learning rates LR<inline-formula>
<mml:math id="M6">
<mml:mo>&#x2208;</mml:mo>
</mml:math>
</inline-formula>{0.01, 0.001, 0.0001, 0.00001}. The combination of parameters that resulted in the smallest Mean Squared Error (MSE) on the test set was selected as the optimal configuration for the model. To assess the performance of the model, we used leave-one-out cross-validation.</p>
<p>To identify the most informative features for predicting HAMD scores, we utilized two random forest regression models implemented in Scikit-learn library (<xref ref-type="bibr" rid="ref33">33</xref>). One model utilized all HSFs as input features, while the other only used significant HSFs identified through the Pearson correlation analysis. The top 10 features were then selected and analyzed to gain insights into their predictive capabilities.</p>
</sec>
</sec>
<sec id="sec13">
<title>Study two: a longitudinal study about improvement of depression symptom and voice acoustic features</title>
<sec id="sec14">
<title>Methods and materials</title>
<sec id="sec15">
<title>Participants</title>
<p>For Study two, participants were required to meet the inclusion and exclusion criteria established for Study One, as well as agree to a four-week treatment schedule and have access to a computer with an internet connection. Of the 47 people with depression who participated in Study One, 18 participants continued the longitudinal study with a mean age of (20.47&#x2009;&#x00B1;&#x2009;1.52) years, including 16 (88.9%) females and 2 (11.1%) males. Seven of the participants declined to participate in the ICBT program, and 22 did not complete the program.</p>
</sec>
</sec>
<sec id="sec16">
<title>ICBT programme</title>
<p>The ICBT programme consists of 12 treatment modules, delivered through the ICBT training platform. The self-help ICBT intervention is 20&#x2009;min for each module, and participants are required to complete 3 modules a week, completing all the treatment content within 4&#x2009;weeks. All modules were based on the cognitive-behavioral model by Beck et al. (<xref ref-type="bibr" rid="ref34">34</xref>). Modules 1 and 2 introduced participants to the definition, symptoms, and causes of depression, as well as the basic cognitive model. Modules 3&#x2013;6 described how to identify cognitive distortions and cope with unhelpful automatic thoughts in daily life. Modules 7&#x2013;8 mainly focused on behavioral activation and dealing with intermediate beliefs. Modules 9&#x2013;11 centered on learning about structured problem-solving approaches and core beliefs, while Module 12 provided a summary of the treatment and relapse prevention. After completing each module, participants were sent homework assignments through the WeChat Subscription platform.</p>
</sec>
<sec id="sec17">
<title>Data analysis</title>
<p>In this study, a difference analysis was conducted to compare the voice acoustic features before and after treatment. The normality and variance homogeneity of each feature distribution in the dataset were evaluated using the Shapiro&#x2013;Wilk test (<xref ref-type="bibr" rid="ref35">35</xref>) and Levene test (<xref ref-type="bibr" rid="ref36">36</xref>), respectively. The differences between the pre- and post-treatment voice acoustic features were analyzed using the Mann&#x2013;Whitney U test (<xref ref-type="bibr" rid="ref37">37</xref>), and the mean and median of the characteristic changes with significant differences were calculated.</p>
</sec>
</sec>
<sec id="sec18" sec-type="results">
<title>Results</title>
<sec id="sec19">
<title>Acoustic feature analysis</title>
<p>We used grid search to obtain the optimal architecture of the neural network, which consisted of 4 hidden layers, each with 32 hidden nodes. The activation function of each layer was softplus. We employed stochastic gradient descent (SGD) as the optimizer and set the learning rate to 0.001 with a batch size of 4 (<xref rid="fig1" ref-type="fig">Figure 1</xref>) (<xref ref-type="bibr" rid="ref38">38</xref>, <xref ref-type="bibr" rid="ref39">39</xref>).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>The architecture of the neural network.</p>
</caption>
<graphic xlink:href="fpsyt-14-1195276-g001.tif"/>
</fig>
<p>We calculated the Pearson correlation coefficient between the 1,200 acoustic features and the HAMD scores, and identified 30 features that were significantly associated with depression severity (<italic>p</italic> &#x003C; 0.01). These features included Mel-cepstral (MCEP), Mel-scale Frequency Cepstral Coefficients deltas (MFCC-deltas), Mel-scale Frequency Cepstral Coefficients delta-deltas (MFCC-delta-deltas), Harmonic Model Phase Distortion Mean (HMPDM), Harmonic Model Phase Distortion Deviation (HMPDD), creak, and peak to root mean square (Peak to RMS) (<xref rid="fig2" ref-type="fig">Figures 2</xref>, <xref rid="fig3" ref-type="fig">3</xref>).</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Correlation heatmap: voice acoustic features associated with severity of depressive symptoms. The <italic>x</italic>-axis represents the severity of symptoms, while the <italic>y</italic>-axis represents the voice acoustic features. The colors on the heatmap correspond to the correlation coefficient between symptom and acoustic feature, with blue indicating a lower correlation and black indicating a higher correlation.</p>
</caption>
<graphic xlink:href="fpsyt-14-1195276-g002.tif"/>
</fig>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>The 30 acoustic features significantly associated with the severity of depressive symptoms.</p>
</caption>
<graphic xlink:href="fpsyt-14-1195276-g003.tif"/>
</fig>
</sec>
<sec id="sec20">
<title>Prediction model by voice features</title>
<p>The acoustic features were used to predict the HAMD scores of 47 subjects using the model obtained from the previous analysis with leave-one-out cross-validation. The results showed a strong correlation between the predicted scores and the HAMD scores, with a Pearson correlation coefficient of 0.682 and a <italic>p</italic> value of 1.318<inline-formula>
<mml:math id="M7">
<mml:mo>&#x00D7;</mml:mo>
</mml:math>
</inline-formula>10<sup>&#x2212;7</sup> (<xref rid="fig4" ref-type="fig">Figure 4A</xref>). The error distribution between the predicted and actual HAMD scores is shown in <xref rid="fig4" ref-type="fig">Figure 4B</xref>, with a mean absolute error (MAE) of 3.137. This indicates that our model can accurately predict HAMD scores. Moreover, the absolute error for 63.83% of subjects was less than 4.0 points, suggesting that most of the subjects&#x2019; HAMD scores can be precisely predicted using our model.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>The performance of the prediction model. <bold>(A)</bold> The scatter plot of predicted score and HAMD scores. <bold>(B)</bold> The error distribution of samples. <bold>(C)</bold> Learning curves of our neural network model.</p>
</caption>
<graphic xlink:href="fpsyt-14-1195276-g004.tif"/>
</fig>
<p>To determine the optimal training time and prevent overfitting, we generated a training curve that shows the relationship between Mean Squared Error (MSE) and epochs (<xref rid="fig4" ref-type="fig">Figure 4C</xref>). The MSE of the training set decreased as the number of epochs increased. However, the MSE of the validation set reached its lowest point at epoch 175 and then started to increase, indicating that the model began to overfit the data. Therefore, we selected the model at epoch 175 as our final model, as it had the best predictive and generalization abilities.</p>
<p>The performance of our model was evaluated on a workstation with Intel Xeon W-2102 CPU, 8GB RAM and Nvidia GeForce RTX 2080 Ti graphics card. As the prediction of ANN can be done very fast, the runtime of our protocol is mainly determined by the feature extraction. The average runtimes of the feature extraction steps are listed in <xref rid="tab2" ref-type="table">Table 2</xref>. In general, the whole process could be completed in 347.6&#x2009;s on average.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Time consumption of each step of our diagnosis method.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Step Name</th>
<th align="center" valign="top">Average time consumption (s)</th>
<th align="center" valign="top">Standard deviation of time consumption (s)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Recording</td>
<td align="char" valign="middle" char=".">100.1</td>
<td align="char" valign="middle" char=".">19.3</td>
</tr>
<tr>
<td align="left" valign="middle">Extract features using the Covarep toolbox</td>
<td align="char" valign="middle" char=".">244.8</td>
<td align="char" valign="middle" char=".">48.8</td>
</tr>
<tr>
<td align="left" valign="middle">Extract MFCC related features</td>
<td align="char" valign="middle" char=".">2.5</td>
<td align="char" valign="middle" char=".">0.4</td>
</tr>
<tr>
<td align="left" valign="middle">Calculate Peak-to-RMS feature</td>
<td align="char" valign="middle" char=".">0.2</td>
<td align="char" valign="middle" char=".">0.04</td>
</tr>
<tr>
<td align="left" valign="middle">Average consumption of each recording</td>
<td align="char" valign="middle" char=".">347.6</td>
<td/>
</tr>
</tbody>
</table>
</table-wrap>
<p>Furthermore, we used random forest regression to identify the most important features in predicting HAMD scores. <xref rid="tab3" ref-type="table">Table 3</xref> lists the top-10 features for predicting HAMD scores. We found that some acoustic features were repeated, suggesting that they are key factors for prediction in both feature sets. Specifically, four features were found to be important: the regression fit of HMPDD 8, the skewness of MCEP 0, the standard deviation of creak, and the regression intercept of MFCC deltas 10 (<xref rid="tab3" ref-type="table">Table 3</xref>).</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>The top 10 feature weights of two random forest regression models separately trained by all features and significant features.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Significant features</th>
<th/>
<th align="left" valign="top">All features</th>
<th/>
</tr>
<tr>
<th align="left" valign="middle">Name</th>
<th align="center" valign="middle">Importance</th>
<th align="left" valign="middle">Name</th>
<th align="center" valign="middle">Importance</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">
<bold>HMPDD_8_R2</bold>
</td>
<td align="char" valign="middle" char=".">0.107</td>
<td align="left" valign="middle">
<bold>creak_std</bold>
</td>
<td align="char" valign="middle" char=".">0.023</td>
</tr>
<tr>
<td align="left" valign="middle">
<bold>MCEP_0_skew</bold>
</td>
<td align="char" valign="middle" char=".">0.096</td>
<td align="left" valign="middle">MFCC_delta_deltas_20_std</td>
<td align="char" valign="middle" char=".">0.022</td>
</tr>
<tr>
<td align="left" valign="middle">MCEP_17_R2</td>
<td align="char" valign="middle" char=".">0.068</td>
<td align="left" valign="middle">Peak-to-RMS_median</td>
<td align="char" valign="middle" char=".">0.021</td>
</tr>
<tr>
<td align="left" valign="middle">MFCC_deltas_10_alph</td>
<td align="char" valign="middle" char=".">0.064</td>
<td align="left" valign="middle">MCEP_9_alph</td>
<td align="char" valign="middle" char=".">0.021</td>
</tr>
<tr>
<td align="left" valign="middle">MCEP_0_kurt</td>
<td align="char" valign="middle" char=".">0.062</td>
<td align="left" valign="middle">
<bold>MCEP_0_skew</bold>
</td>
<td align="char" valign="middle" char=".">0.019</td>
</tr>
<tr>
<td align="left" valign="middle">
<bold>creak_std</bold>
</td>
<td align="char" valign="middle" char=".">0.061</td>
<td align="left" valign="middle">
<bold>MFCC_deltas_10_intercept</bold>
</td>
<td align="char" valign="middle" char=".">0.016</td>
</tr>
<tr>
<td align="left" valign="middle">
<bold>MFCC_deltas_10_intercept</bold>
</td>
<td align="char" valign="middle" char=".">0.05</td>
<td align="left" valign="middle">HMPDM_8_min</td>
<td align="char" valign="middle" char=".">0.015</td>
</tr>
<tr>
<td align="left" valign="middle">MFCC_deltas_16_kurt</td>
<td align="char" valign="middle" char=".">0.045</td>
<td align="left" valign="middle">MFCC_delta_deltas_4_R2</td>
<td align="char" valign="middle" char=".">0.014</td>
</tr>
<tr>
<td align="left" valign="middle">MCEP_3_alph</td>
<td align="char" valign="middle" char=".">0.036</td>
<td align="left" valign="middle">MFCC_deltas_7_alph</td>
<td align="char" valign="middle" char=".">0.014</td>
</tr>
<tr>
<td align="left" valign="middle">Age</td>
<td align="char" valign="middle" char=".">0.033</td>
<td align="left" valign="middle">
<bold>HMPDD_8_R2</bold>
</td>
<td align="char" valign="middle" char=".">0.014</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec21">
<title>Result of longitudinal study</title>
<p>All participants who underwent the ICBT program returned to normal scores below the HAMD cut-off, with a mean and SD of 8.79&#x2009;&#x00B1;&#x2009;5.43 and 0.52&#x2009;&#x00B1;&#x2009;0.86 for the pre- and post-ICBT scores, respectively. Among the 30 voice acoustic features analyzed, only nine satisfied both normality and variance homogeneity criteria. Our difference analysis revealed that four voice acoustic features significantly changed in participants with depression after ICBT. These four features included Peak2RMS_kurt, MFCC_deltas_10_intercept, MFCC_delta_deltas_4_kurt, and MFCC_delta_deltas_9_kurt (<xref rid="fig5" ref-type="fig">Figure 5</xref>). The mean and median of these four features were significantly lower after ICBT, compared to before.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Wilcoxon test of significant difference by voice acoustic features (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.05).</p>
</caption>
<graphic xlink:href="fpsyt-14-1195276-g005.tif"/>
</fig>
</sec>
</sec>
<sec id="sec22" sec-type="discussions">
<title>Discussion</title>
<p>The present study identified 30 voice acoustic features significantly associated with depression, and developed a deep learning model that accurately predicted depression severity. The model also demonstrated good generalization ability and avoided overfitting. These findings suggest that voice acoustic features could serve as objective and effective biomarkers for depression, and be used to monitor treatment response. The longitudinal results showed that four of the voice acoustic features were sensitive to ICBT psychotherapy, indicating that voice acoustic features could potentially be used to monitor treatment progress and adjust treatment plans. Our results are consistent with previous studies that have identified voice acoustic features as reliable biomarkers for depression (<xref ref-type="bibr" rid="ref2">2</xref>). Overall, our study provides important insights into the potential clinical application of voice acoustic features for depression diagnosis, monitoring, and treatment (<xref ref-type="bibr" rid="ref12">12</xref>).</p>
<p>Previous studies have mainly utilized difference tests, correlation analysis, and regression analysis to explore depression-related voice acoustic features. However, these methods have limitations in the extraction of relevant quantitative indicators and the accuracy of prediction. In this study, we employed an algorithm to extract more voice acoustic features and related parameters, which enabled us to fully explore depression-related voice acoustic features. By performing dimensionality reduction with a significance level of <italic>p</italic> &#x003C; 0.01, we identified 30 voice acoustic features significantly associated with depression, including loudness, MFCCs, harmonic wave, and creak. These findings were consistent with previous studies that suggested depression can lead to changes in the motor control of the vocal tract, resulting in changes in voice acoustic features such as delayed articulation, dyskinesia, and poor coordination (<xref ref-type="bibr" rid="ref34">34</xref>). Our study found that Peak to RMS, related to loudness, was significantly associated with severity of depression, which is consistent with previous studies (<xref ref-type="bibr" rid="ref40">40</xref>, <xref ref-type="bibr" rid="ref41">41</xref>). Moreover, MFCCs were also found to be associated with depression severity. Previous studies suggested that MFCCs were associated with fewer vocal tract changes in depression patients due to the tighter vocal tract caused by psychomotor retardation (<xref ref-type="bibr" rid="ref42">42</xref>). Creak, which is caused by microtremors of the vocal cords, was identified as a latent biomarker for depression. It was found to increase with the severity of depression and was associated with a higher risk of suicide in depressed patients (<xref ref-type="bibr" rid="ref43">43</xref>, <xref ref-type="bibr" rid="ref44">44</xref>).</p>
<p>In addition, our study explored phase parameters that reflect depression symptoms, which were rarely investigated in previous studies due to the difficulty of extraction (<xref ref-type="bibr" rid="ref45">45</xref>). We found that fundamental frequency, which responds to the thickness and tightness of the vocal cords, was relatively stable for a few weeks but can manifest lower voice and decreased fundamental frequency in depression (<xref ref-type="bibr" rid="ref42">42</xref>). Previous studies also demonstrated that the amplitude of the harmonic wave was smaller in depression patients (<xref ref-type="bibr" rid="ref46">46</xref>). Overall, our study provides valuable insights into the use of voice acoustic features as objective and effective biomarkers for depression diagnosis and treatment response. The comprehensive extraction and analysis of voice acoustic features provide a more accurate prediction model for depression severity, which can improve the accuracy of diagnosis and help develop targeted treatment plans.</p>
<p>To assess the effectiveness of our model, we performed leave-one-out cross-validation on our dataset, which demonstrated that our model was able to accurately predict HAMD scores for most subjects. Compared with previous studies that utilized voice acoustic features to predict PHQ-9 and PHQ-8 scores, our model achieved a smaller mean absolute error and root mean square error, respectively (<xref ref-type="bibr" rid="ref47">47</xref>). The results suggest that acoustic features may serve as effective external indicators of depression, as they are related to changes in vocal tract status and features. Our study, which utilized a more comprehensive set of voice acoustic features and a deep learning model that accounts for the nonlinear relationship between depression and these features, demonstrated relatively stronger predictive power. Furthermore, based on the top 10 feature weights of random forest regression, creak, MCEP, MFCC, and HMPDD were identified as the most important acoustic features, which are prosodic and spectral features of voice that could serve as decisive biomarkers for depression (<xref ref-type="bibr" rid="ref48">48</xref>).</p>
<p>Longitudinal follow-up studies have shown that some voice acoustic features not only have predictive power for the severity of depression but also can be associated with the treatment response of ICBT. In this study, the kurtosis of two of the 20 MFCC-delta-deltas was found to significantly reduce with the improvement of depression in participants who underwent ICBT, and all participants returned to a normal status (<xref ref-type="bibr" rid="ref49">49</xref>). Previous studies on drug treatment in major depression patients have also reported the normalization of some voice acoustic features with remission of symptoms (<xref ref-type="bibr" rid="ref41">41</xref>). Therefore, MFCCs could serve not only as a predictor of depression severity but also as potential biomarkers for treatment response. Additionally, the result of this study showed that Peak to RMS, which measures loudness, increased after ICBT, consistent with previous studies. Loudness has been identified as an important biomarker for identifying depression and a sensitive biomarker for the treatment response of psychotherapy in depression, as confirmed by the results of this longitudinal study (<xref ref-type="bibr" rid="ref12">12</xref>). The sensitivity of vocal acoustic parameters to ICBT may provide a new perspective for optimal treatment options and further confirm the role of direct and indirect acoustic features in identifying depression.</p>
<p>In future studies, we plan to validate our findings and assess potential gender differences in the effectiveness of our intervention by recruiting a more balanced sample of male and female patients. Additionally, we will incorporate other voice tasks and implement a longitudinal follow-up to explore factors that may impact the relationship between vocal features and depression severity. Furthermore, it is worth noting that our study was limited by the absence of a healthy control group. To address this limitation, we will consider including a healthy control group in future studies to better understand the unique acoustic characteristics associated with depression, thus shedding light on the potential diagnostic value of these features.</p>
<p>In this study, we used a machine learning algorithm to accurately extract acoustic feature parameters, allowing for a more comprehensive exploration of the relationship between acoustic features and depression. Furthermore, our use of the random forest regression method to calculate feature weights was a more effective approach than traditional correlation analysis and principal component analysis. We also identified key acoustic features, such as creak, MCEP, MFCC, and HMPDD, as potential biomarkers for depression. Our longitudinal study examining the relationship between acoustic features and treatment response of ICBT provides new evidence for objective identification of depression and assessment of treatment effectiveness. Overall, our findings have significant implications for the use of acoustic features in depression assessment and treatment.</p>
</sec>
<sec id="sec23" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary materials, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="sec24">
<title>Ethics statement</title>
<p>The studies involving human participants were reviewed and approved by the Ethics Committee of Hainan Medical University (HYLL2020005). The patients/participants provided their written informed consent to participate in this study. Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec id="sec25">
<title>Author contributions</title>
<p>YWa and LL managed the literature searches, participated in the collection and analysis of data, and wrote the manuscript. ZZ, XX, and HF participated in the collection and analysis of data. RaZ, RL, ZL, and YWe gave suggestions for ICBT and research process. FW, XZ, and RoZ designed the study, supervised the sample recruitment, and provided suggestions. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="sec26" sec-type="funding-information">
<title>Funding</title>
<p>This study was funded by Jiangsu Provincial Key Research and Development Program (BE2021617 to FW and XZ), National Science Fund for Distinguished Young Scholars (81725005 to FW), NSFC-Guangdong Joint Fund (U20A6005 to FW), National Natural Science Foundation of China (62176129 to XZ), National Key Research and Development Program (2022YFC2405603 to XZ), Key Project supported by Medical Science and Technology Development Foundation, Jiangsu Commission of Health (ZD2021026 to RoZ), National Natural Science Foundation of China (82151315 to RoZ), Jiangsu Provincial Key Research and Development Program (BE2022160 to RoZ), Inner Mongolia Autonomous Region Postgraduate Education Innovation Program Funding Project (B202101194Z to YW), Hainan Provincial Natural Science Foundation of China (821RC700 to LL), National Key R&#x0026;D Program of China (2018YFC1314600 to ZL), Henan Province Higher Education Teaching Reform Research and Practice Project (2021SJGLX189 to RL).</p>
</sec>
<sec id="conf1" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="sec100" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Williams</surname> <given-names>JB</given-names></name></person-group>. <article-title>A structured interview guide for the Hamilton depression rating scale</article-title>. <source>Arch Gen Psychiatry</source>. (<year>1988</year>) <volume>45</volume>:<fpage>742</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1001/archpsyc.1988.01800320058007</pub-id>, PMID: <pub-id pub-id-type="pmid">3395203</pub-id></citation></ref>
<ref id="ref2"><label>2.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>Duvvuri</surname> <given-names>R</given-names></name> <name><surname>Chandra</surname> <given-names>KK</given-names></name> <name><surname>Nguyen</surname> <given-names>T</given-names></name> <name><surname>Ghomi</surname> <given-names>RH</given-names></name></person-group>. <article-title>Automated voice biomarkers for depression symptoms using an online cross-sectional data collection initiative</article-title>. <source>Depress Anxiety</source>. (<year>2020</year>) <volume>37</volume>:<fpage>657</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1002/da.23020</pub-id>, PMID: <pub-id pub-id-type="pmid">32383335</pub-id></citation></ref>
<ref id="ref3"><label>3.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Z</given-names></name> <name><surname>Bao</surname> <given-names>Z</given-names></name> <name><surname>Zhang</surname> <given-names>Z</given-names></name> <name><surname>Deng</surname> <given-names>J</given-names></name> <name><surname>Cummins</surname> <given-names>N</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Automatic assessment of depression from speech via a hierarchical attention transfer network and attention autoencoders</article-title>. <source>IEEE J Select Top Sig Process</source>. (<year>2019</year>) <volume>14</volume>:<fpage>423</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JSTSP.2019.2955012</pub-id></citation></ref>
<ref id="ref4"><label>4.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Di</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Zhu</surname> <given-names>T</given-names></name></person-group>. <article-title>Using i-vectors from voice features to identify major depressive disorder</article-title>. <source>J Affect Disord</source>. (<year>2021</year>) <volume>288</volume>:<fpage>161</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jad.2021.04.004</pub-id>, PMID: <pub-id pub-id-type="pmid">33895418</pub-id></citation></ref>
<ref id="ref5"><label>5.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Low</surname> <given-names>DM</given-names></name> <name><surname>Bentley</surname> <given-names>KH</given-names></name> <name><surname>Ghosh</surname> <given-names>SS</given-names></name></person-group>. <article-title>Automated assessment of psychiatric disorders using speech: a systematic review</article-title>. <source>Laryngoscope Invest Otolaryngol</source>. (<year>2020</year>) <volume>5</volume>:<fpage>96</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1002/lio2.354</pub-id>, PMID: <pub-id pub-id-type="pmid">32128436</pub-id></citation></ref>
<ref id="ref6"><label>6.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Marmar</surname> <given-names>CR</given-names></name> <name><surname>Brown</surname> <given-names>AD</given-names></name> <name><surname>Qian</surname> <given-names>M</given-names></name> <name><surname>Laska</surname> <given-names>E</given-names></name> <name><surname>Siegel</surname> <given-names>C</given-names></name> <name><surname>Li</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Speech-based markers for posttraumatic stress disorder in US veterans</article-title>. <source>Depress Anxiety</source>. (<year>2019</year>) <volume>36</volume>:<fpage>607</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1002/da.22890</pub-id>, PMID: <pub-id pub-id-type="pmid">31006959</pub-id></citation></ref>
<ref id="ref7"><label>7.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Russell</surname> <given-names>JA</given-names></name> <name><surname>Bachorowski</surname> <given-names>JA</given-names></name> <name><surname>Fernandez-Dols</surname> <given-names>JM</given-names></name></person-group>. <article-title>Facial and vocal expressions of emotion</article-title>. <source>Annu Rev Psychol</source>. (<year>2003</year>) <volume>54</volume>:<fpage>329</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev.psych.54.101601.145102</pub-id>, PMID: <pub-id pub-id-type="pmid">12415074</pub-id></citation></ref>
<ref id="ref8"><label>8.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Zhang</surname> <given-names>L</given-names></name> <name><surname>Liu</surname> <given-names>T</given-names></name> <name><surname>Pan</surname> <given-names>W</given-names></name> <name><surname>Hu</surname> <given-names>B</given-names></name> <name><surname>Zhu</surname> <given-names>T</given-names></name></person-group>. <article-title>Acoustic differences between healthy and depressed people: a cross-situation study</article-title>. <source>BMC Psychiatry</source>. (<year>2019</year>) <volume>19</volume>:<fpage>1</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s12888-019-2300-7</pub-id></citation></ref>
<ref id="ref9"><label>9.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Y</given-names></name> <name><surname>Fairbairn</surname> <given-names>C</given-names></name> <name><surname>Cohn</surname> <given-names>JF</given-names></name></person-group>. <article-title>Detecting depression severity from vocal prosody</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2013</year>) <volume>4</volume>:<fpage>142</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1109/T-AFFC.2012.38</pub-id>, PMID: <pub-id pub-id-type="pmid">26985326</pub-id></citation></ref>
<ref id="ref10"><label>10.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Agrima</surname> <given-names>A</given-names></name> <name><surname>Elmazouzi</surname> <given-names>L</given-names></name> <name><surname>Mounir</surname> <given-names>I</given-names></name> <name><surname>Farchi</surname> <given-names>A</given-names></name></person-group>, <article-title>Effect of negative and positive emotions on the fundamental frequency and formants</article-title>. <conf-name>Proceedings of the 2nd International Conference on Computing and Wireless Communication Systems</conf-name> (<year>2017</year>).</citation></ref>
<ref id="ref11"><label>11.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cummins</surname> <given-names>N</given-names></name> <name><surname>Sethu</surname> <given-names>V</given-names></name> <name><surname>Epps</surname> <given-names>J</given-names></name> <name><surname>Schnieder</surname> <given-names>S</given-names></name> <name><surname>Krajewski</surname> <given-names>J</given-names></name></person-group>. <article-title>Analysis of acoustic space variability in speech affected by depression</article-title>. <source>Speech Comm</source>. (<year>2015</year>) <volume>75</volume>:<fpage>27</fpage>&#x2013;<lpage>49</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.specom.2015.09.003</pub-id></citation></ref>
<ref id="ref12"><label>12.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mundt</surname> <given-names>JC</given-names></name> <name><surname>Vogel</surname> <given-names>AP</given-names></name> <name><surname>Feltner</surname> <given-names>DE</given-names></name> <name><surname>Lenderking</surname> <given-names>WR</given-names></name></person-group>. <article-title>Vocal acoustic biomarkers of depression severity and treatment response</article-title>. <source>Biol Psychiatry</source>. (<year>2012</year>) <volume>72</volume>:<fpage>580</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.biopsych.2012.03.015</pub-id>, PMID: <pub-id pub-id-type="pmid">22541039</pub-id></citation></ref>
<ref id="ref13"><label>13.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mundt</surname> <given-names>JC</given-names></name> <name><surname>Snyder</surname> <given-names>PJ</given-names></name> <name><surname>Cannizzaro</surname> <given-names>MS</given-names></name> <name><surname>Chappie</surname> <given-names>K</given-names></name> <name><surname>Geralts</surname> <given-names>DS</given-names></name></person-group>. <article-title>Voice acoustic measures of depression severity and treatment response collected via interactive voice response (IVR) technology</article-title>. <source>J Neurolinguistics</source>. (<year>2007</year>) <volume>20</volume>:<fpage>50</fpage>&#x2013;<lpage>64</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jneuroling.2006.04.001</pub-id>, PMID: <pub-id pub-id-type="pmid">21253440</pub-id></citation></ref>
<ref id="ref14"><label>14.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cannizzaro</surname> <given-names>M</given-names></name> <name><surname>Harel</surname> <given-names>B</given-names></name> <name><surname>Reilly</surname> <given-names>N</given-names></name> <name><surname>Chappell</surname> <given-names>P</given-names></name> <name><surname>Snyder</surname> <given-names>PJ</given-names></name></person-group>. <article-title>Voice acoustical measurement of the severity of major depression</article-title>. <source>Brain Cogn</source>. (<year>2004</year>) <volume>56</volume>:<fpage>30</fpage>&#x2013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bandc.2004.05.003</pub-id>, PMID: <pub-id pub-id-type="pmid">15380873</pub-id></citation></ref>
<ref id="ref15"><label>15.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Low</surname> <given-names>L-SA</given-names></name> <name><surname>Maddage</surname> <given-names>NC</given-names></name> <name><surname>Lech</surname> <given-names>M</given-names></name> <name><surname>Sheeber</surname> <given-names>LB</given-names></name> <name><surname>Allen</surname> <given-names>NB</given-names></name></person-group>. <article-title>Detection of clinical depression in adolescents&#x2019; speech during family interactions</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2010</year>) <volume>58</volume>:<fpage>574</fpage>&#x2013;<lpage>86</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TBME.2010.2091640</pub-id></citation></ref>
<ref id="ref16"><label>16.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hashim</surname> <given-names>NW</given-names></name> <name><surname>Wilkes</surname> <given-names>M</given-names></name> <name><surname>Salomon</surname> <given-names>R</given-names></name> <name><surname>Meggs</surname> <given-names>J</given-names></name> <name><surname>France</surname> <given-names>DJ</given-names></name></person-group>. <article-title>Evaluation of voice acoustics as predictors of clinical depression scores</article-title>. <source>J Voice</source>. (<year>2017</year>) <volume>31</volume>:<fpage>256.e1</fpage>&#x2013;<lpage>256.e6</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jvoice.2016.06.006</pub-id></citation></ref>
<ref id="ref17"><label>17.</label><citation citation-type="book"><person-group person-group-type="author"><name><surname>Gratch</surname> <given-names>J</given-names></name> <name><surname>Artstein</surname> <given-names>R</given-names></name> <name><surname>Lucas</surname> <given-names>G</given-names></name> <name><surname>Stratou</surname> <given-names>G</given-names></name> <name><surname>Scherer</surname> <given-names>S</given-names></name> <name><surname>Nazarian</surname> <given-names>A</given-names></name> <etal/></person-group>. <source>The distress analysis interview corpus of human and computer interviews</source>. <publisher-loc>California</publisher-loc>: <publisher-name>University of Southern California Los Angeles</publisher-name> (<year>2014</year>).</citation></ref>
<ref id="ref18"><label>18.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>W</given-names></name> <name><surname>He</surname> <given-names>Z</given-names></name> <name><surname>Xing</surname> <given-names>X</given-names></name> <name><surname>Cai</surname> <given-names>B</given-names></name> <name><surname>Lu</surname> <given-names>W</given-names></name></person-group> <article-title>Multi-modality depression detection via multi-scale temporal dilated cnns</article-title>. <conf-name>Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop</conf-name> (<year>2019</year>).</citation></ref>
<ref id="ref19"><label>19.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Tao</surname> <given-names>J</given-names></name> <name><surname>Lian</surname> <given-names>Z</given-names></name> <name><surname>Wen</surname> <given-names>Z</given-names></name> <name><surname>Yang</surname> <given-names>M</given-names></name> <etal/></person-group>., <article-title>Continuous multimodal emotion prediction based on long short term memory recurrent neural network</article-title>. <conf-name>Proceedings of the 7th Annual Workshop on Audio/Visual Emotion Challenge</conf-name> (<year>2017</year>).</citation></ref>
<ref id="ref20"><label>20.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kornstein</surname> <given-names>SG</given-names></name> <name><surname>Schatzberg</surname> <given-names>AF</given-names></name> <name><surname>Thase</surname> <given-names>ME</given-names></name> <name><surname>Yonkers</surname> <given-names>KA</given-names></name> <name><surname>McCullough</surname> <given-names>JP</given-names></name> <name><surname>Keitner</surname> <given-names>GI</given-names></name> <etal/></person-group>. <article-title>Gender differences in treatment response to sertraline versus imipramine in chronic depression</article-title>. <source>Am J Psychiatry</source>. (<year>2000</year>) <volume>157</volume>:<fpage>1445</fpage>&#x2013;<lpage>52</lpage>. doi: <pub-id pub-id-type="doi">10.1176/appi.ajp.157.9.1445</pub-id>, PMID: <pub-id pub-id-type="pmid">10964861</pub-id></citation></ref>
<ref id="ref21"><label>21.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hardy</surname> <given-names>P</given-names></name> <name><surname>Jouvent</surname> <given-names>R</given-names></name> <name><surname>Widlocher</surname> <given-names>D</given-names></name></person-group>. <article-title>Speech pause time and the retardation rating scale for depression (ERD). Towards a reciprocal validation</article-title>. <source>J Affect Disord</source>. (<year>1984</year>) <volume>6</volume>:<fpage>123</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0165-0327(84)90014-4</pub-id>, PMID: <pub-id pub-id-type="pmid">6231326</pub-id></citation></ref>
<ref id="ref22"><label>22.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alpert</surname> <given-names>M</given-names></name> <name><surname>Pouget</surname> <given-names>ER</given-names></name> <name><surname>Silva</surname> <given-names>RR</given-names></name></person-group>. <article-title>Reflections of depression in acoustic measures of the patient&#x2019;s speech</article-title>. <source>J Affect Disord</source>. (<year>2001</year>) <volume>66</volume>:<fpage>59</fpage>&#x2013;<lpage>69</lpage>. doi: <pub-id pub-id-type="doi">10.1016/s0165-0327(00)00335-9</pub-id></citation></ref>
<ref id="ref23"><label>23.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mundt</surname> <given-names>JC</given-names></name> <name><surname>Vogel</surname> <given-names>AP</given-names></name> <name><surname>Feltner</surname> <given-names>DE</given-names></name> <name><surname>Lenderking</surname> <given-names>WR</given-names></name></person-group>. <article-title>Vocal acoustic biomarkers of depression severity and treatment response</article-title>. <source>Biol Psychiatry</source>. (<year>2012</year>) <volume>72</volume>:<fpage>580</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.biopsych.2012.03.015</pub-id></citation></ref>
<ref id="ref24"><label>24.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Liang</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>Z</given-names></name> <name><surname>Liu</surname> <given-names>R</given-names></name> <name><surname>Wei</surname> <given-names>Y</given-names></name> <name><surname>Qi</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Factor structure of the patient health questionnaire-9 and measurement invariance across gender and age among Chinese university students</article-title>. <source>Medicine</source>. (<year>2023</year>) <volume>102</volume>:<fpage>e32590</fpage>. doi: <pub-id pub-id-type="doi">10.1097/MD.0000000000032590</pub-id>, PMID: <pub-id pub-id-type="pmid">36607886</pub-id></citation></ref>
<ref id="ref25"><label>25.</label><citation citation-type="book"><person-group person-group-type="author"><name><surname>Rabiner</surname> <given-names>LR</given-names></name> <name><surname>Schafer</surname> <given-names>RW</given-names></name></person-group>. <source>Theory and applications of digital speech processing</source>. <publisher-loc>Upper Saddle River, NJ</publisher-loc>: <publisher-name>Pearson</publisher-name> (<year>2010</year>).</citation></ref>
<ref id="ref26"><label>26.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schafer</surname> <given-names>RW</given-names></name> <name><surname>Rabiner</surname> <given-names>LR</given-names></name></person-group>. <article-title>System for automatic formant analysis of voiced speech</article-title>. <source>J Acoust Soc Am</source>. (<year>1970</year>) <volume>47</volume>:<fpage>634</fpage>&#x2013;<lpage>48</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.1911939</pub-id>, PMID: <pub-id pub-id-type="pmid">5445369</pub-id></citation></ref>
<ref id="ref27"><label>27.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Williamson</surname> <given-names>JR</given-names></name> <name><surname>Godoy</surname> <given-names>E</given-names></name> <name><surname>Cha</surname> <given-names>M</given-names></name> <name><surname>Schwarzentruber</surname> <given-names>A</given-names></name> <name><surname>Khorrami</surname> <given-names>P</given-names></name> <name><surname>Gwon</surname> <given-names>Y</given-names></name> <etal/></person-group>., <article-title>Detecting depression using vocal, facial and semantic communication cues</article-title>. <conf-name>Proceedings of the 6th International Workshop on Audio/Visual Emotion Challenge</conf-name> (<year>2016</year>).</citation></ref>
<ref id="ref28"><label>28.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Degottex</surname> <given-names>G</given-names></name> <name><surname>Kane</surname> <given-names>J</given-names></name> <name><surname>Drugman</surname> <given-names>T</given-names></name> <name><surname>Raitio</surname> <given-names>T</given-names></name> <name><surname>Scherer</surname> <given-names>S</given-names></name></person-group>. <article-title>COVAREP&#x2014;A collaborative voice analysis repository for speech technologies</article-title>. <conf-name>2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name> (<year>2014</year>). IEEE.</citation></ref>
<ref id="ref29"><label>29.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>McFee</surname> <given-names>B</given-names></name> <name><surname>Raffel</surname> <given-names>C</given-names></name> <name><surname>Liang</surname> <given-names>D</given-names></name> <name><surname>Ellis</surname> <given-names>DP</given-names></name> <name><surname>McVicar</surname> <given-names>M</given-names></name> <name><surname>Battenberg</surname> <given-names>E</given-names></name> <etal/></person-group>., <article-title>Librosa: Audio and music signal analysis in python</article-title>. <conf-name>Proceedings of the 14th Python in Science Conference</conf-name> (<year>2015</year>).</citation></ref>
<ref id="ref30"><label>30.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Aldeneh</surname> <given-names>Z</given-names></name> <name><surname>Jaiswal</surname> <given-names>M</given-names></name> <name><surname>Picheny</surname> <given-names>M</given-names></name> <name><surname>Mcinnis</surname> <given-names>M</given-names></name> <name><surname>Provost</surname> <given-names>EM</given-names></name></person-group>. <article-title>Identifying mood episodes using dialogue features from clinical interviews</article-title>. arXiv preprint arXiv:1910.05115. (<year>2019</year>).</citation></ref>
<ref id="ref31"><label>31.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pedregosa</surname> <given-names>F</given-names></name> <name><surname>Varoquaux</surname> <given-names>G</given-names></name> <name><surname>Gramfort</surname> <given-names>A</given-names></name> <name><surname>Michel</surname> <given-names>V</given-names></name> <name><surname>Thirion</surname> <given-names>B</given-names></name> <name><surname>Grisel</surname> <given-names>O</given-names></name> <name><surname>Blondel</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Scikit-learn: machine learning in python</article-title>. <source>J Mach Learn Res</source>. (<year>2011</year>) <volume>12</volume>:<fpage>2825</fpage>&#x2013;<lpage>30</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1201.0490</pub-id></citation></ref>
<ref id="ref32"><label>32.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Graziotin</surname> <given-names>D</given-names></name> <name><surname>Abrahamsson</surname> <given-names>P</given-names></name></person-group>. <article-title>A web-based modeling tool for the SEMAT essence theory of software engineering</article-title>. arXiv preprint arXiv:1307.2075. (<year>2013</year>).</citation></ref>
<ref id="ref33"><label>33.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Breiman</surname> <given-names>L</given-names></name></person-group>. <article-title>Random forests</article-title>. <source>Mach Learn</source>. (<year>2001</year>) <volume>45</volume>:<fpage>5</fpage>&#x2013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id>, PMID: <pub-id pub-id-type="pmid">37267645</pub-id></citation></ref>
<ref id="ref34"><label>34.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bennabi</surname> <given-names>D</given-names></name> <name><surname>Vandel</surname> <given-names>P</given-names></name> <name><surname>Papaxanthis</surname> <given-names>C</given-names></name> <name><surname>Pozzo</surname> <given-names>T</given-names></name> <name><surname>Haffen</surname> <given-names>E</given-names></name></person-group>. <article-title>Psychomotor retardation in depression: a systematic review of diagnostic, pathophysiologic, and therapeutic implications</article-title>. <source>Biomed Res Int</source>. (<year>2013</year>) <volume>2013</volume>:<fpage>158746</fpage>. doi: <pub-id pub-id-type="doi">10.1155/2013/158746</pub-id></citation></ref>
<ref id="ref35"><label>35.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shapiro</surname> <given-names>SS</given-names></name> <name><surname>Wilk</surname> <given-names>MB</given-names></name></person-group>. <article-title>An analysis of variance test for normality (complete samples)</article-title>. <source>Biometrika</source>. (<year>1965</year>) <volume>52</volume>:<fpage>591</fpage>&#x2013;<lpage>611</lpage>. doi: <pub-id pub-id-type="doi">10.2307/2333709</pub-id>, PMID: <pub-id pub-id-type="pmid">37237304</pub-id></citation></ref>
<ref id="ref36"><label>36.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gastwirth</surname> <given-names>JL</given-names></name> <name><surname>Gel</surname> <given-names>YR</given-names></name> <name><surname>Miao</surname> <given-names>W</given-names></name></person-group>. <article-title>The impact of Levene&#x2019;s test of equality of variances on statistical theory and practice</article-title>. <source>Stat Sci</source>. (<year>2009</year>) <volume>24</volume>:<fpage>343</fpage>&#x2013;<lpage>360</lpage>. doi: <pub-id pub-id-type="doi">10.1214/09-STS301</pub-id></citation></ref>
<ref id="ref37"><label>37.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>McKnight</surname> <given-names>PE</given-names></name> <name><surname>Najab</surname> <given-names>J</given-names></name></person-group>. <article-title>Mann-Whitney U test</article-title>. <source>The Corsini encyclopedia of psychology</source>. (<year>2010</year>). <fpage>1</fpage>. doi: <pub-id pub-id-type="doi">10.1002/9780470479216.corpsy0524</pub-id></citation></ref>
<ref id="ref38"><label>38.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Glorot</surname> <given-names>X</given-names></name> <name><surname>Bordes</surname> <given-names>A</given-names></name> <name><surname>Bengio</surname> <given-names>Y</given-names></name></person-group>. <article-title>Deep sparse rectifier neural networks</article-title>. <source>J Mach Learn Res</source>. (<year>2011</year>) <volume>15</volume>:<fpage>315</fpage>&#x2013;<lpage>23</lpage>.</citation></ref>
<ref id="ref39"><label>39.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Bottou</surname> <given-names>L</given-names></name></person-group>. <article-title>Large-scale machine learning with stochastic gradient descent</article-title>. <conf-name>Proceedings of COMPSTAT</conf-name> <conf-loc>Verlag Berlin Heidelberg</conf-loc> (<year>2010</year>). <fpage>177</fpage>&#x2013;<lpage>86</lpage>.</citation></ref>
<ref id="ref40"><label>40.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Darby</surname> <given-names>JK</given-names></name> <name><surname>Simmons</surname> <given-names>N</given-names></name> <name><surname>Berger</surname> <given-names>PA</given-names></name></person-group>. <article-title>Speech and voice parameters of depression: a pilot study</article-title>. <source>J Commun Disord</source>. (<year>1984</year>) <volume>17</volume>:<fpage>75</fpage>&#x2013;<lpage>85</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0021-9924(84)90013-3</pub-id>, PMID: <pub-id pub-id-type="pmid">6725627</pub-id></citation></ref>
<ref id="ref41"><label>41.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alpert</surname> <given-names>M</given-names></name> <name><surname>Pouget</surname> <given-names>ER</given-names></name> <name><surname>Silva</surname> <given-names>RR</given-names></name></person-group>. <article-title>Reflections of depression in acoustic measures of the patient's speech</article-title>. <source>J Affect Disord</source>. (<year>2001</year>) <volume>66</volume>:<fpage>59</fpage>&#x2013;<lpage>69</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0165-0327(00)00335-9</pub-id>, PMID: <pub-id pub-id-type="pmid">11532533</pub-id></citation></ref>
<ref id="ref42"><label>42.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Syed</surname> <given-names>ZS</given-names></name> <name><surname>Sidorov</surname> <given-names>K</given-names></name> <name><surname>Marshall</surname> <given-names>D</given-names></name></person-group>. <article-title>Depression severity prediction based on biomarkers of psychomotor retardation</article-title>. <conf-name>Proceedings of the 7th Annual Workshop on Audio/Visual Emotion Challenge</conf-name> (<year>2017</year>).</citation></ref>
<ref id="ref43"><label>43.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ozdas</surname> <given-names>A</given-names></name> <name><surname>Shiavi</surname> <given-names>RG</given-names></name> <name><surname>Silverman</surname> <given-names>SE</given-names></name> <name><surname>Silverman</surname> <given-names>MK</given-names></name> <name><surname>Wilkes</surname> <given-names>DM</given-names></name></person-group>. <article-title>Investigation of vocal jitter and glottal flow spectrum as possible cues for depression and near-term suicidal risk</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2004</year>) <volume>51</volume>:<fpage>1530</fpage>&#x2013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TBME.2004.827544</pub-id>, PMID: <pub-id pub-id-type="pmid">15376501</pub-id></citation></ref>
<ref id="ref44"><label>44.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kane</surname> <given-names>J</given-names></name> <name><surname>Drugman</surname> <given-names>T</given-names></name> <name><surname>Gobl</surname> <given-names>C</given-names></name></person-group>. <article-title>Improved automatic detection of creak</article-title>. <source>Comput Speech Lang</source>. (<year>2013</year>) <volume>27</volume>:<fpage>1028</fpage>&#x2013;<lpage>47</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.csl.2012.11.002</pub-id>, PMID: <pub-id pub-id-type="pmid">36182324</pub-id></citation></ref>
<ref id="ref45"><label>45.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Degottex</surname> <given-names>G</given-names></name> <name><surname>Erro</surname> <given-names>D</given-names></name></person-group>. <article-title>A uniform phase representation for the harmonic model in speech synthesis applications</article-title>. <source>EURASIP J Audio Speech Music Process</source>. (<year>2014</year>) <volume>2014</volume>:<fpage>38</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13636-014-0038-1</pub-id></citation></ref>
<ref id="ref46"><label>46.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Banse</surname> <given-names>R</given-names></name> <name><surname>Scherer</surname> <given-names>KR</given-names></name></person-group>. <article-title>Acoustic profiles in vocal emotion expression</article-title>. <source>J Pers Soc Psychol</source>. (<year>1996</year>) <volume>70</volume>:<fpage>614</fpage>&#x2013;<lpage>36</lpage>. doi: <pub-id pub-id-type="doi">10.1037/0022-3514.70.3.614</pub-id>, PMID: <pub-id pub-id-type="pmid">8851745</pub-id></citation></ref>
<ref id="ref47"><label>47.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Al Hanai</surname> <given-names>T</given-names></name> <name><surname>Ghassemi</surname> <given-names>MM</given-names></name> <name><surname>Glass</surname> <given-names>JR</given-names></name></person-group>, <article-title>Detecting depression with audio/text sequence modeling of interviews</article-title>. Interspeech (<year>2018</year>).</citation></ref>
<ref id="ref48"><label>48.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>McGinnis</surname> <given-names>EW</given-names></name> <name><surname>Anderau</surname> <given-names>SP</given-names></name> <name><surname>Hruschak</surname> <given-names>J</given-names></name> <name><surname>Gurchiek</surname> <given-names>RD</given-names></name> <name><surname>Lopez-Duran</surname> <given-names>NL</given-names></name> <name><surname>Fitzgerald</surname> <given-names>K</given-names></name> <etal/></person-group>. <article-title>Giving voice to vulnerable children: machine learning analysis of speech detects anxiety and depression in early childhood</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2019</year>) <volume>23</volume>:<fpage>2294</fpage>&#x2013;<lpage>301</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JBHI.2019.2913590</pub-id>, PMID: <pub-id pub-id-type="pmid">31034426</pub-id></citation></ref>
<ref id="ref49"><label>49.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Taguchi</surname> <given-names>T</given-names></name> <name><surname>Tachikawa</surname> <given-names>H</given-names></name> <name><surname>Nemoto</surname> <given-names>K</given-names></name> <name><surname>Suzuki</surname> <given-names>M</given-names></name> <name><surname>Nagano</surname> <given-names>T</given-names></name> <name><surname>Tachibana</surname> <given-names>R</given-names></name> <etal/></person-group>. <article-title>Major depressive disorder discrimination using vocal acoustic features</article-title>. <source>J Affect Disord</source>. (<year>2018</year>) <volume>225</volume>:<fpage>214</fpage>&#x2013;<lpage>20</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jad.2017.08.038</pub-id>, PMID: <pub-id pub-id-type="pmid">28841483</pub-id></citation></ref>
</ref-list>
</back>
</article>