<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurol.</journal-id>
<journal-title>Frontiers in Neurology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurol.</abbrev-journal-title>
<issn pub-type="epub">1664-2295</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fneur.2024.1394210</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neurology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Depressive and mania mood state detection through voice as a biomarker using machine learning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Ji</surname> <given-names>Jun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1164046/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Dong</surname> <given-names>Wentian</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2698908/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Jiaqi</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Peng</surname> <given-names>Jingzhu</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Feng</surname> <given-names>Chaonan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1164089/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Liu</surname> <given-names>Rujia</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Shi</surname> <given-names>Chuan</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/793919/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Ma</surname> <given-names>Yantao</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/880571/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>College of Computer Science and Technology, Qingdao University</institution>, <addr-line>Qingdao</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Beijing Wanling Pangu Science and Technology Ltd.</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>NHC Key Laboratory of Mental Health (Peking University), Peking University Sixth Hospital, Peking University Institute of Mental Health, National Clinical Research Center for Mental Disorders (Peking University Sixth Hospital)</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Psychology, Queen's University</institution>, <addr-line>Kingston, ON</addr-line>, <country>Canada</country></aff>
<aff id="aff5"><sup>5</sup><institution>School of Arts and Sciences, Brandeis University</institution>, <addr-line>Waltham, MA</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0002">
<p>Edited by: Matteo Spezialetti, University of L&#x2019;Aquila, Italy</p>
</fn>
<fn fn-type="edited-by" id="fn0003">
<p>Reviewed by: Yazhou Zhang, Tianjin University, China</p>
<p>Amirmasoud Ahmadi, Max Planck Institute for Biological Intelligence, Germany</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Yantao Ma, <email>mayantao@bjmu.edu.cn</email></corresp>
<fn fn-type="equal" id="fn0001"><p><sup>&#x2020;</sup>These authors have contributed equally to this work and share first authorship</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>04</day>
<month>07</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1394210</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>03</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>06</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Ji, Dong, Li, Peng, Feng, Liu, Shi and Ma.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Ji, Dong, Li, Peng, Feng, Liu, Shi and Ma</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Depressive and manic states contribute significantly to the global social burden, but objective detection tools are still lacking. This study investigates the feasibility of utilizing voice as a biomarker to detect these mood states.</p>
</sec>
<sec>
<title>Methods</title>
<p>From real-world emotional journal voice recordings, 22 features were extracted in this study, 21 of which showed significant differences among mood states. Additionally, we applied a leave-one-subject-out strategy to train and validate four classification models: Chinese-speech-pretrain-GRU, Gated Recurrent Unit (GRU), Bi-directional Long Short-Term Memory (BiLSTM), and Linear Discriminant Analysis (LDA).</p>
</sec>
<sec>
<title>Results</title>
<p>Our results indicated that the Chinese-speech-pretrain-GRU model performed the best, achieving sensitivities of 77.5% and 54.8% and specificities of 86.1% and 90.3% for detecting depressive and manic states, respectively, with an overall accuracy of 80.2%.</p>
</sec>
<sec>
<title>Discussion</title>
<p>These findings show that machine learning can reliably differentiate between depressive and manic mood states via voice analysis, allowing for a more objective and precise approach to mood disorder assessment.</p>
</sec>
</abstract>
<kwd-group>
<kwd>voice biomarker</kwd>
<kwd>machine learning</kwd>
<kwd>mood state detection</kwd>
<kwd>depression</kwd>
<kwd>mania</kwd>
</kwd-group>
<counts>
<fig-count count="3"/>
<table-count count="8"/>
<equation-count count="0"/>
<ref-count count="46"/>
<page-count count="10"/>
<word-count count="6672"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Artificial Intelligence in Neurology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>The World Health Organization (WHO) estimates that 40 million people had bipolar disorder (BPD) and 280 million people suffered from depression in 2019 (<xref ref-type="bibr" rid="ref1">1</xref>). Depression, sometimes referred to as unipolar depression or major depressive disorder (MDD), is a mental illness that affects a person&#x2019;s everyday functioning and is characterized by recurrently low emotions or a loss of interest in activities. It might be challenging to differentiate bipolar illness from depression because of its repeated episodes of mania/hypomania and depression (<xref ref-type="bibr" rid="ref2">2</xref>). In China, the lifetime prevalence rates of bipolar disorder and depression are 0.6% and 6.8%, respectively (<xref ref-type="bibr" rid="ref3">3</xref>), and the prevalence is still rising as a result of COVID-19 (<xref ref-type="bibr" rid="ref4">4</xref>). Mental disorder can increase the risk of suicide. People with depression are 20 times more likely to commit suicide (<xref ref-type="bibr" rid="ref5">5</xref>). Depression has become the fourth leading cause of death (<xref ref-type="bibr" rid="ref6">6</xref>). Patients with manic episodes tend to exhibit elevated mood, a tendency to anger easily, and excessive sensitivity. Over time, these conditions can lead to physical fatigue and compromised immune systems. Furthermore, they may engage in high-risk activities due to impulsiveness, posing potential harm to themselves and society.</p>
<p>At present, the detection of depressive and manic mood states is usually made by observer-based clinical rating scales, such as the Mood Disorder Questionnaire (MDQ), Quick Inventory of Depressive Symptomatology (QIDS), and Young Mania Rating Scale (YMRS). These scales are used as gold standards to evaluate the severity of manic and depressive symptoms, which are based on the Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition (DSM-5) (<xref ref-type="bibr" rid="ref7">7</xref>). However, the precision of these rating scales depends on participant compliance and the subjective interpretation of practitioners. Therefore, developing continuous, objective assessments of symptom severity would be groundbreaking.</p>
<p>In recent years, many machine learning and deep learning methods have been used for automatic mood recognition from speech. Faurholt-Jepsen et al. (<xref ref-type="bibr" rid="ref8">8</xref>) presented voice analysis as an objective state marker in bipolar disorder in 2016; however, the study only included the speech features of 28 participants. Shin et al. (<xref ref-type="bibr" rid="ref9">9</xref>) suggested a speech biomarker machine learning model for the identification of moderate and serious depression in 2021. Lin et al. (<xref ref-type="bibr" rid="ref10">10</xref>) proposed a deep learning method for diagnosing depressive disorders; Punithavathi et al. (<xref ref-type="bibr" rid="ref11">11</xref>) conducted an empirical investigation that demonstrated the potential of machine learning-based voice recognition techniques for depression prediction; and Shen et al. (<xref ref-type="bibr" rid="ref12">12</xref>) proposed a GRU/BiLSTM-based model for depression detection. The advantages and disadvantages of these studies are shown in <xref ref-type="table" rid="tab1">Table 1</xref>.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Previous studies and their advantages and disadvantages.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">References</th>
<th align="left" valign="top">Authors</th>
<th align="left" valign="top">Advantage</th>
<th align="left" valign="top">Disadvantage</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">(<xref ref-type="bibr" rid="ref8">8</xref>)</td>
<td align="left" valign="top">Faurholt-Jepsen et al.</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>The first study to use speech analysis as an objective biomarker for bipolar disorder.</p>
</list-item>
<list-item>
<p>An objective and non-invasive diagnostic method has been proposed.</p>
</list-item>
</list>
</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>The relatively small number of participants (28 people) may limit the generalization ability of the study.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">(<xref ref-type="bibr" rid="ref9">9</xref>)</td>
<td align="left" valign="top">Shin et al.</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>A machine learning model of voice biomarkers is proposed for identifying moderate and severe depression.</p>
</list-item>
<list-item>
<p>It may provide a new approach for early identification and intervention of depression.</p>
</list-item>
</list>
</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>The model accuracy is relatively low, with sensitivity and specificity of only 0.65 and 0.66.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">(<xref ref-type="bibr" rid="ref10">10</xref>)</td>
<td align="left" valign="top">Lin et al.</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>Using deep learning methods for the diagnosis of depressive disorders may improve accuracy.</p>
</list-item>
<list-item>
<p>Deep learning models may be able to process complex voice data.</p>
</list-item>
</list>
</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>The amount of data in the remission period is relatively small, making it difficult to verify statistical differences.</p>
</list-item>
<list-item>
<p>There is noise in the speech environment, and the noise reduction is insufficient.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">(<xref ref-type="bibr" rid="ref11">11</xref>)</td>
<td align="left" valign="top">Punithavathi et al.</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>An empirical study demonstrates the potential of machine learning-based speech recognition technology in predicting depression.</p>
</list-item>
<list-item>
<p>It provides new possibilities for early identification and intervention of depression.</p>
</list-item>
</list>
</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>The model accuracy needs to be improved.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">(<xref ref-type="bibr" rid="ref12">12</xref>)</td>
<td align="left" valign="top">Shen et al.</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>A GRU/BiLSTM-based model is proposed for depression detection, which may improve the accuracy of diagnosis.</p>
</list-item>
<list-item>
<p>The GRU/BiLSTM structure is able to handle long-term dependencies in sequential data.</p>
</list-item>
</list>
</td>
<td align="left" valign="top">
<list list-type="order">
<list-item>
<p>The method detects depression as a diagnosis rather than the depressive mood state.</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The Transformer model has gained popularity in recent years, first appearing in textual analysis to handle long-range dependencies in text. Since its introduction by Vaswani et al. (<xref ref-type="bibr" rid="ref13">13</xref>), the model has made great progress in both speech and text recognition. Zhang and colleagues (<xref ref-type="bibr" rid="ref14">14</xref>) presented a hybrid model for depression detection that combines Transformers and BiLSTM. Moreover, BERT is a pre-trained Transformer model for text analysis that was introduced by Devlin et al. (<xref ref-type="bibr" rid="ref15">15</xref>). Using Wav2Vec 2.0 (<xref ref-type="bibr" rid="ref16">16</xref>) that has already been trained, Banno et al. (<xref ref-type="bibr" rid="ref17">17</xref>) devised a method for assessing oral proficiency in English. Guo et al. (<xref ref-type="bibr" rid="ref18">18</xref>) developed a pre-trained model appropriate for Chinese in 2022. Therefore, in voice recognition, pre-trained models are now commonly employed, providing novel methods for recognizing emotions.</p>
<p>While speech is a useful tool for categorizing depression and manic episodes in these models, their drawbacks include a lack of characteristics and insufficient examination of the transitions between these states. This study suggests three models to differentiate between depression, mania, and remission emotions in order to address these constraints. It also contrasts these models to find out if using pre-trained models improves classification accuracy. This study developed a voice-based machine learning model for diagnosing mania and depression by analyzing 1,337 voice messages from 93 participants. We suggest that voice can function as an objective marker for supplementary diagnosis of emotional states by the analysis of objective speech data, allowing for the predictive forecasting of emotional phase transitions.</p>
</sec>
<sec sec-type="materials|methods" id="sec2">
<label>2</label>
<title>Materials and methods</title>
<sec id="sec3">
<label>2.1</label>
<title>Data sample</title>
<p>A self-monitoring app named MoodMirror was loaded onto the participants&#x2019; smartphones, and an alarm was set to sound once a day at a time of their choosing to remind the patients to provide electronic self-monitored data. The participants completed all assessment questionnaires, verified their informed consent, and obtained trial information via the app.</p>
<p>In this current longitudinal investigation, the following hypotheses were tested using the MoodMirror system in participants presenting with moderate to severe degrees of manic and depressed symptoms: In naturalistic settings, voice features from ordinary life that were extracted using the &#x201C;mood diary&#x201D; module may distinguish between different affective states. Participants are asked to describe their present naturalistic mood state in the &#x201C;mood diary&#x201D; module stated above. Participants&#x2019; smartphones recorded audio at a sample rate of 16&#x2009;kHz.</p>
<p>Every participant was chosen from Peking University Sixth Hospital and provided written informed consent electronically via the MoodMirror app. Based on the subjects&#x2019; current depressive and manic states, a sample of 1,337 voice messages from 93 subjects was divided into three groups: the depression mood state group (431 voice messages from <italic>n</italic>&#x2009;=&#x2009;39), the mania mood state group (208 voice messages from <italic>n</italic>&#x2009;=&#x2009;20), and the remission group (698 voice messages from <italic>n</italic>&#x2009;=&#x2009;34). The demographic information of the participants is shown in <xref ref-type="table" rid="tab2">Table 2</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Comparing the demographics based on various emotional states.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th/>
<th align="center" valign="top">Depressive</th>
<th align="center" valign="top">Manic</th>
<th align="center" valign="top">Remission</th>
<th align="center" valign="top"><italic>F/&#x03C7;<sup>2</sup></italic></th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">N</td>
<td/>
<td align="center" valign="middle">39 (41.94%)</td>
<td align="center" valign="middle">20 (21.51%)</td>
<td align="center" valign="middle">34 (36.56%)</td>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">Gender</td>
<td align="left" valign="middle">Female</td>
<td align="center" valign="middle">19 (48.72%)</td>
<td align="center" valign="middle">11 (55.00%)</td>
<td align="center" valign="middle">27 (79.42%)</td>
<td align="center" valign="middle">6.86</td>
<td align="center" valign="middle">0.032</td>
</tr>
<tr>
<td/>
<td align="left" valign="middle">Male</td>
<td align="center" valign="middle">19 (48.72%)</td>
<td align="center" valign="middle">8 (40.00%)</td>
<td align="center" valign="middle">7 (20.58%)</td>
<td/>
<td/>
</tr>
<tr>
<td/>
<td align="left" valign="middle">Absence</td>
<td align="center" valign="middle">1 (2.56%)</td>
<td align="center" valign="middle">1 (5.00%)</td>
<td align="center" valign="middle">0 (0.00%)</td>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">Age (years)</td>
<td/>
<td align="center" valign="middle">29.74&#x2009;&#x00B1;&#x2009;12.04</td>
<td align="center" valign="middle">29.68&#x2009;&#x00B1;&#x2009;8.03</td>
<td align="center" valign="middle">42.05&#x2009;&#x00B1;&#x2009;9.14</td>
<td align="center" valign="middle">13.15</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="middle">Education (years)</td>
<td/>
<td align="center" valign="middle">14.32&#x2009;&#x00B1;&#x2009;2.69</td>
<td align="center" valign="middle">14.06&#x2009;&#x00B1;&#x2009;3.45</td>
<td align="center" valign="middle">17.84&#x2009;&#x00B1;&#x2009;1.87</td>
<td align="center" valign="middle">17.46</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="middle">Marriage</td>
<td align="left" valign="middle">Married</td>
<td align="center" valign="middle">5 (12.82%)</td>
<td align="center" valign="middle">4 (20.00%)</td>
<td align="center" valign="middle">24 (70.59%)</td>
<td align="center" valign="middle">32.77</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td/>
<td align="left" valign="middle">Unmarried</td>
<td align="center" valign="middle">31 (79.49%)</td>
<td align="center" valign="middle">15 (75.00%)</td>
<td align="center" valign="middle">8 (23.53%)</td>
<td/>
<td/>
</tr>
<tr>
<td/>
<td align="left" valign="middle">Divorced</td>
<td align="center" valign="middle">2 (5.13%)</td>
<td align="center" valign="middle">0 (0.00%)</td>
<td align="center" valign="middle">0 (0.00%)</td>
<td/>
<td/>
</tr>
<tr>
<td/>
<td align="left" valign="middle">Absence</td>
<td align="center" valign="middle">1 (2.56%)</td>
<td align="center" valign="middle">1 (5.00%)</td>
<td align="center" valign="middle">2 (5.88%)</td>
<td/>
<td/>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Clinical assessments</title>
<p>The MoodMirror app incorporates three self-evaluated scales that are considered gold standards for assessing manic and depressive mood states: the Mood Disorder Questionnaire (MDQ) (<xref ref-type="bibr" rid="ref19">19</xref>), the Quick Inventory of Depressive Symptomatology (QIDS) (<xref ref-type="bibr" rid="ref20">20</xref>) and the Young Mania Rating Scale (YMRS) (<xref ref-type="bibr" rid="ref21">21</xref>). Concurrently, the voice recordings were gathered within the application. <xref ref-type="table" rid="tab3">Table 3</xref> displays the various combinations of cut-offs that were used to determine the remission, depressive, and manic mood states. <xref ref-type="table" rid="tab4">Table 4</xref> provides an illustration of the detailed scale scores. An ANOVA test was used to compare score differences among the three groups.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Criteria for the different mood states.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Mood state</th>
<th align="left" valign="top">Criterion</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Remission</td>
<td align="center" valign="top">MDQ&#x2009;&#x003C;&#x2009;7 and YMRS &#x003C; 13 and QIDS &#x003C; 9</td>
</tr>
<tr>
<td align="left" valign="middle">Depressive</td>
<td align="center" valign="top">MDQ&#x2009;&#x003C;&#x2009;7 and YMRS &#x003C; 13 and QIDS &#x2265; 9</td>
</tr>
<tr>
<td align="left" valign="middle">Manic</td>
<td align="center" valign="top">(MDQ&#x2009;&#x2265; 7 or YMRS &#x2265;&#x2009;13) and QIDS &#x2265;&#x2009;9</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Clinical traits according to various emotional states.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th/>
<th align="center" valign="top">Depressive</th>
<th align="center" valign="top">Manic</th>
<th align="center" valign="top">Remission</th>
<th align="center" valign="top"><italic>F</italic></th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle"><italic>N</italic></td>
<td/>
<td align="center" valign="middle">39</td>
<td align="center" valign="middle">20</td>
<td align="center" valign="middle">34</td>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">MDQ</td>
<td align="left" valign="middle">Mean</td>
<td align="center" valign="middle">1.25</td>
<td align="center" valign="middle">9.00</td>
<td align="center" valign="middle">1.27</td>
<td align="center" valign="middle">208.44</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td/>
<td align="left" valign="middle">SD</td>
<td align="center" valign="middle">1.58</td>
<td align="center" valign="middle">2.67</td>
<td align="center" valign="middle">1.51</td>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">QIDS</td>
<td align="left" valign="middle">Mean</td>
<td align="center" valign="middle">17.25</td>
<td align="center" valign="middle">22.18</td>
<td align="center" valign="middle">2.89</td>
<td align="center" valign="middle">153.59</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td/>
<td align="left" valign="middle">SD</td>
<td align="center" valign="middle">5.82</td>
<td align="center" valign="middle">6.97</td>
<td align="center" valign="middle">2.04</td>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">YMRS</td>
<td align="left" valign="middle">Mean</td>
<td align="center" valign="middle">2.77</td>
<td align="center" valign="middle">13.59</td>
<td align="center" valign="middle">1.67</td>
<td align="center" valign="middle">58.57</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td/>
<td align="left" valign="middle">SD</td>
<td align="center" valign="middle">3.79</td>
<td align="center" valign="middle">11.62</td>
<td align="center" valign="middle">2.92</td>
<td/>
<td/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>MDQ, Mood Disorder Questionnaire; QIDS, Quick Inventory of Depressive Symptomatology; YMRS, Young Mania Rating Scale.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Voice features</title>
<p>In this study, we used a total of 22 voice features: 8 time- and frequency-domain features (zero-crossing rate, short-term energy, short-term energy entropy, spectral centroid, spectral spread, spectral entropy, spectral flux, spectral rolloff), 13 Mel-Frequency Cepstral Coefficients (MFCC) features, and the duration of the voice recording. Speech contains rhythm and tempo, and longer segments provide more audio information. Thus, feature extraction from longer speech segments captures crucial information more precisely. Longer speeches allow for more fine-grained segments during short-time framing, leading to more accurate information extraction. Consequently, speech duration impacts speech classification, and we included it as a feature in our analysis. The specific meanings of each feature are presented in <xref ref-type="table" rid="tab5">Table 5</xref>.</p>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>Definitions of the voice features.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">ID</th>
<th align="left" valign="top">Feature</th>
<th align="left" valign="top">Definition</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">1</td>
<td align="left" valign="top">Zero-crossing rate</td>
<td align="left" valign="top">The number of times a signal changes from positive to negative or from negative to positive within a unit of time</td>
</tr>
<tr>
<td align="left" valign="top">2</td>
<td align="left" valign="top">Short-term energy</td>
<td align="left" valign="top">The magnitude of sound energy within a certain period of time</td>
</tr>
<tr>
<td align="left" valign="top">3</td>
<td align="left" valign="top">The short-term energy entropy</td>
<td align="left" valign="top">The energy distribution characteristics of speech signals in the time domain, which is obtained by calculating the information entropy of short-term energy</td>
</tr>
<tr>
<td align="left" valign="top">4</td>
<td align="left" valign="top">Spectral centroid</td>
<td align="left" valign="top">The central position of the spectral distribution of an audio signal, the location where the spectral energy is concentrated</td>
</tr>
<tr>
<td align="left" valign="top">5</td>
<td align="left" valign="top">Spectral spread</td>
<td align="left" valign="top">The distribution of an audio signal around its spectral center, as well as the dispersion degree of spectral energy</td>
</tr>
<tr>
<td align="left" valign="top">6</td>
<td align="left" valign="top">Spectral entropy</td>
<td align="left" valign="top">The relationship between the power spectrum and entropy rate of an audio signal, which can be used to describe the complexity and randomness of the signal</td>
</tr>
<tr>
<td align="left" valign="top">7</td>
<td align="left" valign="top">Spectral flux</td>
<td align="left" valign="top">The rate of spectral changes in an audio signal between adjacent time frames, reflecting the dynamic characteristics of the audio signal</td>
</tr>
<tr>
<td align="left" valign="top">8</td>
<td align="left" valign="top">Spectral rolloff</td>
<td align="left" valign="top">The frequency below which a specified percentage of the total spectral energy of an audio signal is concentrated</td>
</tr>
<tr>
<td align="left" valign="top">9&#x2013;21</td>
<td align="left" valign="top">Mel-Frequency Cepstral Coefficients (MFCC)</td>
<td align="left" valign="top">It is primarily employed to convert audio signals into compact and information-rich representations for tasks such as speech recognition and speaker identification (<xref ref-type="bibr" rid="ref22">22</xref>)</td>
</tr>
<tr>
<td align="left" valign="top">22</td>
<td align="left" valign="top">Duration</td>
<td align="left" valign="top">The duration of a speech utterance</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In general, a smaller value of spectral centroid indicated that the spectral energy of the audio signal was more concentrated in the low-frequency range. A greater spectral spread indicated a wider distribution of spectral energy across the frequency domain in the audio signal. Spectral rolloff represented the frequency below which a specified percentage of the total spectral energy was contained. Mel-Frequency Cepstral Coefficients (MFCC) was a commonly used feature extraction technique in speech and audio signal processing. By extracting MFCCs, audio signals were transformed into a compact set of feature vectors, which could be more easily utilized for tasks such as classification, recognition, or other tasks using machine learning algorithms.</p>
<p>The above-mentioned features id1&#x2013;id21 were extracted from each frame, and the average of all frame features was then calculated to serve as the feature representation of the speech. The duration of speech was also used as a feature. Utilizing &#x03C7;<sup>2</sup> tests for detecting differences between groups, we selected features with significant differences to train the model. <xref ref-type="table" rid="tab6">Table 6</xref> shows the inter-group comparison of features for the three categories. The violin plots representing the distributions of three groups of speech features are shown in <xref ref-type="supplementary-material" rid="SM1">Supplementary File 1</xref>. Features were extracted using the open-source program pyAudioAnalysis (<xref ref-type="bibr" rid="ref23">23</xref>) in Python version 3.6.1.</p>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>Differences in features of three types of speech.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Feature</th>
<th align="left" valign="top">Depressive</th>
<th align="left" valign="top">Manic</th>
<th align="left" valign="top">Remission</th>
<th align="left" valign="top"><italic>&#x03C7;<sup>2</sup></italic></th>
<th align="left" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Zero-crossing rate</td>
<td align="center" valign="top">0.130&#x2009;&#x00B1;&#x2009;0.057</td>
<td align="center" valign="top">0.120&#x2009;&#x00B1;&#x2009;0.048</td>
<td align="center" valign="middle">0.105&#x2009;&#x00B1;&#x2009;0.039</td>
<td align="center" valign="middle">101.993</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Short-term energy</td>
<td align="center" valign="top">0.020&#x2009;&#x00B1;&#x2009;0.032</td>
<td align="center" valign="top">0.025&#x2009;&#x00B1;&#x2009;0.031</td>
<td align="center" valign="middle">0.013&#x2009;&#x00B1;&#x2009;0.023</td>
<td align="center" valign="middle">93.421</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">The short-term energy entropy</td>
<td align="center" valign="top">3.024&#x2009;&#x00B1;&#x2009;0.220</td>
<td align="center" valign="top">2.935&#x2009;&#x00B1;&#x2009;0.197</td>
<td align="center" valign="middle">2.977&#x2009;&#x00B1;&#x2009;0.079</td>
<td align="center" valign="middle">246.895</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Spectral centroid</td>
<td align="center" valign="top">0.229&#x2009;&#x00B1;&#x2009;0.067</td>
<td align="center" valign="top">0.214&#x2009;&#x00B1;&#x2009;0.060</td>
<td align="center" valign="middle">0.196&#x2009;&#x00B1;&#x2009;0.045</td>
<td align="center" valign="middle">138.894</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Spectral spread</td>
<td align="center" valign="top">0.221&#x2009;&#x00B1;&#x2009;0.029</td>
<td align="center" valign="top">0.207&#x2009;&#x00B1;&#x2009;0.030</td>
<td align="center" valign="middle">0.206&#x2009;&#x00B1;&#x2009;0.017</td>
<td align="center" valign="middle">183.481</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Spectral entropy</td>
<td align="center" valign="top">1.055&#x2009;&#x00B1;&#x2009;0.560</td>
<td align="center" valign="top">0.905&#x2009;&#x00B1;&#x2009;0.475</td>
<td align="center" valign="middle">0.856&#x2009;&#x00B1;&#x2009;0.373</td>
<td align="center" valign="middle">52.667</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Spectral flux</td>
<td align="center" valign="top">0.015&#x2009;&#x00B1;&#x2009;0.012</td>
<td align="center" valign="top">0.017&#x2009;&#x00B1;&#x2009;0.009</td>
<td align="center" valign="middle">0.014&#x2009;&#x00B1;&#x2009;0.006</td>
<td align="center" valign="middle">60.850</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Spectral rolloff</td>
<td align="center" valign="top">0.231&#x2009;&#x00B1;&#x2009;0.155</td>
<td align="center" valign="top">0.210&#x2009;&#x00B1;&#x2009;0.135</td>
<td align="center" valign="middle">0.175&#x2009;&#x00B1;&#x2009;0.107</td>
<td align="center" valign="middle">67.486</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC1</td>
<td align="center" valign="top">&#x2212;29.834&#x2009;&#x00B1;&#x2009;4.72</td>
<td align="center" valign="top">&#x2212;28.578&#x2009;&#x00B1;&#x2009;2.697</td>
<td align="center" valign="middle">&#x2212;28.361&#x2009;&#x00B1;&#x2009;2.49</td>
<td align="center" valign="middle">108.089</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC2</td>
<td align="center" valign="top">1.886&#x2009;&#x00B1;&#x2009;0.671</td>
<td align="center" valign="top">2.027&#x2009;&#x00B1;&#x2009;0.520</td>
<td align="center" valign="middle">2.185&#x2009;&#x00B1;&#x2009;0.417</td>
<td align="center" valign="middle">72.105</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC3</td>
<td align="center" valign="top">&#x2212;0.074&#x2009;&#x00B1;&#x2009;0.359</td>
<td align="center" valign="top">0.123&#x2009;&#x00B1;&#x2009;0.545</td>
<td align="center" valign="middle">0.183&#x2009;&#x00B1;&#x2009;0.213</td>
<td align="center" valign="middle">149.193</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC4</td>
<td align="center" valign="top">0.225&#x2009;&#x00B1;&#x2009;0.287</td>
<td align="center" valign="top">0.176&#x2009;&#x00B1;&#x2009;0.317</td>
<td align="center" valign="middle">0.339&#x2009;&#x00B1;&#x2009;0.165</td>
<td align="center" valign="middle">77.060</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC5</td>
<td align="center" valign="top">&#x2212;0.011&#x2009;&#x00B1;&#x2009;0.210</td>
<td align="center" valign="top">&#x2212;0.016&#x2009;&#x00B1;&#x2009;0.208</td>
<td align="center" valign="middle">0.213&#x2009;&#x00B1;&#x2009;0.186</td>
<td align="center" valign="middle">329.659</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC6</td>
<td align="center" valign="top">0.064&#x2009;&#x00B1;&#x2009;0.149</td>
<td align="center" valign="top">0.054&#x2009;&#x00B1;&#x2009;0.157</td>
<td align="center" valign="middle">0.067&#x2009;&#x00B1;&#x2009;0.125</td>
<td align="center" valign="middle">2.786</td>
<td align="center" valign="middle">0.248</td>
</tr>
<tr>
<td align="left" valign="top">MFCC7</td>
<td align="center" valign="top">&#x2212;0.055&#x2009;&#x00B1;&#x2009;0.151</td>
<td align="center" valign="top">0.034&#x2009;&#x00B1;&#x2009;0.169</td>
<td align="center" valign="middle">0.042&#x2009;&#x00B1;&#x2009;0.089</td>
<td align="center" valign="middle">165.106</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC8</td>
<td align="center" valign="top">&#x2212;0.085&#x2009;&#x00B1;&#x2009;0.146</td>
<td align="center" valign="top">0.020&#x2009;&#x00B1;&#x2009;0.109</td>
<td align="center" valign="middle">&#x2212;0.002&#x2009;&#x00B1;&#x2009;0.087</td>
<td align="center" valign="middle">123.477</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC9</td>
<td align="center" valign="top">&#x2212;0.039&#x2009;&#x00B1;&#x2009;0.119</td>
<td align="center" valign="top">0.003&#x2009;&#x00B1;&#x2009;0.150</td>
<td align="center" valign="middle">&#x2212;0.014&#x2009;&#x00B1;&#x2009;0.081</td>
<td align="center" valign="middle">17.099</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC10</td>
<td align="center" valign="top">&#x2212;0.099&#x2009;&#x00B1;&#x2009;0.134</td>
<td align="center" valign="top">&#x2212;0.045&#x2009;&#x00B1;&#x2009;0.123</td>
<td align="center" valign="middle">&#x2212;0.217&#x2009;&#x00B1;&#x2009;0.091</td>
<td align="center" valign="middle">379.390</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC11</td>
<td align="center" valign="top">&#x2212;0.106&#x2009;&#x00B1;&#x2009;0.115</td>
<td align="center" valign="top">&#x2212;0.015&#x2009;&#x00B1;&#x2009;0.138</td>
<td align="center" valign="middle">&#x2212;0.047&#x2009;&#x00B1;&#x2009;0.071</td>
<td align="center" valign="middle">111.395</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC12</td>
<td align="center" valign="top">&#x2212;0.035&#x2009;&#x00B1;&#x2009;0.115</td>
<td align="center" valign="top">0.012&#x2009;&#x00B1;&#x2009;0.124</td>
<td align="center" valign="middle">&#x2212;0.015&#x2009;&#x00B1;&#x2009;0.061</td>
<td align="center" valign="middle">39.043</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">MFCC13</td>
<td align="center" valign="top">&#x2212;0.064&#x2009;&#x00B1;&#x2009;0.104</td>
<td align="center" valign="top">&#x2212;0.054&#x2009;&#x00B1;&#x2009;0.110</td>
<td align="center" valign="middle">&#x2212;0.106&#x2009;&#x00B1;&#x2009;0.063</td>
<td align="center" valign="middle">90.403</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
<tr>
<td align="left" valign="top">Duration</td>
<td align="center" valign="top">45.005&#x2009;&#x00B1;&#x2009;28.981</td>
<td align="center" valign="top">10.371&#x2009;&#x00B1;&#x2009;10.989</td>
<td align="center" valign="middle">32.720&#x2009;&#x00B1;&#x2009;15.234</td>
<td align="center" valign="middle">212.314</td>
<td align="center" valign="middle">&#x003C;0.001</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec6">
<label>2.4</label>
<title>Machine learning models</title>
<p>After the feature extraction process from the original speech recordings, the leave-one-subject-out validation method was used. The Gate Recurrent Unit (GRU) (<xref ref-type="bibr" rid="ref12">12</xref>) is a gating mechanism in recurrent neural networks (<xref ref-type="bibr" rid="ref24">24</xref>), which is similar to LSTM (long short-term memory) (<xref ref-type="bibr" rid="ref25">25</xref>) but lacks an output gate and has fewer parameters. GRU is included in this study since it performs similarly to LSTM with fewer parameters on tasks including speech signal modeling and natural language processing (<xref ref-type="bibr" rid="ref26">26</xref>, <xref ref-type="bibr" rid="ref27">27</xref>).</p>
<p>Bidirectional Long Short-Term Memory (BiLSTM) (<xref ref-type="bibr" rid="ref28">28</xref>) is a further development of LSTM and BiLSTM combines the forward hidden layer and the backward hidden layer, which can access both the preceding and succeeding information (<xref ref-type="bibr" rid="ref28">28</xref>). The application of BiLSTM in speech processing typically involves leveraging its bidirectional recurrent structure to capture temporal information in speech signals, thereby improving performance in tasks such as speech recognition and speech emotion recognition.</p>
<p>Wav2vec2.0 (<xref ref-type="bibr" rid="ref16">16</xref>) is a speech feature extraction model proposed by the Facebook AI Research team, which utilizes Transformer as its underlying architecture. The structure of transformers is illustrated in <xref ref-type="fig" rid="fig1">Figure 1</xref>. The Chinese-speech-pretrain model used in this study is developed by TencentGames and Tencent Zhiji based on Wav2vec2.0, which has been trained using a large amount of unsupervised learning data. These data primarily originate from YouTube and Podcast, covering a wide range of recording scenarios, background noises and speaking styles. The model was trained using the Fairseq toolkit (<xref ref-type="bibr" rid="ref29">29</xref>) and employed the 10,000&#x2009;h Chinese dataset from the train_1 set of WenetSpeech (<xref ref-type="bibr" rid="ref30">30</xref>) as the foundation. This self-supervised learning approach enables the model to learn the deep structure of speech without annotations, enhancing its comprehension of Chinese speech.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>The structure of transformer.</p>
</caption>
<graphic xlink:href="fneur-15-1394210-g001.tif"/>
</fig>
<p>Linear Discriminant Analysis (LDA) is a linear learning algorithm originally proposed by Fisher (<xref ref-type="bibr" rid="ref31">31</xref>) in the field of classification. For binary classification problems, LDA projects both categories of data onto a single line to ensure that data within the same category are as close as possible, while data from different categories are as far apart as possible, which means minimizing the intra-class distance and maximizing the inter-class distance. When a new sample is encountered, it is also mapped onto this line, and the category of the new sample is determined based on the position of its projection point (<xref ref-type="bibr" rid="ref32">32</xref>).</p>
</sec>
<sec id="sec7">
<label>2.5</label>
<title>Processing</title>
<p>First of all, voice recordings and questionnaire data (QIDS, MDQ, and YMRS scores) were collected from 93 patients. The questionnaire responses were organized according to <xref ref-type="table" rid="tab3">Table 3</xref> criteria to classify patients into depressive, manic, or remission states based on their mood. To mitigate environmental noise variability in self-recorded voices, a band-pass filter (200&#x2013;4,000&#x2009;Hz) was applied to improve the signal-to-noise ratio (SNR). The Kruskal-Wallis test was used to compare SNR differences among the three speech groups, while the Mann&#x2013;Whitney U test was employed to analyze differences between any two speech types. Furthermore, an endpoint detection method based on short-time energy and zero-crossing rate was implemented to remove silent intervals at the beginning and end of each speech sample. Subsequently, speech signals were segmented into frames with a frame duration of 30&#x2009;ms and a 15&#x2009;ms frame shift. Each frame underwent Hamming windowing to prepare for subsequent feature extraction.</p>
<p>Initially, the 93 participants were numbered from 1 to 93, and voice features were extracted for each individual. The leave-one-subject-out cross-validation method was employed for model validation. Specifically, the voice data of participant ID1 served as the test set, while the data from the remaining 92 participants constituted the training set. To address class imbalance, the Synthetic Minority Over-sampling Technique (SMOTE) was applied during training data preparation. For evaluating the models trained using Linear Discriminant Analysis (LDA), Gate Recurrent Unit (GRU), and Bi-directional Long Short-Term Memory (BiLSTM), the voice data from participant ID1 was input into each trained model. The accuracy, specificity, and sensitivity of these models in recognizing voice emotions were computed accordingly. Subsequently, participant ID2&#x2019;s voice data was used as the test set, and the process of training the models with the data from the remaining 92 participants was repeated. This cycle was iterated until each of the 93 participants had served as the test set at least once, allowing for a comprehensive evaluation of the trained models. Finally, the average performance across all 93 test iterations was calculated to determine the overall effectiveness of the models.</p>
<p>In addition, default parameters were used in LDA. And for GRU and BiLSTM, grid search was employed to determine the optimal parameters. The parameters to be determined include batch-size, with options of 30, 40, 50; dropout rate with options of 0.1, 0.2, 0.3; and learning rate with options of 0.006, 0.0006, 0.00006. Both GRU and BiLSTM use ReLU and Softmax as the activation functions for the fully connected layers. The aforementioned training process was implemented using PyTorch (<xref ref-type="bibr" rid="ref33">33</xref>). As a result, the batch-size, dropout and learning rate of GRU were 40, 0.3 and 0.0006, respectively, while for BiLSTM, the batch-size, dropout and learning rate were 40, 0.2 and 0.0006, respectively.</p>
<p>This study retrieved features using the pre-trained model Wav2vec2.0, and the best results were obtained by training using the aforementioned ideal parameters and model.</p>
</sec>
</sec>
<sec sec-type="results" id="sec8">
<label>3</label>
<title>Results</title>
<p><xref ref-type="table" rid="tab2">Table 2</xref> presents demographic information. Gender, marital status, years of schooling, and age showed significant differences (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.05) across the three groups.</p>
<p><xref ref-type="table" rid="tab4">Table 4</xref> illustrates that there are differences among the three groups of people in the three scales of MDQ, QIDS and YMRS (<italic>p</italic>&#x2009;&#x003C;&#x2009;0.05).</p>
<p>The average signal-to-noise ratio (SNR) for depressive mood speech is 5.838&#x2009;&#x00B1;&#x2009;3.211, while the average SNR for manic mood speech is 5.586&#x2009;&#x00B1;&#x2009;3.211, and the SNR for the speech during remission period is 4.561&#x2009;&#x00B1;&#x2009;2.281. Significant differences in SNR were found among these groups (K&#x2009;=&#x2009;143.954, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.05). Specifically, there was no significant difference in SNR between depressive and manic mood speech (U&#x2009;=&#x2009;42,872, <italic>p</italic>&#x2009;=&#x2009;0.379). However, SNR differed significantly between the remission period and depressive mood speech (U&#x2009;=&#x2009;101,271, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.05), as well as between the remission period and manic mood speech (U&#x2009;=&#x2009;38,077, <italic>p</italic>&#x2009;&#x003C;&#x2009;0.05).</p>
<p>As is shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>, the speech length of people in the depressive mood state is concentrated in 40&#x2013;60&#x2009;s, the speech length of the manic mood state is concentrated in 1&#x2013;30&#x2009;s, and the speech length of the people in the remission is concentrated in 20&#x2013;40&#x2009;s. The speech duration typically ranges from 1 to 60&#x2009;s, with graphical representation continuing beyond 60&#x2009;s. This extension occurs due to a peak near the 60-s mark, where the graph automatically extends around the peak instead of abruptly stopping. The duration of speech varies with different emotions, which correlates with emotional states. Individuals experiencing depression often communicate via electronic devices like mobile phones, showing a strong inclination to express themselves (<xref ref-type="bibr" rid="ref34">34</xref>, <xref ref-type="bibr" rid="ref35">35</xref>). They may exhibit slower speech due to lower mood and energy levels, along with reduced interaction with others. This slower articulation could stem from delayed thought processes. Conversely, individuals in a manic state tend to speak rapidly, reflecting impatience and haste, resulting in shorter recorded speech durations.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Density map showing the dispersion of voice recording duration.</p>
</caption>
<graphic xlink:href="fneur-15-1394210-g002.tif"/>
</fig>
<p>Moreover, <xref ref-type="fig" rid="fig3">Figure 3</xref> shows participants&#x2019; Mel spectrograms in depressive, manic and remission mood states. In a Mel spectrogram, time is depicted on the horizontal axis and frequency on the vertical axis, with brighter areas indicating higher energy levels. The figure illustrates that the sound energy across the three different emotional states is primarily concentrated in lower frequencies. Specifically, the depressive state and remission period exhibit concentration below 2,048&#x2009;Hz, while the manic mood is concentrated below 1,024&#x2009;Hz. A comparative analysis reveals that compared to the remission period, the energy distribution in the depressive state is relatively more dispersed, while in the remission period, energy is denser than 512&#x2009;Hz. This observation highlights discernible distinctions in energy and frequency among the three emotional states, thereby facilitating the discrimination of speech emotions. As can be seen in <xref ref-type="fig" rid="fig3">Figure 3</xref>, the MFCC plots of the depressive and the remission are similar, while the manic mood state is significantly different from them. This might be attributed to depression and mania being two extremes of emotion, thus potentially exhibiting starkly different patterns in speech features.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Participants&#x2019; Mel spectrograms showing depressive, manic and remission mood states.</p>
</caption>
<graphic xlink:href="fneur-15-1394210-g003.tif"/>
</fig>
<p><xref ref-type="table" rid="tab6">Table 6</xref> demonstrates the contrasting differences of 22 features among depressive mood, manic mood, and remission mood voices. It is evident from <xref ref-type="table" rid="tab6">Table 6</xref> that, except for the MFCC6 feature, which showed no significant difference among the three voice types (<italic>p</italic> =&#x2009;0.248), the remaining 21 features exhibited significant differences (<italic>p</italic> &#x003C;&#x2009;0.001). <xref ref-type="supplementary-material" rid="SM1">Supplementary File 1</xref> presents violin plots of the 21 features excluding MFCC6, which indicate variations in the distributions of these 21 features across the three types of voice. For detailed information, please refer to <xref ref-type="supplementary-material" rid="SM1">Supplementary File 1</xref>. These 21 features are the selected ones for model training.</p>
<p>Lastly, by using the selected 21 features to train the model, the confusion matrix and performance of the model are shown in <xref ref-type="table" rid="tab7">Tables 7</xref>, <xref ref-type="table" rid="tab8">8</xref>. The Chinese-speech-pretrain-GRU achieves an accuracy of 80.2%, outperforming LDA (78.9%), BiLSTM (69.5%), and GRU (70.0%). BiLSTM demonstrates 82.2% specificity and 56.4% sensitivity for detecting depressive mood, and 85.3% specificity and 46.2% sensitivity for manic mood; it also achieves 60.1% precision and 58.2% F1-score for detecting depressive mood, and 36.6% precision and 40.8% F1-score for manic mood. Similarly, GRU exhibits 57.3% sensitivity, 82.4% specificity, 60.8% precision and 59.0% F1-score for depressive mood, and 56.3% sensitivity, 86.2% specificity, 42.8% precision and 48.6% F1-score for manic mood.</p>
<table-wrap position="float" id="tab7">
<label>Table 7</label>
<caption>
<p>The confusion matrix of LDA, BiLSTM, GRU and Chinese-speech-pretrain+GRU.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th/>
<th align="center" valign="top" colspan="3">Predicted</th>
<th align="center" valign="top" colspan="3">Predicted</th>
</tr>
<tr>
<th/>
<th/>
<th align="center" valign="top">Remission</th>
<th align="center" valign="top">Depressive</th>
<th align="center" valign="top">Manic</th>
<th align="center" valign="top">Remission</th>
<th align="center" valign="top">Depressive</th>
<th align="center" valign="top">Manic</th>
</tr>
<tr>
<th/>
<th/>
<th align="center" valign="top" colspan="3">LDA</th>
<th align="center" valign="top" colspan="3">BiLSTM</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="3">True label</td>
<td align="left" valign="top">Remission</td>
<td align="center" valign="top">622</td>
<td align="center" valign="top">44</td>
<td align="center" valign="top">32</td>
<td align="center" valign="top">593</td>
<td align="center" valign="top">58</td>
<td align="center" valign="top">47</td>
</tr>
<tr>
<td align="left" valign="top">Depressive</td>
<td align="center" valign="top">44</td>
<td align="center" valign="top">303</td>
<td align="center" valign="top">84</td>
<td align="center" valign="top">69</td>
<td align="center" valign="top">243</td>
<td align="center" valign="top">119</td>
</tr>
<tr>
<td align="left" valign="top">Manic</td>
<td align="center" valign="top">30</td>
<td align="center" valign="top">48</td>
<td align="center" valign="top">130</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">103</td>
<td align="center" valign="top">96</td>
</tr>
<tr>
<td/>
<td/>
<td align="center" valign="top" colspan="3"><bold>GRU</bold></td>
<td align="center" valign="top" colspan="3">
<bold>Chinese-speech-pretrain + GRU</bold>
</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="3">True label</td>
<td align="left" valign="top">Remission</td>
<td align="center" valign="top">572</td>
<td align="center" valign="top">89</td>
<td align="center" valign="top">37</td>
<td align="center" valign="top">625</td>
<td align="center" valign="top">39</td>
<td align="center" valign="top">34</td>
</tr>
<tr>
<td align="left" valign="top">Depressive</td>
<td align="center" valign="top">65</td>
<td align="center" valign="top">247</td>
<td align="center" valign="top">119</td>
<td align="center" valign="top">22</td>
<td align="center" valign="top">334</td>
<td align="center" valign="top">75</td>
</tr>
<tr>
<td align="left" valign="top">Manic</td>
<td align="center" valign="top">21</td>
<td align="center" valign="top">70</td>
<td align="center" valign="top">117</td>
<td align="center" valign="top">7</td>
<td align="center" valign="top">87</td>
<td align="center" valign="top">114</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>LDA, Linear Discriminant Analysis; BiLSTM, Bidirectional Long Short-Term Memory; GRU, The Gate Recurrent Unit.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab8">
<label>Table 8</label>
<caption>
<p>Performance comparison between LDA, BiLSTM, GRU, and Chinese-speech-pretrain+GRU.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top">Remission</th>
<th align="center" valign="top">Depressive</th>
<th align="center" valign="top">Manic</th>
<th align="center" valign="top">Remission</th>
<th align="center" valign="top">Depressive</th>
<th align="center" valign="top">Manic</th>
</tr>
<tr>
<th/>
<th align="center" valign="top" colspan="3">LDA</th>
<th align="center" valign="top" colspan="3">BiLSTM</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Accuracy</td>
<td align="center" valign="top" colspan="3">0.789</td>
<td align="center" valign="top" colspan="3">0.695</td>
</tr>
<tr>
<td align="left" valign="top">Sensitivity</td>
<td align="center" valign="top">0.891</td>
<td align="center" valign="top">0.703</td>
<td align="center" valign="top">0.625</td>
<td align="center" valign="top">0.849</td>
<td align="center" valign="top">0.564</td>
<td align="center" valign="top">0.462</td>
</tr>
<tr>
<td align="left" valign="top">Specificity</td>
<td align="center" valign="top">0.884</td>
<td align="center" valign="top">0.898</td>
<td align="center" valign="top">0.897</td>
<td align="center" valign="top">0.878</td>
<td align="center" valign="top">0.822</td>
<td align="center" valign="top">0.853</td>
</tr>
<tr>
<td align="left" valign="top">Precision</td>
<td align="center" valign="top">0.894</td>
<td align="center" valign="top">0.767</td>
<td align="center" valign="top">0.528</td>
<td align="center" valign="top">0.884</td>
<td align="center" valign="top">0.601</td>
<td align="center" valign="top">0.366</td>
</tr>
<tr>
<td align="left" valign="top">F1-score</td>
<td align="center" valign="top">0.892</td>
<td align="center" valign="top">0.734</td>
<td align="center" valign="top">0.572</td>
<td align="center" valign="top">0.866</td>
<td align="center" valign="top">0.582</td>
<td align="center" valign="top">0.408</td>
</tr>
<tr>
<td/>
<td align="center" valign="top" colspan="3"><bold>GRU</bold></td>
<td align="center" valign="top" colspan="3">
<bold>Chinese-speech-pretrain + GRU</bold>
</td>
</tr>
<tr>
<td align="left" valign="top">Accuracy</td>
<td align="center" valign="top" colspan="3">0.700</td>
<td align="center" valign="top" colspan="3">0.802</td>
</tr>
<tr>
<td align="left" valign="top">Sensitivity</td>
<td align="center" valign="top">0.819</td>
<td align="center" valign="top">0.573</td>
<td align="center" valign="top">0.563</td>
<td align="center" valign="top">0.896</td>
<td align="center" valign="top">0.775</td>
<td align="center" valign="top">0.548</td>
</tr>
<tr>
<td align="left" valign="top">Specificity</td>
<td align="center" valign="top">0.865</td>
<td align="center" valign="top">0.824</td>
<td align="center" valign="top">0.862</td>
<td align="center" valign="top">0.955</td>
<td align="center" valign="top">0.861</td>
<td align="center" valign="top">0.903</td>
</tr>
<tr>
<td align="left" valign="top">Precision</td>
<td align="center" valign="top">0.869</td>
<td align="center" valign="top">0.608</td>
<td align="center" valign="top">0.428</td>
<td align="center" valign="top">0.956</td>
<td align="center" valign="top">0.726</td>
<td align="center" valign="top">0.511</td>
</tr>
<tr>
<td align="left" valign="top">F1-score</td>
<td align="center" valign="top">0.843</td>
<td align="center" valign="top">0.590</td>
<td align="center" valign="top">0.486</td>
<td align="center" valign="top">0.925</td>
<td align="center" valign="top">0.750</td>
<td align="center" valign="top">0.529</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Further, for depressed conditions, the Chinese-speech-pretrain-GRU shows 77.5% sensitivity, 86.1% specificity, 72.6% precision and 75.0% F1-score, and for manic states, 54.8% sensitivity, 90.3% specificity, 51.1% precision and 52.9% F1-score. LDA, in contrast, reaches 62.5% sensitivity and 89.7% specificity, 52.8% precision and 57.2% F1-score for manic states and 70.3% sensitivity, 89.8% specificity, 76.7% precision and 73.4% F1-score for depressed conditions. LDA performs better than BiLSTM and GRU, but Chinese-speech-pretrain-GRU performs better overall. <xref ref-type="table" rid="tab7">Table 7</xref> shows the confusion matrices of the four models. <xref ref-type="table" rid="tab8">Table 8</xref> presents the specific findings.</p>
</sec>
<sec sec-type="discussion" id="sec9">
<label>4</label>
<title>Discussion</title>
<p>This study shows that Chinese-speech-pretrain-GRU, BiLSTM, LDA and GRU can be utilized for voice analysis of mood state detection, and the Chinese-speech-pretrain-GRU can distinguish depressive and manic mood with 80.2% accuracy, the LDA can distinguish depressive and manic mood with 78.9% accuracy, the GRU can distinguish depressive and manic mood with 70.0% accuracy, while the BiLSTM does so with 69.5% accuracy. It means machine learning models can distinguish those mood states through objective speech, and pre-trained models can further extract the information embedded in the speech, thus improving prediction accuracy. Furthermore, speech can serve as a biomarker to differentiate between depressive and manic moods.</p>
<p>In recent years, several scholars have explored multimodal models for identifying depressive and manic emotions. Ye et al. (<xref ref-type="bibr" rid="ref36">36</xref>) introduced a hybrid model integrating voice and text for depression detection. Zheng et al. (<xref ref-type="bibr" rid="ref37">37</xref>) proposed a multitask model capable of simultaneous emotion recognition and depression detection. Alghowinem et al. (<xref ref-type="bibr" rid="ref38">38</xref>) developed a machine learning-based multimodal depression detector, showing significant advancements over unimodal approaches. Future studies could involve not only voice but also text, video, and image modalities. This extension may provide a thorough comprehension of differences between manic, depressed, and remission phases from many angles, improving the precision of emotion identification.</p>
<p>Patients with bipolar disorder have the characteristics of &#x201C;polarization&#x201D; of emotions. The methods and focuses of interventions for manic and depressive episodes are different. During manic episodes, it is necessary to prevent the harm caused by mania. However, during a depressive episode, we need to pay attention to the risk of self-injury and suicide, and consider using antidepressants in combination (<xref ref-type="bibr" rid="ref39">39</xref>). By integrating a model that can effectively identify different phases from mobile devices such as mobile phones, and having patients or their family members regularly upload their voices, remote and regular monitoring of the emotional phases of bipolar patients can be achieved, so that intervention measures for patients can be adjusted in a timely manner. The treatment plan provides a feasible solution to achieve personalized treatment of bipolar disorder and save the human and material costs required for mental illness management. This study also has certain auxiliary value in the diagnosis of bipolar disorder. Because patients lack knowledge of their own disease and may be in a depressive episode when seeking treatment, bipolar disorder is easily misdiagnosed as unipolar depression. The ideas proposed in this study can help doctors discover missed manic phases in clinical practice.</p>
<p>When the sound wave in the voice is affected by aerodynamic factors and generates mechanical vibration, it is converted into a sound source signal (<xref ref-type="bibr" rid="ref40">40</xref>). The voice contains biological acoustic characteristics such as spectrum, prosody, and formant (<xref ref-type="bibr" rid="ref41">41</xref>, <xref ref-type="bibr" rid="ref42">42</xref>). The length of the voice has an impact on the amount and accuracy of the features contained in the voice. Vogel and Morgan found that the length of voice data affects the accuracy of biological characteristics in the voice (<xref ref-type="bibr" rid="ref43">43</xref>). Scherer et al. showed through research that the accuracy of disturbance measurement in the voice was affected by the duration of the voice, and only voice over 3&#x2009;s could provide accurate features (<xref ref-type="bibr" rid="ref44">44</xref>). There are also studies proving that the pitch measurement of long voice is more accurate than that of short voice (<xref ref-type="bibr" rid="ref45">45</xref>, <xref ref-type="bibr" rid="ref46">46</xref>). To conclude, in this study, we observed that patients with different emotional states exhibited varying voice durations, indicating that the information carried in their voices differs. Significant differences in voice durations were found among patients in depressive, manic, and remission states. However, we did not study the impact of voice duration on the internal characteristics of speech. Future research will further analyze the specific impact of duration on these speech characteristics.</p>
<p>This study also has certain limitations. Firstly, the number of participants included in this study is limited, and the use of self-reported scales may lead to inaccuracies in the emotion labels. Secondly, regarding the quantity of speech data, the number of recordings varies among patients, which may introduce bias to the model. Thirdly, in the study, we only used voice as a biomarker to investigate the recognition of emotions. Therefore, in order to enhance the accuracy and generalizability of the results, future studies will increase the number of participants, control the quantity of speech samples per patient, and utilize alternative techniques to ascertain the patients&#x2019; emotional states. In subsequent research, apart from speech, we will introduce information such as images, videos and texts to detect depressive mood, manic mood and remission emotion through a multimodal machine learning approach.</p>
</sec>
<sec sec-type="data-availability" id="sec10">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="sec11">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Medical Ethics Committee of Peking University Sixth Hospital (Institute of Mental Health). The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p>
</sec>
<sec sec-type="author-contributions" id="sec12">
<title>Author contributions</title>
<p>JJ: Writing &#x2013; original draft. WD: Conceptualization, Supervision, Writing &#x2013; review &#x0026; editing. JL: Methodology, Writing &#x2013; original draft. JP: Methodology, Writing &#x2013; original draft. CF: Data curation, Writing &#x2013; review &#x0026; editing. RL: Data curation, Writing &#x2013; review &#x0026; editing. CS: Writing &#x2013; review &#x0026; editing. YM: Conceptualization, Funding acquisition, Writing &#x2013; review &#x0026; editing.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec13">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. The work was supported by the Capital&#x2019;s Funds for Health Improvement and Research (grant numbers 2018-2-4112 and 2020-2Z-4117); the Capital Characteristics of Clinical Application Research provided by the Beijing Municipal Science and Technology Commission (grant number Z171100001017086); and the Space Medical Experiment Project of China Manned Space Program (grant number HYZHXM03006).</p>
</sec>
<ack>
<p>Thanks to all the researchers who participated in this study.</p>
</ack>
<sec sec-type="COI-statement" id="sec14">
<title>Conflict of interest</title>
<p>JJ and CF were employed by Beijing Wanling Pangu Science and Technology Ltd.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="sec15">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec16">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fneur.2024.1394210/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fneur.2024.1394210/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.PDF" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1">
<label>1.</label>
<citation citation-type="other"><person-group person-group-type="author">
<collab id="coll1">Global Health Data Exchange (GHDx)</collab>
</person-group>. <comment>Available at:</comment> <ext-link xlink:href="https://vizhub.healthdata.org/gbd-results/" ext-link-type="uri">https://vizhub.healthdata.org/gbd-results/</ext-link> (<year>2019</year>).</citation>
</ref>
<ref id="ref2">
<label>2.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Phillips</surname> <given-names>ML</given-names></name> <name><surname>Kupfer</surname> <given-names>DJ</given-names></name></person-group>. <article-title>Bipolar disorder diagnosis: challenges and future directions</article-title>. <source>Lancet</source>. (<year>2013</year>) <volume>381</volume>:<fpage>1663</fpage>&#x2013;<lpage>71</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0140-6736(13)60989-7</pub-id>, PMID: <pub-id pub-id-type="pmid">23663952</pub-id></citation>
</ref>
<ref id="ref3">
<label>3.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>YU</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Yu</surname> <given-names>X</given-names></name> <name><surname>Yan</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Prevalence of mental disorders in China: a cross-sectional epidemiological study</article-title>. <source>Lancet Psychiatry</source>. (<year>2019</year>) <volume>6</volume>:<fpage>211</fpage>&#x2013;<lpage>24</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S2215-0366(18)30511-X</pub-id></citation>
</ref>
<ref id="ref4">
<label>4.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Lu</surname> <given-names>H</given-names></name> <name><surname>Zeng</surname> <given-names>H</given-names></name> <name><surname>Zhang</surname> <given-names>S</given-names></name> <name><surname>Du</surname> <given-names>Q</given-names></name> <name><surname>Jiang</surname> <given-names>T</given-names></name> <etal/></person-group>. <article-title>The differential psychological distress of populations affected by the COVID-19 pandemic</article-title>. <source>Brain Behav Immun</source>. (<year>2020</year>) <volume>87</volume>:<fpage>49</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bbi.2020.04.031</pub-id></citation>
</ref>
<ref id="ref5">
<label>5.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>L&#x00E9;pine</surname> <given-names>JP</given-names></name> <name><surname>Briley</surname> <given-names>M</given-names></name></person-group>. <article-title>The increasing burden of depression</article-title>. <source>Neuropsychiatr Dis Treat</source>. (<year>2011</year>) <volume>7</volume>:<fpage>3</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.2147/ndt.s19617</pub-id>, PMID: <pub-id pub-id-type="pmid">21750622</pub-id></citation>
</ref>
<ref id="ref6">
<label>6.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>P</given-names></name> <name><surname>Wang</surname> <given-names>R</given-names></name> <name><surname>Lin</surname> <given-names>H</given-names></name> <name><surname>Zhang</surname> <given-names>F</given-names></name> <name><surname>Tu</surname> <given-names>J</given-names></name> <name><surname>Sun</surname> <given-names>M</given-names></name></person-group>. <article-title>Automatic depression recognition by intelligent speech signal processing: a systematic survey</article-title>. <source>CAAI Trans Intell Technol</source>. (<year>2023</year>) <volume>8</volume>:<fpage>701</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1049/cit2.12113</pub-id></citation>
</ref>
<ref id="ref7">
<label>7.</label>
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Silverman</surname> <given-names>MA</given-names></name>
</person-group>. <article-title>Diagnosing the diagnostic and statistical manual of mental disorders. By Rachel Cooper</article-title>. <source>Psychoanal Q</source>. (<year>2015</year>) <volume>84</volume>:<fpage>239</fpage>&#x2013;<lpage>47</lpage>. doi: <pub-id pub-id-type="doi">10.1002/j.2167-4086.2015.00011.x</pub-id></citation>
</ref>
<ref id="ref8">
<label>8.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Faurholt-Jepsen</surname> <given-names>M</given-names></name> <name><surname>Busk</surname> <given-names>J</given-names></name> <name><surname>Frost</surname> <given-names>M</given-names></name> <name><surname>Vinberg</surname> <given-names>M</given-names></name> <name><surname>Christensen</surname> <given-names>EM</given-names></name> <name><surname>Winther</surname> <given-names>O</given-names></name> <etal/></person-group>. <article-title>Voice analysis as an objective state marker in bipolar disorder</article-title>. <source>Transl Psychiatry</source>. (<year>2016</year>) <volume>6</volume>:<fpage>e856</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1038/tp.2016.123</pub-id>, PMID: <pub-id pub-id-type="pmid">27434490</pub-id></citation>
</ref>
<ref id="ref9">
<label>9.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shin</surname> <given-names>D</given-names></name> <name><surname>Cho</surname> <given-names>WI</given-names></name> <name><surname>Park</surname> <given-names>CHK</given-names></name> <name><surname>Rhee</surname> <given-names>SJ</given-names></name> <name><surname>Kim</surname> <given-names>MJ</given-names></name> <name><surname>Lee</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Detection of minor and major depression through voice as a biomarker using machine learning</article-title>. <source>J Clin Med</source>. (<year>2021</year>) <volume>10</volume>:<fpage>3046</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jcm10143046</pub-id>, PMID: <pub-id pub-id-type="pmid">34300212</pub-id></citation>
</ref>
<ref id="ref10">
<label>10.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>Y</given-names></name> <name><surname>Liyanage</surname> <given-names>BN</given-names></name> <name><surname>Sun</surname> <given-names>Y</given-names></name> <name><surname>Lu</surname> <given-names>T</given-names></name> <name><surname>Zhu</surname> <given-names>Z</given-names></name> <name><surname>Liao</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>A deep learning-based model for detecting depression in senior population</article-title>. <source>Front Psych</source>. (<year>2022</year>) <volume>13</volume>:<fpage>1016676</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyt.2022.1016676</pub-id>, PMID: <pub-id pub-id-type="pmid">36419976</pub-id></citation>
</ref>
<ref id="ref11">
<label>11.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Punithavathi</surname> <given-names>R</given-names></name> <name><surname>Sharmila</surname> <given-names>M</given-names></name> <name><surname>Avudaiappan</surname> <given-names>T</given-names></name> <name><surname>Raj</surname> <given-names>I</given-names></name> <name><surname>Kanchana</surname> <given-names>S</given-names></name> <name><surname>Mamo</surname> <given-names>SA</given-names></name></person-group>. <article-title>Empirical investigation for predicting depression from different machine learning based voice recognition techniques</article-title>. <source>Evid Based Complement Alternat Med</source>. (<year>2022</year>) <volume>2022</volume>:<fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1155/2022/6395860</pub-id></citation>
</ref>
<ref id="ref12">
<label>12.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Lin</surname> <given-names>L.</given-names></name></person-group> Automatic depression detection: an emotional audio-textual corpus and a GRU/BiLSTM-based model. In ICASSP 2022&#x2013;2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), (<year>2022</year>), 6247&#x2013;6251.</citation>
</ref>
<ref id="ref13">
<label>13.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A</given-names></name> <name><surname>Shazeer</surname> <given-names>N</given-names></name> <name><surname>Parmar</surname> <given-names>N</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J</given-names></name> <name><surname>Jones</surname> <given-names>L</given-names></name> <name><surname>Gomez</surname> <given-names>AN</given-names></name> <etal/></person-group>. <article-title>Attention is all you need</article-title>. <source>Adv Neural Inf Proces Syst</source>. (<year>2017</year>) <volume>30</volume>:<fpage>15</fpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></citation>
</ref>
<ref id="ref14">
<label>14.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>He</surname> <given-names>Y</given-names></name> <name><surname>Rong</surname> <given-names>L</given-names></name> <name><surname>Ding</surname> <given-names>Y</given-names></name></person-group> A hybrid model for depression detection with transformer and bi-directional long short-term memory. In: <italic>2022 IEEE international conference on bioinformatics and biomedicine (BIBM)</italic>. IEEE, (<year>2022</year>): 2727&#x2013;2734.</citation>
</ref>
<ref id="ref15">
<label>15.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Devlin</surname> <given-names>J.</given-names></name> <name><surname>Chang</surname> <given-names>M. W.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Toutanova</surname> <given-names>K.</given-names></name></person-group> Bert: Pre-training of deep bidirectional transformers for language understanding. (<year>2018</year>). arXiv [Preprint]. arXiv: 1810.04805.</citation>
</ref>
<ref id="ref16">
<label>16.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Baevski</surname> <given-names>A.</given-names></name> <name><surname>Zhou</surname> <given-names>H.</given-names></name> <name><surname>Mohamed</surname> <given-names>A.</given-names></name> <name><surname>Auli</surname> <given-names>M.</given-names></name></person-group> wav2vec 2.0: A framework for self-supervised learning of speech representations. In: <italic>Proc. NeurIPS</italic>, (<year>2020</year>).</citation>
</ref>
<ref id="ref17">
<label>17.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Bann&#x00F2;</surname> <given-names>S.</given-names></name> <name><surname>Matassoni</surname> <given-names>M.</given-names></name></person-group>, Proficiency assessment of L2 spoken English using Wav2Vec 2.0. In: <italic>2022 IEEE Spoken Language Technology Workshop (SLT), Doha, Qatar</italic>, (<year>2023</year>), pp. 1088&#x2013;1095.</citation>
</ref>
<ref id="ref18">
<label>18.</label>
<citation citation-type="other"><person-group person-group-type="author">
<collab id="coll2">TencentGames</collab>
</person-group>. <comment>Available at:</comment> <ext-link xlink:href="https://github.com/TencentGameMate/chinese_speech_pretrain" ext-link-type="uri">https://github.com/TencentGameMate/chinese_speech_pretrain</ext-link> (<year>2022</year>)</citation>
</ref>
<ref id="ref19">
<label>19.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hirschfeld</surname> <given-names>RM</given-names></name> <name><surname>Williams</surname> <given-names>JB</given-names></name> <name><surname>Spitzer</surname> <given-names>RL</given-names></name> <name><surname>Calabrese</surname> <given-names>JR</given-names></name> <name><surname>Flynn</surname> <given-names>L</given-names></name> <name><surname>Keck</surname> <given-names>PE</given-names> <suffix>Jr</suffix></name> <etal/></person-group>. <article-title>Development and validation of a screening instrument for bipolar spectrum disorder: the mood disorder questionnaire</article-title>. <source>Am J Psychiatry</source>. (<year>2000</year>) <volume>157</volume>:<fpage>1873</fpage>&#x2013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1176/appi.ajp.157.11.1873</pub-id>, PMID: <pub-id pub-id-type="pmid">11058490</pub-id></citation>
</ref>
<ref id="ref20">
<label>20.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rush</surname> <given-names>AJ</given-names></name> <name><surname>Bernstein</surname> <given-names>IH</given-names></name> <name><surname>Trivedi</surname> <given-names>MH</given-names></name> <name><surname>Carmody</surname> <given-names>TJ</given-names></name> <name><surname>Wisniewski</surname> <given-names>S</given-names></name> <name><surname>Mundt</surname> <given-names>JC</given-names></name> <etal/></person-group>. <article-title>An evaluation of the quick inventory of depressive symptomatology and the Hamilton rating scale for depression: a sequenced treatment alternatives to relieve depression trial report</article-title>. <source>Biol Psychiatry</source>. (<year>2006</year>) <volume>59</volume>:<fpage>493</fpage>&#x2013;<lpage>501</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.biopsych.2005.08.022</pub-id>, PMID: <pub-id pub-id-type="pmid">16199008</pub-id></citation>
</ref>
<ref id="ref21">
<label>21.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Young</surname> <given-names>RC</given-names></name> <name><surname>Biggs</surname> <given-names>JT</given-names></name> <name><surname>Ziegler</surname> <given-names>VE</given-names></name> <name><surname>Meyer</surname> <given-names>DA</given-names></name></person-group>. <article-title>A rating scale for mania: reliability, validity and sensitivity</article-title>. <source>Br J Psychiatry</source>. (<year>1978</year>) <volume>133</volume>:<fpage>429</fpage>&#x2013;<lpage>35</lpage>. doi: <pub-id pub-id-type="doi">10.1192/bjp.133.5.429</pub-id></citation>
</ref>
<ref id="ref22">
<label>22.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Al-Anzi</surname> <given-names>FS</given-names></name> <name><surname>Abu</surname> <given-names>ZD</given-names></name></person-group>. <article-title>The capacity of mel frequency cepstral coefficients for speech recognition</article-title>. <source>Int J Comput Informat Eng</source>. (<year>2017</year>) <volume>11</volume>:<fpage>1149</fpage>&#x2013;<lpage>53</lpage>. doi: <pub-id pub-id-type="doi">10.5281/zenodo.1132455</pub-id></citation>
</ref>
<ref id="ref23">
<label>23.</label>
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Giannakopoulos</surname> <given-names>T</given-names></name>
</person-group>. <article-title>Pyaudioanalysis: an open-source python library for audio signal analysis</article-title>. <source>PLoS One</source>. (<year>2015</year>) <volume>10</volume>:<fpage>e0144610</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0144610</pub-id>, PMID: <pub-id pub-id-type="pmid">26656189</pub-id></citation>
</ref>
<ref id="ref24">
<label>24.</label>
<citation citation-type="other"><person-group person-group-type="author">
<name><surname>Polikar</surname> <given-names>R.</given-names></name>
</person-group> Ensemble based systems in decision making. In: <italic>IEEE Circuits and Systems Magazine</italic>, (<year>2006</year>), 6, 21&#x2013;45.</citation>
</ref>
<ref id="ref25">
<label>25.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hermans</surname> <given-names>M</given-names></name> <name><surname>Schrauwen</surname> <given-names>B</given-names></name></person-group>. <article-title>Training and analysing deep recurrent neural networks</article-title>. <source>Adv Neural Inf Proces Syst</source>. (<year>2013</year>) <volume>26</volume>:<fpage>633</fpage>. doi: <pub-id pub-id-type="doi">10.5555/2999611.2999633</pub-id></citation>
</ref>
<ref id="ref26">
<label>26.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gers</surname> <given-names>FA</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J</given-names></name> <name><surname>Cummins</surname> <given-names>F</given-names></name></person-group>. <article-title>Learning to forget: continual prediction with LSTM</article-title>. <source>Neural Comput</source>. (<year>2000</year>) <volume>12</volume>:<fpage>2451</fpage>&#x2013;<lpage>71</lpage>. doi: <pub-id pub-id-type="doi">10.1162/089976600300015015</pub-id>, PMID: <pub-id pub-id-type="pmid">11032042</pub-id></citation>
</ref>
<ref id="ref27">
<label>27.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ravanelli</surname> <given-names>M</given-names></name> <name><surname>Brakel</surname> <given-names>P</given-names></name> <name><surname>Omologo</surname> <given-names>M</given-names></name> <name><surname>Bengio</surname> <given-names>Y</given-names></name></person-group>. <article-title>Light gated recurrent units for speech recognition</article-title>. <source>IEEE Trans Emerg Top Comput Intell</source>. (<year>2018</year>) <volume>2</volume>:<fpage>92</fpage>&#x2013;<lpage>102</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TETCI.2017.2762739</pub-id></citation>
</ref>
<ref id="ref28">
<label>28.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>G</given-names></name> <name><surname>Guo</surname> <given-names>J</given-names></name></person-group>. <article-title>Bidirectional LSTM with attention mechanism and convolutional layer for text classification</article-title>. <source>Neurocomputing</source>. (<year>2019</year>) <volume>337</volume>:<fpage>325</fpage>&#x2013;<lpage>38</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2019.01.078</pub-id></citation>
</ref>
<ref id="ref29">
<label>29.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Ott</surname> <given-names>M.</given-names></name> <name><surname>Edunov</surname> <given-names>S.</given-names></name> <name><surname>Baevski</surname> <given-names>A.</given-names></name> <name><surname>Fan</surname> <given-names>A.</given-names></name> <name><surname>Gross</surname> <given-names>S.</given-names></name> <name><surname>Ng</surname> <given-names>N.</given-names></name> <etal/></person-group>. Fairseq: a fast, extensible toolkit for sequence modeling. In: <italic>Proceedings NAACL</italic>, (<year>2019</year>).</citation>
</ref>
<ref id="ref30">
<label>30.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>B.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Guo</surname> <given-names>P.</given-names></name> <name><surname>Shao</surname> <given-names>Q.</given-names></name> <name><surname>Yang</surname> <given-names>C.</given-names></name> <name><surname>Xie</surname> <given-names>L.</given-names></name> <etal/></person-group>. WenetSpeech: A 10000+ hours multi-domain mandarin corpus for speech recognition. In: <italic>Proceedings ICASSP</italic>, (<year>2021</year>).</citation>
</ref>
<ref id="ref31">
<label>31.</label>
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Fisher</surname> <given-names>RA</given-names></name>
</person-group>. <article-title>The use of multiple measurements in taxonomic problems</article-title>. <source>Ann Eugenics</source>. (<year>1936</year>) <volume>7</volume>:<fpage>179</fpage>&#x2013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1111/j.1469-1809.1936.tb02137.x</pub-id></citation>
</ref>
<ref id="ref32">
<label>32.</label>
<citation citation-type="book"><person-group person-group-type="author">
<name><surname>Chaonan</surname> <given-names>F</given-names></name>
</person-group>. <source>Optimization and application of psychiatric scale tools based on machine learning</source> <publisher-name>Qingdao University</publisher-name> (<year>2020</year>). This is a dissertation that can be found on CNKI (China National Knowledge Infrastructure).</citation>
</ref>
<ref id="ref33">
<label>33.</label>
<citation citation-type="other"><person-group person-group-type="author">
<collab id="coll3">PyTorch</collab>
</person-group>. <comment>Available at:</comment> <ext-link xlink:href="https://pytorch.org" ext-link-type="uri">https://pytorch.org</ext-link> (<year>2017</year>)</citation>
</ref>
<ref id="ref34">
<label>34.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shuhong</surname> <given-names>Q</given-names></name> <name><surname>Ying</surname> <given-names>C</given-names></name> <name><surname>Shiwei</surname> <given-names>H</given-names></name></person-group>. <article-title>Relationship among depressive symptoms, neuroticism and network social activities in college students</article-title>. <source>Chin Ment Health J</source>. (<year>2019</year>) <volume>33</volume>:<fpage>932</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.3969/j.issn.1000-6729.2019.12.009</pub-id></citation>
</ref>
<ref id="ref35">
<label>35.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bonetti</surname> <given-names>L</given-names></name> <name><surname>Campbell</surname> <given-names>MA</given-names></name> <name><surname>Gilmore</surname> <given-names>L</given-names></name></person-group>. <article-title>The relationship of loneliness and social anxiety with children&#x2019;s and adolescents&#x2019; online communication</article-title>. <source>Cyberpsychol Behav Soc Netw</source>. (<year>2010</year>) <volume>13</volume>:<fpage>279</fpage>&#x2013;<lpage>85</lpage>. doi: <pub-id pub-id-type="doi">10.1089/cyber.2009.0215</pub-id></citation>
</ref>
<ref id="ref36">
<label>36.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ye</surname> <given-names>J</given-names></name> <name><surname>Yu</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>Q</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Liang</surname> <given-names>H</given-names></name> <name><surname>Zheng</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Multi-modal depression detection based on emotional audio and evaluation text</article-title>. <source>J Affect Disord</source>. (<year>2021</year>) <volume>295</volume>:<fpage>904</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jad.2021.08.090</pub-id>, PMID: <pub-id pub-id-type="pmid">34706461</pub-id></citation>
</ref>
<ref id="ref37">
<label>37.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>W</given-names></name> <name><surname>Yan</surname> <given-names>L</given-names></name> <name><surname>Wang</surname> <given-names>F</given-names></name></person-group>. <article-title>Two birds with one stone: knowledge-embedded temporal convolutional transformer for depression detection and emotion recognition</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2023</year>) <volume>14</volume>:<fpage>2595</fpage>&#x2013;<lpage>613</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TAFFC.2023.3282704</pub-id></citation>
</ref>
<ref id="ref38">
<label>38.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alghowinem</surname> <given-names>S</given-names></name> <name><surname>Goecke</surname> <given-names>R</given-names></name> <name><surname>Wagner</surname> <given-names>M</given-names></name> <name><surname>Epps</surname> <given-names>J</given-names></name> <name><surname>Hyett</surname> <given-names>M</given-names></name> <name><surname>Parker</surname> <given-names>G</given-names></name> <etal/></person-group>. <article-title>Multimodal depression detection: fusion analysis of paralinguistic, head pose and eye gaze behaviors</article-title>. <source>IEEE Trans Affect Comput</source>. (<year>2018</year>) <volume>9</volume>:<fpage>478</fpage>&#x2013;<lpage>90</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TAFFC.2016.2634527</pub-id></citation>
</ref>
<ref id="ref39">
<label>39.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yatham</surname> <given-names>LN</given-names></name> <name><surname>Kennedy</surname> <given-names>SH</given-names></name> <name><surname>Parikh</surname> <given-names>SV</given-names></name> <name><surname>Schaffer</surname> <given-names>A</given-names></name> <name><surname>Bond</surname> <given-names>DJ</given-names></name> <name><surname>Frey</surname> <given-names>BN</given-names></name> <etal/></person-group>. <article-title>Canadian Network for Mood and Anxiety Treatments (CANMAT) and International Society for Bipolar Disorders (ISBD) 2018 guidelines for the management of patients with bipolar disorder</article-title>. <source>Bipolar Disord</source>. (<year>2018</year>) <volume>20</volume>:<fpage>97</fpage>&#x2013;<lpage>170</lpage>. doi: <pub-id pub-id-type="doi">10.1111/bdi.12609</pub-id></citation>
</ref>
<ref id="ref40">
<label>40.</label>
<citation citation-type="book"><person-group person-group-type="author">
<name><surname>Farouk</surname> <given-names>MH</given-names></name>
</person-group>. <article-title>Speech production and perception</article-title>. In: <source>Application of wavelets in speech processing</source>. <publisher-loc>Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>, (<year>2018</year>), <fpage>5</fpage>&#x2013;<lpage>10</lpage>.</citation>
</ref>
<ref id="ref41">
<label>41.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cummins</surname> <given-names>N</given-names></name> <name><surname>Baird</surname> <given-names>A</given-names></name> <name><surname>Schuller</surname> <given-names>BW</given-names></name></person-group>. <article-title>Speech analysis for health: current state-of-the-art and the increasing impact of deep learning</article-title>. <source>Methods</source>. (<year>2018</year>) <volume>151</volume>:<fpage>41</fpage>&#x2013;<lpage>54</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ymeth.2018.07.007</pub-id>, PMID: <pub-id pub-id-type="pmid">30099083</pub-id></citation>
</ref>
<ref id="ref42">
<label>42.</label>
<citation citation-type="other"><person-group person-group-type="author"><name><surname>Akkaralaertsest</surname> <given-names>T</given-names></name> <name><surname>Yingthawornsuk</surname> <given-names>T</given-names></name></person-group>. Classification of depressed speech samples with spectral energy ratios as depression indicator. In: <italic>Proceedings of the 14th International Joint Symposium on Artificial Intelligence and Natural Language Processing (iSAI-NLP)</italic>, (<year>2019</year>), 12: 1&#x2013;6.</citation>
</ref>
<ref id="ref43">
<label>43.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vogel</surname> <given-names>A</given-names></name> <name><surname>Morgan</surname> <given-names>A</given-names></name></person-group>. <article-title>Factors affecting the quality of sound recording for speech and voice analysis</article-title>. <source>Int J Speech Lang Pathol</source>. (<year>2009</year>) <volume>11</volume>:<fpage>431</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.3109/17549500902822189</pub-id>, PMID: <pub-id pub-id-type="pmid">21271920</pub-id></citation>
</ref>
<ref id="ref44">
<label>44.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Scherer</surname> <given-names>RC</given-names></name> <name><surname>Vail</surname> <given-names>VJ</given-names></name> <name><surname>Guo</surname> <given-names>CG</given-names></name></person-group>. <article-title>Required number of tokens to determine representative voice perturbation values</article-title>. <source>J Speech Hear Res</source>. (<year>1995</year>) <volume>38</volume>:<fpage>1260</fpage>&#x2013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1044/jshr.3806.1260</pub-id>, PMID: <pub-id pub-id-type="pmid">8747819</pub-id></citation>
</ref>
<ref id="ref45">
<label>45.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zraick</surname> <given-names>RI</given-names></name> <name><surname>Birdwell</surname> <given-names>KY</given-names></name> <name><surname>Smith-Olinde</surname> <given-names>L</given-names></name></person-group>. <article-title>The effect of speaking sample duration on determination of habitual pitch</article-title>. <source>J Voice</source>. (<year>2005</year>) <volume>19</volume>:<fpage>197</fpage>&#x2013;<lpage>201</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jvoice.2004.01.010</pub-id>, PMID: <pub-id pub-id-type="pmid">15907434</pub-id></citation>
</ref>
<ref id="ref46">
<label>46.</label>
<citation citation-type="journal"><person-group person-group-type="author">
<name><surname>Almaghrabi</surname> <given-names>SA</given-names></name> <etal/>
</person-group>. <article-title>The reproducibility of bio-acoustic features is associated with sample duration, speech task, and gender</article-title>. <source>IEEE Trans Neural Syst Rehabil Eng</source>. (<year>2022</year>) <volume>30</volume>:<fpage>167</fpage>&#x2013;<lpage>75</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNSRE.2022.3143117</pub-id>, PMID: <pub-id pub-id-type="pmid">35038295</pub-id></citation>
</ref>
</ref-list>
</back>
</article>