<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="brief-report">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychol.</journal-id>
<journal-title>Frontiers in Psychology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychol.</abbrev-journal-title>
<issn pub-type="epub">1664-1078</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyg.2022.841926</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Psychology</subject>
<subj-group>
<subject>Brief Research Report</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>EmotionBox: A music-element-driven emotional music generation system based on music psychology</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Zheng</surname> <given-names>Kaitong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Meng</surname> <given-names>Ruijie</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Zheng</surname> <given-names>Chengshi</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1398035/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Xiaodong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Sang</surname> <given-names>Jinqiu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1184300/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Cai</surname> <given-names>Juanjuan</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Jie</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1669733/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Xiao</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Key Laboratory of Noise and Vibration Research, Institute of Acoustics, Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>University of Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>State Key Laboratory of Media Convergence and Communication, Communication University of China</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>School of Electronics and Communication Engineering, Guangzhou University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>School of Humanities and Management, Southwest Medical University</institution>, <addr-line>Luzhou</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Robert J. Zatorre, McGill University, Canada</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Kazuma Mori, McGill University, Canada; Vesa Putkinen, Turku PET Centre, Finland</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Jinqiu Sang <email>sangjinqiu&#x00040;mail.ioa.ac.cn</email></corresp>
<corresp id="c002">Xiao Wang <email>wangxiao&#x00040;swmu.edu.cn</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Auditory Cognitive Neuroscience, a section of the journal Frontiers in Psychology</p></fn></author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>08</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>841926</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>12</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>08</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2022 Zheng, Meng, Zheng, Li, Sang, Cai, Wang and Wang.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Zheng, Meng, Zheng, Li, Sang, Cai, Wang and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license> </permissions>
<abstract>
<p>With the development of deep neural networks, automatic music composition has made great progress. Although emotional music can evoke listeners&#x00027; different auditory perceptions, only a few research studies have focused on generating emotional music. This paper presents EmotionBox&#x02014;a music-element-driven emotional music generator based on music psychology that is capable of composing music given a specific emotion, while this model does not require a music dataset labeled with emotions as previous methods do. In this work, pitch histogram and note density are extracted as features that represent mode and tempo, respectively, to control music emotions. The specific emotions are mapped from these features through Russell&#x00027;s psychology model. The subjective listening tests show that EmotionBox has a competitive performance in generating different emotional music and significantly better performance in generating music with low arousal emotions, especially peaceful emotion, compared with the emotion-label-based method.</p></abstract>
<kwd-group>
<kwd>emotional music generation</kwd>
<kwd>deep neural networks</kwd>
<kwd>auditory perceptions</kwd>
<kwd>music psychology</kwd>
<kwd>music element</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="5"/>
<equation-count count="0"/>
<ref-count count="45"/>
<page-count count="12"/>
<word-count count="7046"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>Introduction</title>
<p>Computational modeling of polyphonic music has been deeply studied for decades (Westergaard et al., <xref ref-type="bibr" rid="B42">1959</xref>). Recently, with the development of deep learning, neural network systems for automatic music generation have made great progress on the quality and coherence of music (Herremans et al., <xref ref-type="bibr" rid="B17">2017</xref>; Herremans and Chew, <xref ref-type="bibr" rid="B16">2019</xref>; Jin et al., <xref ref-type="bibr" rid="B24">2020</xref>). As we know, emotion is of great importance in music since the music consistently elicits auditory responses from its listeners (Raynor and Meyer, <xref ref-type="bibr" rid="B32">1958</xref>). Therefore, emotional music has significant implications for subjects such as music psychology, music composition, and performance. However, surprisingly, automatic systems rarely consider emotion when generating music, leaving them unable to generate music that evokes a specific auditory response.</p>
<p>To study the automatic music generation with music psychology, it is necessary to review the relation between music emotions and music elements. As mentioned by Parncutt (<xref ref-type="bibr" rid="B30">2014</xref>), the relationship in Western tonal music between emotional valence (positive vs. negative) and music-structural factors, such as tempo (fast vs. slow) and mode (major vs. minor tonality), have been studied. Experimental results have illustrated that a fast tempo tends to make music sound happy while slow tempo has the opposite effect (Rigg, <xref ref-type="bibr" rid="B33">1940</xref>). In typical tonal musical excerpts, the experimental result showed that tempo was more determinant than the mode in forming happy-sad judgments (Gagnon and Peretz, <xref ref-type="bibr" rid="B12">2003</xref>). Many experiments have demonstrated that musical excerpts written in the major or minor mode were judged to be positive or negative, respectively (Hevner, <xref ref-type="bibr" rid="B18">1935</xref>, <xref ref-type="bibr" rid="B19">1936</xref>). Recent psychological studies have shown that the happiness ratings were elevated for fast-tempo and major-key stimuli while sadness ratings were elevated for slow tempo and minor-key stimuli (Hunter et al., <xref ref-type="bibr" rid="B21">2008</xref>, <xref ref-type="bibr" rid="B22">2010</xref>). Another study has revealed that mode and tempo were the most impactful cues in shaping emotions while sadness and joy were among the most accurately recognized emotions (Micallef Grimaud and Eerola, <xref ref-type="bibr" rid="B28">2022</xref>). The effect of cues on emotions in music as combinations of multiple cues rather than as individual cues has also been discussed, as mixed cues might portray a complicated emotion.</p>
<p>Most previous emotional music generation models were based on emotion labels (Ferreira and Whitehead, <xref ref-type="bibr" rid="B11">2019</xref>; Zhao et al., <xref ref-type="bibr" rid="B45">2019</xref>; Ferreira et al., <xref ref-type="bibr" rid="B10">2020</xref>), without taking into consideration the effect of music psychology. Moreover, label-based methods require a huge music dataset labeled with different emotions, which requires a lot of tedious work. Utilizing music psychology instead of the manual labels to train the emotional music generator and exploring the most suitable music elements for evoking the specific emotion are the main focuses in this paper.</p>
<p>In this work, we extract two features from two music elements (i.e., tempo and mode) to supervise the deep neural network for generating music with a specific emotion. To the best of our knowledge, this is the first music-element-driven emotional symbolic music generation system based on a deep neural network.</p>
</sec>
<sec id="s2">
<title>Related work</title>
<p>Currently, deep learning algorithms have become mainstream methods in the field of music generation research. Music generation can be classified into two types: symbol domain generation (i.e., generating MIDIs or piano sheets Yang et al., <xref ref-type="bibr" rid="B43">2017</xref>; Dong et al., <xref ref-type="bibr" rid="B6">2018</xref>) and audio domain generation (i.e., directly generating sound waves van den Oord et al., <xref ref-type="bibr" rid="B39">2016</xref>; Schimbinschi et al., <xref ref-type="bibr" rid="B35">2019</xref>; Subramani et al., <xref ref-type="bibr" rid="B36">2020</xref>).</p>
<p>Recurrent Neural Network (RNN) or its variants have been widely used to model sequential data. Its outstanding temporal modeling ability makes it suitable for music generation. The first attempt was made by Todd (<xref ref-type="bibr" rid="B37">1989</xref>), who used an RNN to generate monophonic melodies. To solve the gradient vanishing problem of RNN, Eck et al. proposed an LSTM-based model in music generation for the first time (Eck and Schmidhuber, <xref ref-type="bibr" rid="B7">2002</xref>). In Boulanger-Lewandowski et al. (<xref ref-type="bibr" rid="B1">2012</xref>), RNN combined with Restricted Boltzmann Machines was proposed to model polyphonic music, which is superior to the traditional model in various datasets. In 2016, the Magenta team proposed the Melody RNN model which can generate long-term structures in songs (Waite, <xref ref-type="bibr" rid="B41">2016</xref>). In 2017, Anticipation-RNN (Hadjeres and Nielsen, <xref ref-type="bibr" rid="B14">2017</xref>) was used to generate music interactively with positional constraints. Moreover, Bi-axial LSTM (BALSTM) (Johnson, <xref ref-type="bibr" rid="B25">2017</xref>) proposed by Johnson et al. is capable of generating polyphonic music while preserving translation invariance of the dataset. Recently, more advanced deep generative models, such as VAE (Hadjeres and Nielsen, <xref ref-type="bibr" rid="B14">2017</xref>; Brunner et al., <xref ref-type="bibr" rid="B3">2018</xref>), GAN (Guan et al., <xref ref-type="bibr" rid="B13">2019</xref>; Huang et al., <xref ref-type="bibr" rid="B20">2019</xref>), and Transformer (Huang et al., <xref ref-type="bibr" rid="B20">2019</xref>; Zhang, <xref ref-type="bibr" rid="B44">2020</xref>), have gradually been used in music generation.</p>
<p>The expressive generation has long been explored in the field of computer music, reviewed in Kirke and Miranda (<xref ref-type="bibr" rid="B26">2009</xref>). With the development of deep learning, there have been several previous attempts to generate emotional music based on deep neural networks. Ferreira et al. proposed a multiplicative long short-term memory (mLSTM) based model that can be directed to compose music with a specific emotion and analyze music emotions (Ferreira and Whitehead, <xref ref-type="bibr" rid="B11">2019</xref>). mLSTM is an RNN architecture for sequence modeling that combines the factorized hidden-to-hidden transition of multiplicative RNN with the gating framework from the LSTM. However, only video game soundtracks are used in training and evaluation. In 2019, Zhao et al. extended the BALSTM network proposed in Mao (<xref ref-type="bibr" rid="B27">2018</xref>) and used the model in emotional music generation (Zhao et al., <xref ref-type="bibr" rid="B45">2019</xref>). Recently, Ferreira et al. proposed a system called Bardo Composer, which generates music with different emotions for tabletop role-playing games based on the mood of players (Ferreira et al., <xref ref-type="bibr" rid="B10">2020</xref>). However, all methods mentioned above are label-based; thus, a large dataset labeled with emotions is needed. Moreover, to the best of our knowledge, no MIDI dataset labeled with emotion is available online. Labeling the dataset manually takes a lot of time and effort. In our work, we train the model on an open-source MIDI dataset without emotion labels.</p>
</sec>
<sec id="s3">
<title>Data preprocessing</title>
<sec>
<title>Note representation</title>
<p>The input of our proposed generation model consists of polyphonic MIDI files, which are composed of both melody and accompaniment. To present notes with expressive timing and dynamics, we use the performance encoding proposed in Oore et al. (<xref ref-type="bibr" rid="B29">2020</xref>), which consists of a vocabulary of NOTE-ON, NOTE-OFF, TIME-SHIFT, and VELOCITY events. The main purpose of encoding is to transform the music information in MIDI files into a suitable presentation for training the neural network.</p>
<p>The pitch information in MIDI files ranges from 0 to 127, which is beyond the pitch range of a piano. In our work, pieces in the training set are all performed by piano. Thus, the pitch range is only presented from 21 to 108, which corresponds to A0 and C8 on piano, respectively. For each note, music dynamics is recorded in MIDI files, ranging from 0 to 127 to present how loud a note is. For convenience, we use velocity ranges from 0 to 32 to convey the dynamics. The range can be mapped from 0 to 127 when generating MIDI files.</p>
<p>Finally, a MIDI excerpt is represented as a sequence of events from the following vocabulary of 240 different events:</p>
<list list-type="bullet">
<list-item><p>88 NOTE-ON events: one for each of the 88 (21-108) MIDI pitches. Each event starts a new note.</p></list-item>
<list-item><p>88 NOTE-OFF events: one for each of the 88 (21-108) MIDI pitches. Each event releases a note.</p></list-item>
<list-item><p>32 TIME-SHIFT events: each event moves the time step forward by increments of 15 ms up to 1 s.</p></list-item>
<list-item><p>32 VELOCITY events: each event changes the velocity applied to all upcoming notes.</p></list-item>
</list>
</sec>
<sec>
<title>Feature extraction</title>
<p>In this work, the model is fed with two extracted musical features, namely pitch histogram and note density. All these calculations are done automatically by computers and thus no human labor is required. A pitch histogram (Tzanetakis et al., <xref ref-type="bibr" rid="B38">2003</xref>) is an array of 12 integer values indexed by 12 semitones in a chromatic scale, showing the frequency of occurrence of each semitone in a music piece. An example of a pitch histogram in C major is shown in <xref ref-type="table" rid="T1">Table 1</xref>. According to music theory, notes with a sharp sign are not included in C major. Therefore, in this work, we set their corresponding value in pitch histogram as 0 so that they will never be played in a C major piece. C, F, and G are the tonic, subdominant, and dominant in C major, respectively. They are the main elements in a C major piece so their corresponding value in pitch histogram is set as 2, which means the probability of starting these notes is twice that of other notes in C major. Pitch histograms can capture musical information regarding harmonic features of different scales.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>An example of a pitch histogram in a C major scale.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Pitch name</bold></th>
<th valign="top" align="center"><bold>C</bold></th>
<th valign="top" align="center"><bold>C<xref ref-type="table-fn" rid="TN1"><sup>&#x0266F;</sup></xref></bold></th>
<th valign="top" align="center"><bold>D</bold></th>
<th valign="top" align="center"><bold>D<xref ref-type="table-fn" rid="TN1"><sup>&#x0266F;</sup></xref></bold></th>
<th valign="top" align="center"><bold>E</bold></th>
<th valign="top" align="center"><bold>F</bold></th>
<th valign="top" align="center"><bold>F<xref ref-type="table-fn" rid="TN1"><sup>&#x0266F;</sup></xref></bold></th>
<th valign="top" align="center"><bold>G</bold></th>
<th valign="top" align="center"><bold>G<xref ref-type="table-fn" rid="TN1"><sup>&#x0266F;</sup></xref></bold></th>
<th valign="top" align="center"><bold>A</bold></th>
<th valign="top" align="center"><bold>A<xref ref-type="table-fn" rid="TN1"><sup>&#x0266F;</sup></xref></bold></th>
<th valign="top" align="center"><bold>B</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Pitch histogram</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">Probability distribution</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0.1</td>
</tr>
</tbody>
</table><table-wrap-foot>
<fn id="TN1">
<label>&#x0266F;</label>
<p>Means higher in pitch by one semitone.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Note density is a number to record how many notes will be played within a time window (2 s in our work). Note density can present the speed information in each part of a music piece. Note density and pitch histogram are calculated at each time step.</p>
<p>The motivation for this is that we can explicitly choose a pitch histogram and note density when creating samples, which provides us with two options to control the music generation. By changing the pitch histogram and note density, we can therefore alter the mode and tempo of the music, which ultimately leads to emotional difference.</p>
</sec>
<sec>
<title>Russell emotion model</title>
<p>There are various models for describing emotion and they can be mainly divided into four categories: discrete, dimensional, miscellaneous, and music-specific models (Eerola and Vuoskoski, <xref ref-type="bibr" rid="B8">2012</xref>). This work is based on the simplified emotion model of Russell (<xref ref-type="bibr" rid="B34">1980</xref>). Russell&#x00027;s circumplex model is a typical dimensional model, which uses two coordinate axes to present the degree of valence and arousal, respectively. This emotion model is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. For simplicity, we only use four basic emotions as shown in four quadrants. Our model is designed to generate music with these four basic emotions, namely happy, tensional, sad, and peaceful. The four emotions are located in four different quadrants, presenting four varying degrees of valence and arousal.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Simplified Russell&#x00027;s two-dimensional valence-arousal emotion space. The x-axis denotes valence while the y-axis denotes arousal.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-g0001.tif"/>
</fig>
</sec>
<sec>
<title>Emotion presentation</title>
<p>As we have mentioned in the introduction, there is a strong connection between music elements and music emotional valence. Therefore, we combine note density and pitch histogram to control the tempo and mode of the generated sample. According to twelve-tone equal temperament, an octave is divided into 12 parts, all of which are equal on a logarithmic scale. So, we can choose the mode when generating music by changing the probability of each semitone. We use an array containing 12 integers to present a pitch histogram. For example, C major is presented as [2, 0, 1, 0, 1, 2, 0, 2, 0, 1, 0, 1] where 2 presents the tonic, subdominant, and dominant while 1 presents other notes in the scale. Pitch histogram of C minor is presented as [2, 0, 1, 1, 0, 2, 0, 2, 1, 0, 1, 0] according to music theory. A pitch histogram is used to control the valence of music.</p>
<p>Note density indicates the number of notes that will be performed within 2 s (the time window is adjustable). We set note density as 1 to present slow music and note density as 5 to present fast music. Note density is used to control the arousal of music. Combining mode and note density as two adjustable parameters, we aim to generate four categories of emotional music: happy (with the major scale and fast tempo), tensional (with the minor scale and fast tempo), peaceful (with the major scale and slow tempo), and sad (with the minor scale and slow tempo).</p>
</sec>
</sec>
<sec id="s4">
<title>Method</title>
<sec>
<title>Neural network architecture</title>
<p>A recurrent neural network has an excellent performance in modeling sequential data. A gated recurrent unit (GRU) (Cho et al., <xref ref-type="bibr" rid="B4">2014</xref>) is an improved version of the standard RNN. It was proposed to solve the vanishing gradient problem of a standard recurrent neural network during backpropagation. The gating mechanism enables GRU to carry information from earlier time steps to later ones. The illustration of GRU is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. In our work, GRU is used for temporal modeling.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>The illustration of gated recurrent units (GRU). <italic>x</italic><sub><italic>i</italic></sub> and <italic>y</italic><sub><italic>i</italic></sub> denote the current input and output of GRU, <italic>h</italic><sub><italic>i</italic>&#x02212;1</sub> and <italic>h</italic><sub><italic>i</italic></sub> are the last hidden information and current hidden information, <italic>r</italic><sub><italic>i</italic></sub> and <italic>z</italic><sub><italic>i</italic></sub> are the reset and update gates. A GRU network is formed from a series of GRUs.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-g0002.tif"/>
</fig>
<p>The model is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. Input X represents the masked performance events while Input Z represents the pitch histogram and the note density. Masking means the last event of each event sequence is dropped out and the rest part of the event sequence is sent to the neural network as the input. The reason for this is to make the model generate the unmasked sequence recursively. Then, we can calculate the loss, i.e., the difference, between the generated unmasked sequence and ground truth. If the length of an event sequence is <italic>T</italic>, the size of Input X (i.e., the masked performance events) will be (<italic>T</italic>&#x02212;1) &#x000D7; 1. Each performance event is converted to a 240-dimension vector by a 240 &#x000D7; 240 embedding layer. The 240-dimension vector was chosen for convenience. The pitch histogram is a (<italic>T</italic>&#x02212;1) &#x000D7; 12 vector and note density is converted to a (<italic>T</italic>&#x02212;1) &#x000D7; 12 one-hot vector. A (<italic>T</italic>&#x02212;1) &#x000D7; 1 zero vector is used to increase the stability of the neural network. Therefore, the size of input Z is (<italic>T</italic>&#x02212;1) &#x000D7; 25.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Diagram of the EmotionBox model architecture. &#x0201C;Input X&#x0201D; denotes a sequence of events and &#x0201C;Input Z&#x0201D; denotes the pitch histogram and note density.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-g0003.tif"/>
</fig>
<p>The pitch histogram and note density are then concatenated with the 240-dimension vector. The size of the concatenated vector is (<italic>T</italic>&#x02212;1) &#x000D7; 265. The concatenated input is fed into a 265 &#x000D7; 512 fully connected layer and a rectified linear unit (ReLU) activation function. Then, this (<italic>T</italic>&#x02212;1) &#x000D7; 512 vector is sent into a three-layer, 512-unit GRU, with a 0.3 dropout applied after each of the first two GRU layers. The GRU output is then fed to a 240-unit linear layer. The output of the neural network is a <italic>T</italic>&#x000D7; 240 vector. The output presents the probability of each event at each time step. The cross-entropy loss between the generated sequence and the unmasked event sequence, namely, the ground truth, is then calculated. The code of this work has been open-sourced on GitHub<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref>.</p>
</sec>
<sec>
<title>Emotional music generation</title>
<p>At the generating stage, we generate samples with different emotions by specifying a particular pitch histogram and note density. When the model generates music, the first event will be randomly selected. The first event, pitch histogram, and note density are sent to the model to create new events recursively. The output of our model is the probability of 240 events. If we use greedy sampling to select an event with the largest probability, the sample may end up with some partial repetition, which means a small part of the music may repeat again and again. Therefore, we combine greedy sampling with stochastic sampling. We select a threshold ranging from 0 to 1. Whenever a new event is sampled, we produce a random number ranging from 0 to 1. If the random number is larger than the threshold, this event will be sampled using the greedy algorithm, which means selecting an event with the largest probability. If not, this event will be sampled based on the probability of each event, which produces a lot of uncertainty.</p>
<p>When generating a new piece of emotional music, we can use temperature (He et al., <xref ref-type="bibr" rid="B15">2018</xref>) to alter the degree of uncertainty. Temperature is a hyperparameter used to control the randomness of predictions by scaling the logits before applying softmax. Lower temperature results in more predictable events, while higher temperature results in more surprising events. The temperature parameter is manually tuned by listening to the generated music. If the music is too random, the temperature will be turned down. If the music is too repetitive, the temperature will be turned up.</p>
</sec>
</sec>
<sec id="s5">
<title>Experiment</title>
<sec>
<title>Dataset</title>
<p>We selected a widely used dataset, piano-midi<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref>, to train our model. It includes 329 piano pieces from 23 classical composers. Each piece is a MIDI file capturing a classical piano performance with expressive dynamics and timing. The dataset is highly homogeneous because all of the pieces in it are classical music, and the solo instrument is consistently piano. The authors in Zhao et al. (<xref ref-type="bibr" rid="B45">2019</xref>) labeled this dataset with four basic emotions mentioned above (i.e., happy, tensional, peaceful, and sad) manually to train their label-based automatic emotional music generator. For the comparison experiment, we also used this emotion-labeled dataset with the permission of the authors to train a label-based model. The Pretty-Midi package was used to extract the note information from the MIDI files (Raffel and Ellis, <xref ref-type="bibr" rid="B31">2014</xref>).</p>
</sec>
<sec>
<title>Training</title>
<p>At the training stage, the whole sequence of events is cut into 200-event-wide event sequences. The stride of event sequences is 10 events. The network was trained using the ADAM optimizer with a loss function of cross-entropy loss between the predicted event and the ground truth event. We used a learning rate of 0.0002, and the model was trained for 100 epochs with a batch size of 64. We implemented our models in PyTorch.</p>
</sec>
<sec>
<title>Comparison</title>
<p>We implement a label-based model for comparison as all previous emotional music generation models were based on emotion labels (Ferreira and Whitehead, <xref ref-type="bibr" rid="B11">2019</xref>; Zhao et al., <xref ref-type="bibr" rid="B45">2019</xref>). In order to evaluate the performance between our proposed method and the labeled-based method, the structure of the label-based model remains unchanged except that the inputs Z of the model are substituted with emotion labels. One-hot coding is used to present four basic emotions. The neural network is trained to learn the mapping between music emotions and well-classified emotion labels. In the generation stage, the label-based model takes the emotion label as input.</p>
</sec>
</sec>
<sec id="s6">
<title>Results and discussion</title>
<p>To evaluate the performance of music generation given a specific emotion, a subjective listening test study was carried out to compare our proposed method with the label-based method. Similar to the subjective listening test for analyzing different styles of classification, three 6-s long music samples were provided for each emotion and each model<xref ref-type="fn" rid="fn0003"><sup>3</sup></xref>. The total amount of music samples was 24 (3 samples &#x000D7; 4 emotions &#x000D7; 2 models). The samples were randomly selected and shuffled. <xref ref-type="table" rid="T2">Table 2</xref> shows the average note density of the experimental stimuli. Twenty-six subjects took part in the test. For each sample, participants were asked which emotion was observed in the sample. They had to choose one option from happy, peaceful, sad, and tensional. It is a little difficult for untrained participants to classify the music&#x00027;s emotion. Therefore, we provided a warming-up stage by playing four manually selected emotional music samples with their corresponding emotional labels. During the listening test, samples could be stopped and replayed to make sure the participants could hear the music clearly.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>The average note density of the experimental stimuli.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th valign="top" align="center"><bold>EmotionBox</bold></th>
<th valign="top" align="center"><bold>Label-based method</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Happy</td>
<td valign="top" align="center">18.03</td>
<td valign="top" align="center">20.54</td>
</tr>
<tr>
<td valign="top" align="left">Tensional</td>
<td valign="top" align="center">17.23</td>
<td valign="top" align="center">32.39</td>
</tr>
<tr>
<td valign="top" align="left">Sad</td>
<td valign="top" align="center">6.24</td>
<td valign="top" align="center">12.06</td>
</tr>
<tr>
<td valign="top" align="left">Peaceful</td>
<td valign="top" align="center">6.29</td>
<td valign="top" align="center">14.41</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec>
<title>Emotion classification</title>
<p>In this section, we calculated the accuracy of emotion classification for each of the four emotions and two methods. The statistical results are shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. <xref ref-type="fig" rid="F4">Figure 4</xref> shows that our proposed model, without a database labeled with emotions, has comparable performance to the label-based model in terms of emotion classification accuracy. Among the four kinds of emotion, the results indicate that the music samples with tensional and happy emotions were correctly recognized with the highest accuracy for both methods. These observations can be explained by an emotion psychology study that showed that valence can be distinguished more easily by high-arousal stimuli (Bradley et al., <xref ref-type="bibr" rid="B2">2001</xref>). The proposed method outperforms the label-based method on peaceful and sad samples, which greatly overcomes the shortcomings of the label-based method and yields a more balanced result. A two-way ANOVA is used with emotion (happy, sad, tensional, peaceful) and model (EmotionBox, label-based) set as within-subject factors to investigate how these two factors, in combination, affect the accuracy of subjective experiments. For each subject, the accuracy of emotion classification was calculated for each emotion and model. The classification accuracy was calculated by dividing the number of samples that were correctly recognized by the number of samples tested for each emotion and model (3 tested samples for each emotion and model). The statistical results show that model [<italic>F</italic><sub>(1, 25)</sub> = 0.603, <italic>p</italic> = 0.445, partial &#x003B7;<sup>2</sup> = 0.024] has no significant effect while emotion [<italic>F</italic><sub>(3, 75)</sub> = 15.115, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> = 0.377] has a significant effect on the accuracy of subjective experiments. 
For the interaction of model and emotion, Mauchly&#x00027;s test of sphericity indicates that the assumption of sphericity has been violated [<inline-formula><mml:math id="M1"><mml:msubsup><mml:mrow><mml:mi>&#x003C7;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>5</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>12</mml:mn><mml:mo>.</mml:mo><mml:mn>904</mml:mn><mml:mo>,</mml:mo><mml:mi>p</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>024</mml:mn></mml:math></inline-formula>]. By applying the Greenhouse-Geisser correction, the interaction of model and emotion shows a significant effect on the accuracy of subjective experiments [<italic>F</italic><sub>(2.435, 60.865)</sub> = 6.475, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> = 0.206].</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>The mean accuracy and SD of subjective evaluation test for classifying generated music samples into emotion categories.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-g0004.tif"/>
</fig>
<p><xref ref-type="table" rid="T3">Table 3</xref> shows a <italic><bold>post-hoc</bold></italic> Bonferroni adjusted pairwise comparison within each emotion pair of the two methods. <xref ref-type="table" rid="T3">Table 3</xref> indicates that there are significant differences between the two methods on tensional and peaceful samples. The emotion classification accuracy of the label-based method is significantly higher on tensional emotion while significantly lower on peaceful emotion. There are no significant differences between the two methods on happy and sad samples. The note density of experimental stimuli can be used to explain why the proposed model achieved good performance for peaceful whereas the label-based model worked well for tensional. <xref ref-type="table" rid="T2">Table 2</xref> shows that the tensional samples of the label-based model have a much higher note density than that of the EmotionBox. Therefore, the subjects are more likely to judge the former as tensional. On the other hand, the peaceful samples of the EmotionBox have a much lower note density than that of the label-based model. Therefore, the subjects are more likely to judge the former as peaceful. A <italic><bold>post-hoc</bold></italic> Bonferroni adjusted pairwise comparison between each emotion of EmotionBox has been conducted. The result shows no statistically significant differences (<italic><bold>p</bold></italic> &#x0003E; 0.05) between these emotions. Another <italic><bold>post-hoc</bold></italic> Bonferroni adjusted pairwise comparison between each emotion of label-based method has also been conducted. The result shows no statistically significant differences (<italic><bold>p</bold></italic> &#x0003E; 0.05) between happy and tensional, and between peaceful and sad. For other pairs, there are statistically significant differences (<italic><bold>p</bold></italic> &#x0003C; 0.05). 
Combined with <xref ref-type="fig" rid="F4">Figure 4</xref>, the results indicate that emotions with higher arousal like happy and tensional are more likely to be distinguished than emotions with low arousal like sad and peaceful for label-based method.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>A <italic>post-hoc</italic> Bonferroni adjusted pairwise comparison of each emotion pair between two methods.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>EmotionBox</bold></th>
<th valign="top" align="left"><bold>Label-based method</bold></th>
<th valign="top" align="center"><italic><bold>p</bold></italic><bold>-value</bold></th>
</tr>
</thead>
<tbody><tr>
<td valign="top" align="left">Happy</td>
<td valign="top" align="left">Happy</td>
<td valign="top" align="center">0.606</td>
</tr>
<tr>
<td valign="top" align="left">Tensional</td>
<td valign="top" align="left">Tensional</td>
<td valign="top" align="center"><bold>0.004</bold></td>
</tr>
<tr>
<td valign="top" align="left">Sad</td>
<td valign="top" align="left">Sad</td>
<td valign="top" align="center">0.240</td>
</tr>
<tr>
<td valign="top" align="left">Peaceful</td>
<td valign="top" align="left">Peaceful</td>
<td valign="top" align="center"><bold>0.045</bold></td>
</tr>
</tbody>
</table><table-wrap-foot>
<p><italic><bold>p</bold></italic>-value less than 0.05 means a statistically significant difference at a confidence level of 5<italic><bold>%</bold></italic> and is presented in bold type.</p>
</table-wrap-foot>
</table-wrap>
<p>To investigate the performance of generating different emotional music within each model, we also count the result of all the combinations between specific emotions at generating stage and emotions classified by subjects as shown in <xref ref-type="table" rid="T4">Table 4</xref>. From <xref ref-type="table" rid="T4">Table 4A</xref>, it shows that the arousal of music is more distinguishable than valence. For example, for the first row, 28% of happy samples were classified as tensional samples that have the same level of arousal but a different level of valence. However, a happy sample is rarely classified as a peaceful sample as they have a different level of arousal. This experimental result agrees with the observation that tempo is more determinant than the mode in forming happy-sad judgments as reported in Gagnon and Peretz (<xref ref-type="bibr" rid="B12">2003</xref>). In our work, the tempo and the mode are associated with arousal and valence of music, respectively. The classification of arousal and valence will be discussed in next section.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>The results of human classification for each combination between the specific emotion at the generating stage and the emotion classified by subjects.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="center" colspan="5"><bold>(A)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-i0001.tif"/></td>
<td valign="top" align="center"><bold>Happy</bold></td>
<td valign="top" align="center"><bold>Tensional</bold></td>
<td valign="top" align="center"><bold>Sad</bold></td>
<td valign="top" align="center"><bold>Peaceful</bold></td>
</tr>
<tr style="border-top: thin solid #000000;">
<td valign="top" align="left">Happy</td>
<td valign="top" align="center">71%</td>
<td valign="top" align="center">28%</td>
<td valign="top" align="center">0%</td>
<td valign="top" align="center">1%</td>
</tr>
<tr>
<td valign="top" align="left">Tensional</td>
<td valign="top" align="center">17%</td>
<td valign="top" align="center">74%</td>
<td valign="top" align="center">5%</td>
<td valign="top" align="center">4%</td>
</tr>
<tr>
<td valign="top" align="left">Sad</td>
<td valign="top" align="center">1%</td>
<td valign="top" align="center">8%</td>
<td valign="top" align="center">56%</td>
<td valign="top" align="center">35%</td>
</tr>
<tr>
<td valign="top" align="left">Peaceful</td>
<td valign="top" align="center">8%</td>
<td valign="top" align="center">4%</td>
<td valign="top" align="center">26%</td>
<td valign="top" align="center">63%</td>
</tr>
<tr style="border-top: thin solid #000000;">
<td valign="top" align="center" colspan="5"><bold>(B)</bold></td>
</tr>
<tr style="border-top: thin solid #000000;">
<td valign="top" align="left"><inline-graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-i0002.tif"/></td>
<td valign="top" align="center"><bold>Happy</bold></td>
<td valign="top" align="center"><bold>Tensional</bold></td>
<td valign="top" align="center"><bold>Sad</bold></td>
<td valign="top" align="center"><bold>Peaceful</bold></td>
</tr>
<tr style="border-top: thin solid #000000;">
<td valign="top" align="left">Happy</td>
<td valign="top" align="center">74%</td>
<td valign="top" align="center">23%</td>
<td valign="top" align="center">0%</td>
<td valign="top" align="center">3%</td>
</tr>
<tr>
<td valign="top" align="left">Tensional</td>
<td valign="top" align="center">10%</td>
<td valign="top" align="center">90%</td>
<td valign="top" align="center">0%</td>
<td valign="top" align="center">0%</td>
</tr>
<tr>
<td valign="top" align="left">Sad</td>
<td valign="top" align="center">4%</td>
<td valign="top" align="center">18%</td>
<td valign="top" align="center">47%</td>
<td valign="top" align="center">31%</td>
</tr>
<tr>
<td valign="top" align="left">Peaceful</td>
<td valign="top" align="center">26%</td>
<td valign="top" align="center">28%</td>
<td valign="top" align="center">5%</td>
<td valign="top" align="center">41%</td>
</tr>
</tbody>
</table><table-wrap-foot>
<p><bold>(A)</bold> The results of the EmotionBox. <bold>(B)</bold> The results of the emotion-label-based model.</p>
</table-wrap-foot>
</table-wrap>
<p>From <xref ref-type="table" rid="T4">Table 4B</xref>, the classification accuracy is similar for high arousal music. However, for low arousal music, the classification accuracy in terms of both arousal and valence of emotion decreases significantly. For the last row, 26 and 28% of the peaceful samples were perceived as happy samples and tensional samples, respectively, which indicates that the label-based method has a poor performance on generating music with a low arousal emotion.</p>
</sec>
<sec>
<title>Arousal and valence classification</title>
<p>Our proposed method uses note density and pitch histogram as features to present the arousal and valence of a specific emotion, respectively. To investigate whether these two features are suitable or not for training the deep neural networks, we calculated the accuracy of arousal and valence classification as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. If the emotion specified during the generating stage and the emotion classified by subjects have the same arousal or valence, the classification result will be calculated as correct. For example, if the emotion of a sample specified during the generating stage is happy while classified as tensional by subjects, the classification result will be viewed as correct because of the same arousal of happy and tensional.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>The mean accuracy and SD of subjective evaluation test for classifying generated music samples into arousal and valence categories.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpsyg-13-841926-g0005.tif"/>
</fig>
<p>A two-way ANOVA is used with arousal and model set as within-subject factors to investigate how these two factors affect the accuracy of subjective experiments. The statistical results show that model [<italic>F</italic><sub>(1, 25)</sub> = 20.457, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> = 0.450] and arousal [<italic>F</italic><sub>(1, 25)</sub> = 42.989, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> = 0.632] have a significant effect on the accuracy of subjective experiments. The interaction of model and arousal has a significant effect on the accuracy of subjective experiments [<italic>F</italic><sub>(1, 25)</sub> = 43.846, <italic>p</italic> &#x0003C; 0.01, partial &#x003B7;<sup>2</sup> = 0.637]. Another two-way ANOVA is also adopted with valence and model set as within-subject factors. The statistical results show that model [<italic>F</italic><sub>(1, 25)</sub> = 0.962, <italic>p</italic> = 0.346, partial &#x003B7;<sup>2</sup> = 0.036] and valence [<italic>F</italic><sub>(1, 25)</sub> = 0.962, <italic>p</italic> = 0.259, partial &#x003B7;<sup>2</sup> = 0.051] have no significant effect on the accuracy of subjective experiments. The interaction of model and valence shows no significant effect on the accuracy of subjective experiments [<italic>F</italic><sub>(1, 25)</sub> = 1.000, <italic>p</italic> = 0.327, partial &#x003B7;<sup>2</sup> = 0.038]. <xref ref-type="table" rid="T5">Table 5</xref> shows a <italic>post-hoc</italic> Bonferroni adjusted pairwise comparison between two methods in terms of arousal and valence.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>A <italic>post-hoc</italic> Bonferroni adjusted pairwise comparison of each arousal and valence conditions of the two methods.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>EmotionBox</bold></th>
<th valign="top" align="left"><bold>Label-based method</bold></th>
<th valign="top" align="center"><bold><italic>p</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">High arousal</td>
<td valign="top" align="left">High arousal</td>
<td valign="top" align="center">0.325</td>
</tr>
<tr>
<td valign="top" align="left">Low arousal</td>
<td valign="top" align="left">Low arousal</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">High valence</td>
<td valign="top" align="left">High valence</td>
<td valign="top" align="center">0.891</td>
</tr>
<tr>
<td valign="top" align="left">Low valence</td>
<td valign="top" align="left">Low valence</td>
<td valign="top" align="center">0.220</td>
</tr>
</tbody>
</table><table-wrap-foot>
<p><italic>p</italic>-value less than 0.05 means a statistically significant difference at a confidence level of 5% and is presented in bold type.</p>
</table-wrap-foot>
</table-wrap>
<p>It shows that the classification accuracy of EmotionBox is significantly higher than that of the label-based method on low arousal emotions. For other emotion categories, <xref ref-type="table" rid="T5">Table 5</xref> shows that there is no significant difference between the two methods for the other three pairs. The tempo and the mode are related to note density and pitch histogram, respectively, in our work. Note density and pitch histogram further present arousal and valence, respectively. Without the limitation of note density, the label-based method tends to generate music with a faster tempo, which results in a low classification accuracy of the samples with low arousal emotions. This result means note density is a suitable feature to control the arousal of music.</p>
</sec>
<sec>
<title>Limitations and outlook</title>
<p>However, there are still some limitations to the proposed method. First, the classification of valence is still challenging, which indicates that the valence of music cannot solely be presented by mode. A more appropriate presentation method of valence should be investigated in future work. Second, the generated music is more like an improvisation. The model learns how to play the next note according to the previous notes whereas it has no idea about the structure of music. The structure of music is important and needs to be considered in future work.</p>
<p>The EmotionBox can be used to help the composers create music with a specific emotion by providing various novel samples. By tuning the network&#x00027;s parameters, the EmotionBox can be a versatile assistant to create music. The combination of intelligent music composition and performance of music robot based on emotional computing is a promising approach for the future development of human-machine interaction, which provides a practical solution to eliminate the interaction barrier between humans and machines. Automatic emotional music may also be helpful for music therapy. Studies have shown neurological evidence that music effectively enhances auditory and language function through the human brain&#x00027;s plasticity (Hyde et al., <xref ref-type="bibr" rid="B23">2009</xref>; Dittinger et al., <xref ref-type="bibr" rid="B5">2017</xref>). Music therapies that utilize music as a treatment for tinnitus can leverage the plasticity in the auditory cortex and thus reduce the impact of tinnitus (Ellis et al., <xref ref-type="bibr" rid="B9">2010</xref>). Some researchers have also shown that emotional music may support emotion recognition in children with ASD, and thus improve their social skills (Wagener et al., <xref ref-type="bibr" rid="B40">2021</xref>). Music therapy often needs to avoid repetitive music. By tuning the network&#x00027;s parameters, the proposed method can generate non-repetitive music with a predefined emotion, which may be helpful for music therapy applications.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="s7">
<title>Conclusion</title>
<p>In this work, we propose a music-element-driven automatic emotional music generator based on music psychology. This model does not need any music datasets with emotion labels that the previous methods required. The note density and the pitch histogram are chosen to present the arousal and valence of music, respectively. Then, different combinations of arousal and valence will be mapped to different emotions according to the Russell emotion model. Based on the specific note density and pitch histogram, our proposed method will be able to evoke listeners&#x00027; different auditory perceptions and emotions. Subjective experimental results indicate that our proposed method has a significantly better performance in generating music with low arousal emotions. The results of the subjective listening test also indicate that note density is a suitable presentation for the arousal of music while more research studies should be carried out to find a more appropriate feature to convey the valence of music. The proposed method may have unique values for some music therapy applications.</p>
</sec>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s9">
<title>Ethics statement</title>
<p>The studies involving human participants were reviewed and approved by the Ethics Committee of the Institute of Acoustics Chinese Academy of Sciences. The patients/participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s10">
<title>Author contributions</title>
<p>KZ: writing. RM, KZ, and JS: methodology. CZ and XL: supervision and editing. JS: writing-review. JC: database. JW: evaluation. XW: data analysis and modification. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec sec-type="funding-information" id="s11">
<title>Funding</title>
<p>This work was supported by the National Science Fund of China (Grant Nos. 12074403 and 11974086), the Open Research Project of the State Key Laboratory of Media Convergence and Communication, Communication University of China, China (Nos. SKLMCC2021KF014 and SKLMCC2020KF005). This work was supported by National Key Research and Development Project (2021YFB3201702).</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<ack><p>The authors would like to thank those subjects who participated in the listening tests. The authors would also like to express our great appreciation to the editor and the reviewers.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Boulanger-Lewandowski</surname> <given-names>N.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Vincent</surname> <given-names>P.</given-names></name></person-group> (<year>2012</year>). <article-title>Modeling temporal dependencies in high-dimensional sequences: application to polyphonic music generation and transcription</article-title>. <source>arXiv preprint arXiv:1206.6392</source>. <pub-id pub-id-type="doi">10.1109/ICASSP.2013.6638244</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bradley</surname> <given-names>M. M.</given-names></name> <name><surname>Codispoti</surname> <given-names>M.</given-names></name> <name><surname>Cuthbert</surname> <given-names>B. N.</given-names></name> <name><surname>Lang</surname> <given-names>P. J.</given-names></name></person-group> (<year>2001</year>). <article-title>Emotion and motivation i: defensive and appetitive reactions in picture processing</article-title>. <source>Emotion</source> <volume>1</volume>, <fpage>276</fpage>. <pub-id pub-id-type="doi">10.1037/1528-3542.1.3.276</pub-id><pub-id pub-id-type="pmid">12934687</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Brunner</surname> <given-names>G.</given-names></name> <name><surname>Konrad</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Wattenhofer</surname> <given-names>R.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;MiDI-VAE: modeling dynamics and instrumentation of music with applications to style transfer,&#x0201D;</article-title> in <source>Proceedings of the 19th International Society for Music Information Retrieval Conference</source> (<publisher-loc>Paris</publisher-loc>).</citation>
</ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cho</surname> <given-names>K.</given-names></name> <name><surname>Van Merri&#x000EB;nboer</surname> <given-names>B.</given-names></name> <name><surname>Gulcehre</surname> <given-names>C.</given-names></name> <name><surname>Bahdanau</surname> <given-names>D.</given-names></name> <name><surname>Bougares</surname> <given-names>F.</given-names></name> <name><surname>Schwenk</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>&#x0201C;Learning phrase representations using RNN encoder-decoder for statistical machine translation,&#x0201D;</article-title> in <source>EMNLP 2014-2014 Conference on Empirical Methods in Natural Language Processing, Proceedings of the Conference</source> (<publisher-loc>Doha</publisher-loc>).</citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dittinger</surname> <given-names>E.</given-names></name> <name><surname>Chobert</surname> <given-names>J.</given-names></name> <name><surname>Ziegler</surname> <given-names>J. C.</given-names></name> <name><surname>Besson</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>Fast brain plasticity during word learning in musically-trained children</article-title>. <source>Front. Hum. Neurosci</source>. 11, 233. <pub-id pub-id-type="doi">10.3389/fnhum.2017.00233</pub-id><pub-id pub-id-type="pmid">28553213</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Dong</surname> <given-names>H. W.</given-names></name> <name><surname>Hsiao</surname> <given-names>W. Y.</given-names></name> <name><surname>Yang</surname> <given-names>L. C.</given-names></name> <name><surname>Yang</surname> <given-names>Y. H.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Musegan: multi-track sequential generative adversarial networks for symbolic music generation and accompaniment,&#x0201D;</article-title> in <source>32nd AAAI Conference on Artificial Intelligence</source> (<publisher-loc>New Orleans, LA</publisher-loc>: <publisher-name>AAAI</publisher-name>).</citation>
</ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Eck</surname> <given-names>D.</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>2002</year>). <article-title>&#x0201C;Finding temporal structure in music: blues improvisation with LSTM recurrent networks,&#x0201D;</article-title> in <source>Neural Networks for Signal Processing-Proceedings of the IEEE Workshop, Vol. 2002</source> (<publisher-loc>Martigny</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Eerola</surname> <given-names>T.</given-names></name> <name><surname>Vuoskoski</surname> <given-names>J. K.</given-names></name></person-group> (<year>2012</year>). <article-title>A review of music and emotion studies: approaches, emotion models, and stimuli</article-title>. <source>Music Percept</source>. <volume>30</volume>, <fpage>307</fpage>&#x02013;<lpage>340</lpage>. <pub-id pub-id-type="doi">10.1525/mp.2012.30.3.307</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ellis</surname> <given-names>E. C. W.</given-names></name> <name><surname>Schlaug</surname> <given-names>G.</given-names></name> <name><surname>Pantev</surname> <given-names>C.</given-names></name></person-group> (<year>2010</year>). <article-title>Listening to filtered music as a treatment option for tinnitus: a review</article-title>. <source>Music Percept</source>. <volume>27</volume>, <fpage>327</fpage>&#x02013;<lpage>330</lpage>. <pub-id pub-id-type="doi">10.1525/mp.2010.27.4.327</pub-id><pub-id pub-id-type="pmid">21170296</pub-id></citation></ref>
<ref id="B10">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ferreira</surname> <given-names>L.</given-names></name> <name><surname>Lelis</surname> <given-names>L.</given-names></name> <name><surname>Whitehead</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Computer-generated music for tabletop role-playing games,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence and Interactive Digital Entertainment</source>, Vol. 16 (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>AAAI</publisher-name>), <fpage>59</fpage>&#x02013;<lpage>65</lpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ferreira</surname> <given-names>L. N.</given-names></name> <name><surname>Whitehead</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Learning to generate music with sentiment,&#x0201D;</article-title> in <source>Proceedings of the 20th International Society for Music Information Retrieval Conference</source> (<publisher-loc>Delft</publisher-loc>).</citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gagnon</surname> <given-names>L.</given-names></name> <name><surname>Peretz</surname> <given-names>I.</given-names></name></person-group> (<year>2003</year>). <article-title>Mode and tempo relative contributions to &#x0201C;happy-sad&#x0201D; judgements in equitone melodies</article-title>. <source>Cogn. Emot</source>. <volume>17</volume>, <fpage>25</fpage>&#x02013;<lpage>40</lpage>. <pub-id pub-id-type="doi">10.1080/02699930302279</pub-id><pub-id pub-id-type="pmid">29715736</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Guan</surname> <given-names>F.</given-names></name> <name><surname>Yu</surname> <given-names>C.</given-names></name> <name><surname>Yang</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;A GAN model with self-attention mechanism to generate multi-instruments symbolic music,&#x0201D;</article-title> in <source>Proceedings of the International Joint Conference on Neural Networks</source> (<publisher-loc>Budapest</publisher-loc>).</citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hadjeres</surname> <given-names>G.</given-names></name> <name><surname>Nielsen</surname> <given-names>F.</given-names></name></person-group> (<year>2017</year>). <article-title>Interactive music generation with positional constraints using anticipation-rnns</article-title>. <source>arXiv preprint arXiv:1709.06404</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1709.06404</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>Y. L.</given-names></name> <name><surname>Zhang</surname> <given-names>X. L.</given-names></name> <name><surname>Ao</surname> <given-names>W.</given-names></name> <name><surname>Huang</surname> <given-names>J. Z.</given-names></name></person-group> (<year>2018</year>). <article-title>Determining the optimal temperature parameter for Softmax function in reinforcement learning</article-title>. <source>Appl. Soft Comput. J</source>. <volume>70</volume>, <fpage>80</fpage>&#x02013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2018.05.012</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Herremans</surname> <given-names>D.</given-names></name> <name><surname>Chew</surname> <given-names>E.</given-names></name></person-group> (<year>2019</year>). <article-title>MorpheuS: generating structured music with constrained patterns and tension</article-title>. <source>IEEE Trans. Affect. Comput</source>. <volume>10</volume>, <fpage>510</fpage>&#x02013;<lpage>523</lpage>. <pub-id pub-id-type="doi">10.1109/TAFFC.2017.2737984</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Herremans</surname> <given-names>D.</given-names></name> <name><surname>Chuan</surname> <given-names>C. H.</given-names></name> <name><surname>Chew</surname> <given-names>E.</given-names></name></person-group> (<year>2017</year>). <article-title>A functional taxonomy of music generation systems</article-title>. <source>ACM Comput. Surveys</source> <volume>50</volume>, <fpage>1</fpage>&#x02013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1145/3108242</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hevner</surname> <given-names>K.</given-names></name></person-group> (<year>1935</year>). <article-title>The affective character of the major and minor modes in music</article-title>. <source>Am. J. Psychol</source>. <volume>47</volume>, <fpage>103</fpage>&#x02013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.2307/1416710</pub-id><pub-id pub-id-type="pmid">23914179</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hevner</surname> <given-names>K.</given-names></name></person-group> (<year>1936</year>). <article-title>Experimental studies of the elements of expression in music</article-title>. <source>Am. J. Psychol</source>. <volume>48</volume>, <fpage>246</fpage>&#x02013;<lpage>268</lpage>. <pub-id pub-id-type="doi">10.2307/1415746</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Anil</surname> <given-names>C.</given-names></name> <name><surname>Bao</surname> <given-names>X.</given-names></name> <name><surname>Oore</surname> <given-names>S.</given-names></name> <name><surname>Grosse</surname> <given-names>R. B.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Timbretron: a wavenet(Cyclegan(CqT(Audio))) pipeline for musical timbre transfer,&#x0201D;</article-title> in <source>7th International Conference on Learning Representations</source> (<publisher-loc>New Orleans, LA</publisher-loc>).</citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hunter</surname> <given-names>P. G.</given-names></name> <name><surname>Schellenberg</surname> <given-names>E. G.</given-names></name> <name><surname>Schimmack</surname> <given-names>U.</given-names></name></person-group> (<year>2008</year>). <article-title>Mixed affective responses to music with conflicting cues</article-title>. <source>Cogn. Emot</source>. <volume>22</volume>, <fpage>327</fpage>&#x02013;<lpage>352</lpage>. <pub-id pub-id-type="doi">10.1080/02699930701438145</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hunter</surname> <given-names>P. G.</given-names></name> <name><surname>Schellenberg</surname> <given-names>E. G.</given-names></name> <name><surname>Schimmack</surname> <given-names>U.</given-names></name></person-group> (<year>2010</year>). <article-title>Feelings and perceptions of happiness and sadness induced by music: similarities, differences, and mixed emotions</article-title>. <source>Psychol. Aesthet. Creat. Arts</source> <volume>4</volume>, <fpage>47</fpage>. <pub-id pub-id-type="doi">10.1037/a0016873</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hyde</surname> <given-names>K. L.</given-names></name> <name><surname>Lerch</surname> <given-names>J.</given-names></name> <name><surname>Norton</surname> <given-names>A.</given-names></name> <name><surname>Forgeard</surname> <given-names>M.</given-names></name> <name><surname>Winner</surname> <given-names>E.</given-names></name> <name><surname>Evans</surname> <given-names>A. C.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>Musical training shapes structural brain development</article-title>. <source>J. Neurosci</source>. <volume>29</volume>, <fpage>3019</fpage>&#x02013;<lpage>3025</lpage>. <pub-id pub-id-type="doi">10.1523/JNEUROSCI.5118-08.2009</pub-id><pub-id pub-id-type="pmid">19279238</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>C.</given-names></name> <name><surname>Tie</surname> <given-names>Y.</given-names></name> <name><surname>Bai</surname> <given-names>Y.</given-names></name> <name><surname>Lv</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>A style-specific music composition neural network</article-title>. <source>Neural Process. Lett</source>. <volume>52</volume>, <fpage>1893</fpage>&#x02013;<lpage>1912</lpage>. <pub-id pub-id-type="doi">10.1007/s11063-020-10241-8</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Johnson</surname> <given-names>D. D.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Generating polyphonic music using tied parallel networks,&#x0201D;</article-title> in <source>Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics), Vol. 10198</source> (<publisher-loc>Amsterdam</publisher-loc>).</citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kirke</surname> <given-names>A.</given-names></name> <name><surname>Miranda</surname> <given-names>E. R.</given-names></name></person-group> (<year>2009</year>). <article-title>A survey of computer systems for expressive music performance</article-title>. <source>ACM Comput. Surv</source>. 42, 41. <pub-id pub-id-type="doi">10.1145/1592451.1592454</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Mao</surname> <given-names>H. H.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;DeepJ: style-specific music generation,&#x0201D;</article-title> in <source>Proceedings-12th IEEE International Conference on Semantic Computing, ICSC 2018</source> (<publisher-loc>Laguna Hills, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Micallef Grimaud</surname> <given-names>A.</given-names></name> <name><surname>Eerola</surname> <given-names>T.</given-names></name></person-group> (<year>2022</year>). <article-title>An interactive approach to emotional expression through musical cues</article-title>. <source>Music Sci</source>. 5, 20592043211061745. <pub-id pub-id-type="doi">10.1177/20592043211061745</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oore</surname> <given-names>S.</given-names></name> <name><surname>Simon</surname> <given-names>I.</given-names></name> <name><surname>Dieleman</surname> <given-names>S.</given-names></name> <name><surname>Eck</surname> <given-names>D.</given-names></name> <name><surname>Simonyan</surname> <given-names>K.</given-names></name></person-group> (<year>2020</year>). <article-title>This time with feeling: learning expressive musical performance</article-title>. <source>Neural Comput. Appl</source>. <volume>32</volume>, <fpage>955</fpage>&#x02013;<lpage>967</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-018-3758-9</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Parncutt</surname> <given-names>R.</given-names></name></person-group> (<year>2014</year>). <article-title>The emotional connotations of major versus minor tonality: one or more origins?</article-title> <source>Musicae Sci</source>. <volume>18</volume>, <fpage>324</fpage>&#x02013;<lpage>353</lpage>. <pub-id pub-id-type="doi">10.1177/1029864914542842</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raffel</surname> <given-names>C.</given-names></name> <name><surname>Ellis</surname> <given-names>D. P.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Intuitive analysis, creation and manipulation of midi data with pretty midi,&#x0201D;</article-title> in <source>15th International Society for Music Information Retrieval Conference Late Breaking and Demo Papers</source>, (Taipei), <fpage>84</fpage>&#x02013;<lpage>93</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raynor</surname> <given-names>H.</given-names></name> <name><surname>Meyer</surname> <given-names>L. B.</given-names></name></person-group> (<year>1958</year>). <article-title>Emotion and meaning in music</article-title>. <source>Musical Times</source> <volume>99</volume>, <fpage>1380</fpage>. <pub-id pub-id-type="doi">10.2307/937584</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rigg</surname> <given-names>M. G.</given-names></name></person-group> (<year>1940</year>). <article-title>Speed as a determiner of musical mood</article-title>. <source>J. Exp. Psychol</source>. <volume>27</volume>, <fpage>566</fpage>&#x02013;<lpage>571</lpage>. <pub-id pub-id-type="doi">10.1037/h0058652</pub-id><pub-id pub-id-type="pmid">18509502</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Russell</surname> <given-names>J. A.</given-names></name></person-group> (<year>1980</year>). <article-title>A circumplex model of affect</article-title>. <source>J. Pers. Soc. Psychol</source>. 39, 1161. <pub-id pub-id-type="doi">10.1037/h0077714</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Schimbinschi</surname> <given-names>F.</given-names></name> <name><surname>Walder</surname> <given-names>C.</given-names></name> <name><surname>Erfani</surname> <given-names>S. M.</given-names></name> <name><surname>Bailey</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;SynthNet: Learning to synthesize music end-to-end,&#x0201D;</article-title> in <source>IJCAI International Joint Conference on Artificial Intelligence</source> (<publisher-loc>Macao</publisher-loc>).</citation>
</ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Subramani</surname> <given-names>K.</given-names></name> <name><surname>Rao</surname> <given-names>P.</given-names></name> <name><surname>D&#x00027;Hooge</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Vapar synth-a variational parametric model for audio synthesis,&#x0201D;</article-title> in <source>ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings, volume 2020-May</source> (<publisher-loc>Barcelona</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Todd</surname> <given-names>P. M.</given-names></name></person-group> (<year>1989</year>). <article-title>Connectionist approach to algorithmic composition</article-title>. <source>Comput. Music J</source>. <volume>13</volume>, <fpage>27</fpage>&#x02013;<lpage>43</lpage>. <pub-id pub-id-type="doi">10.2307/3679551</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tzanetakis</surname> <given-names>G.</given-names></name> <name><surname>Ermolinskyi</surname> <given-names>A.</given-names></name> <name><surname>Cook</surname> <given-names>P.</given-names></name></person-group> (<year>2003</year>). <article-title>Pitch histograms in audio and symbolic music information retrieval</article-title>. <source>J. New Music Res</source>. <volume>32</volume>, <fpage>143</fpage>&#x02013;<lpage>152</lpage>. <pub-id pub-id-type="doi">10.1076/jnmr.32.2.143.16743</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>van den Oord</surname> <given-names>A.</given-names></name> <name><surname>Dieleman</surname> <given-names>S.</given-names></name> <name><surname>Zen</surname> <given-names>H.</given-names></name> <name><surname>Simonyan</surname> <given-names>K.</given-names></name> <name><surname>Vinyals</surname> <given-names>O.</given-names></name> <name><surname>Graves</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>WaveNet: a generative model for raw audio</article-title>. <source>arXiv</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1609.03499</pub-id></citation>
</ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wagener</surname> <given-names>G. L.</given-names></name> <name><surname>Berning</surname> <given-names>M.</given-names></name> <name><surname>Costa</surname> <given-names>A. P.</given-names></name> <name><surname>Steffgen</surname> <given-names>G.</given-names></name> <name><surname>Melzer</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>Effects of emotional music on facial emotion recognition in children with autism spectrum disorder (asd)</article-title>. <source>J. Autism. Dev. Disord</source>. <volume>51</volume>, <fpage>3256</fpage>&#x02013;<lpage>3265</lpage>. <pub-id pub-id-type="doi">10.1007/s10803-020-04781-0</pub-id><pub-id pub-id-type="pmid">33201423</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Waite</surname> <given-names>E.</given-names></name></person-group> (<year>2016</year>). <source>Generating Long-Term Structure in Songs and Stories. Magenta Blog</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://magenta.tensorflow.org/2016/07/15/lookback-rnn-attention-rnn">https://magenta.tensorflow.org/2016/07/15/lookback-rnn-attention-rnn</ext-link></citation>
</ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Westergaard</surname> <given-names>P.</given-names></name> <name><surname>Hiller</surname> <given-names>L. A.</given-names></name> <name><surname>Isaacson</surname> <given-names>L. M.</given-names></name></person-group> (<year>1959</year>). <article-title>Experimental Music. Composition with an electronic computer</article-title>. <source>J. Music Theory</source> <volume>3</volume>, <fpage>302</fpage>&#x02013;<lpage>306</lpage>. <pub-id pub-id-type="doi">10.2307/842857</pub-id></citation>
</ref>
<ref id="B43">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>L. C.</given-names></name> <name><surname>Chou</surname> <given-names>S. Y.</given-names></name> <name><surname>Yang</surname> <given-names>Y. H.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Midinet: a convolutional generative adversarial network for symbolic-domain music generation,&#x0201D;</article-title> in <source>Proceedings of the 18th International Society for Music Information Retrieval Conference</source> (<publisher-loc>Suzhou</publisher-loc>).</citation>
</ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>N.</given-names></name></person-group> (<year>2020</year>). <article-title>Learning adversarial transformer for symbolic music generation</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <pub-id pub-id-type="doi">10.1109/TNNLS.2020.2990746</pub-id><pub-id pub-id-type="pmid">32614773</pub-id></citation></ref>
<ref id="B45">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>K.</given-names></name> <name><surname>Li</surname> <given-names>S.</given-names></name> <name><surname>Cai</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;An emotional symbolic music generation system based on LSTM networks,&#x0201D;</article-title> in <source>Proceedings of 2019 IEEE 3rd Information Technology, Networking, Electronic and Automation Control Conference, ITNEC 2019</source> (<publisher-loc>Chengdu</publisher-loc>: <publisher-name>IEEE</publisher-name>).</citation>
</ref>
</ref-list>
<fn-group>
<fn id="fn0001"><p><sup>1</sup>The codes are available on <ext-link ext-link-type="uri" xlink:href="https://github.com/KaitongZheng/EmotionBox">https://github.com/KaitongZheng/EmotionBox</ext-link>.</p></fn>
<fn id="fn0002"><p><sup>2</sup>The training data can be found on <ext-link ext-link-type="uri" xlink:href="http://www.piano-midi.de/">http://www.piano-midi.de/</ext-link>.</p></fn>
<fn id="fn0003"><p><sup>3</sup>The subjective listening test files can be found on <ext-link ext-link-type="uri" xlink:href="https://github.com/KaitongZheng/EmotionBoxDEMO">https://github.com/KaitongZheng/EmotionBoxDEMO</ext-link>.</p></fn>
</fn-group>
</back>
</article>