<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Public Health</journal-id>
<journal-title-group>
<journal-title>Frontiers in Public Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Public Health</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-2565</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpubh.2025.1633924</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Study on the path of combining music and digital health technology to promote the health of older adult groups</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Ma</surname> <given-names>Chuang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3054566"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hu</surname> <given-names>Bo</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname> <given-names>Shixue</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ma</surname> <given-names>Xiaomei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Music, Southwest University</institution>, <city>Chongqing</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Guang&#x00027;anmen Hospital South Campus, China Academy of Chinese Medical Sciences</institution>, <city>Beijing</city>, <country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Oncology, The First Affiliated Hospital of Chongqing Medical University</institution>, <city>Chongqing</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Chuang Ma, <email xlink:href="mailto:17783068887@163.com">17783068887@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-26">
<day>26</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1633924</elocation-id>
<history>
<date date-type="received">
<day>12</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>26</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>18</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Ma, Hu, Chen and Ma.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Ma, Hu, Chen and Ma</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-26">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Objective</title>
<p>As the global population ages, non-pharmacological interventions such as personalized music therapy show promise for supporting wellbeing in older adults. We propose the Fusion-Attentive Temporal Network (FAT-Net), a dual-stream model that processes minute-level heart-rate and music on/off data alongside daily summary features to predict a composite health score.</p></sec>
<sec>
<title>Methods</title>
<p>Data from 92 participants over 45 &#x000B1; 10 days were augmented fourfold using jittering, time-warping, magnitude scaling, and SMOTE. The temporal stream uses Conv1D, BiLSTM, and self-attention pooling. The summary stream uses a three-layer MLP. Cross-modal attention fuses both embeddings.</p></sec>
<sec>
<title>Results</title>
<p>Over ten runs, FAT-Net achieved RMSE = 0.35 &#x000B1; 0.005 (22.7% reduction vs. Random Forest), MAE = 0.28 &#x000B1; 0.005 (19.5% reduction), and <italic>R</italic><sup>2</sup> = 0.87 &#x000B1; 0.008 (17.3% improvement). Pearson&#x00027;s <italic>r</italic> between predictions and true values was 0.93.</p></sec>
<sec>
<title>Conclusion</title>
<p>FAT-Net&#x00027;s attention-based fusion provides a robust, interpretable approach for forecasting daily wellbeing in older adults.</p></sec></abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>Health</kwd>
<kwd>Health of older adult groups</kwd>
<kwd>health score</kwd>
<kwd>music and digital health</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="11"/>
<table-count count="3"/>
<equation-count count="10"/>
<ref-count count="20"/>
<page-count count="11"/>
<word-count count="5459"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Aging and Public Health</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<sec>
<label>1.1</label>
<title>Background and significance</title>
<p>As the global population ages, the need for scalable, proactive solutions to support health and wellbeing among older adults has become increasingly urgent. By 2050, adults aged 60 and over are projected to comprise 22% of the global population (<xref ref-type="bibr" rid="B1">1</xref>). This demographic shift is accompanied by a rise in chronic diseases and cognitive decline (<xref ref-type="bibr" rid="B2">2</xref>, <xref ref-type="bibr" rid="B3">3</xref>), placing immense pressure on healthcare systems. In this context, non-pharmacological interventions such as music therapy have shown promise in promoting mental and emotional wellbeing, particularly by enhancing mood, memory, and social engagement in older adults (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B3">3</xref>, <xref ref-type="bibr" rid="B4">4</xref>). Additionally, music listening has been linked to modulation of physiological indicators such as heart rate variability (HRV) and stress biomarkers (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>).</p>
<p>Advancements in wearable technology enable the continuous collection of physiological signals such as heart rate and HRV (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>). These data offer new opportunities to develop personalized digital therapeutics that can adapt in real-time to individual needs. Machine learning and deep learning models&#x02014;including Random Forest (<xref ref-type="bibr" rid="B8">8</xref>), XGBoost (<xref ref-type="bibr" rid="B9">9</xref>), LSTM (<xref ref-type="bibr" rid="B10">10</xref>), and TCN (<xref ref-type="bibr" rid="B11">11</xref>)&#x02014;have been employed to extract insights from physiological data. However, most existing models fail to effectively integrate behavioral and physiological modalities (<xref ref-type="bibr" rid="B12">12</xref>). Furthermore, composite health indices that merge affective states (e.g., PANAS) and physiological metrics such as HRV are increasingly used to provide a holistic measure of wellbeing (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B13">13</xref>).</p>
<p>Despite these advances, the application of music-driven health prediction remains underexplored in real-world aging care contexts. Digital platforms for older adult care often lack dynamic personalization and explainability. Bridging this gap requires novel methods that can simultaneously model complex multimodal data and offer interpretable outputs to support clinician and user trust (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B14">14</xref>).</p></sec>
<sec>
<label>1.2</label>
<title>Related work</title>
<p>Prior research has examined the role of music in health interventions across diverse domains. Faulkner et al. (<xref ref-type="bibr" rid="B4">4</xref>) developed <italic>Rhythm2Recovery</italic>, a rhythmic music and reflection-based program that improved emotional regulation and social reconnection in trauma recovery settings. Davidoff (<xref ref-type="bibr" rid="B3">3</xref>) emphasized the parallels between musicianship and medical practice, suggesting that music-based training can bolster stress resilience in healthcare professionals.</p>
<p>In the domain of wearable health, Groh et al. (<xref ref-type="bibr" rid="B15">15</xref>) introduced lightweight, explainable models for on-device symptom detection using mechano-acoustic signals. Their interpretability strategies mirror our attention-based approach to understanding physiological responses during music listening. Wang et al. (<xref ref-type="bibr" rid="B5">5</xref>) demonstrated that musical features such as valence and tempo can be extracted using deep learning and correlate with mental energy&#x02014;a concept we extend to older adult users.</p>
<p>Meta-analyses by Raglio (<xref ref-type="bibr" rid="B1">1</xref>) confirm the effectiveness of music interventions in improving mood and reducing stress across older adult populations. Meanwhile, studies in educational and digital contexts (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B16">16</xref>, <xref ref-type="bibr" rid="B17">17</xref>) have highlighted how music paired with real-time feedback can enhance self-regulation and emotional wellbeing. Bulaj et al. (<xref ref-type="bibr" rid="B14">14</xref>) demonstrated the potential of combining pharmacological treatments with personalized music playlists to empower patients. Liu et al. (<xref ref-type="bibr" rid="B18">18</xref>) applied biofeedback to adapt music to passengers&#x00027; real-time heart-rate states, illustrating the potential for responsive, music-driven systems.</p>
<p>Complementary works in healthcare education and reflective practice have shown that integrating expressive arts&#x02014;including music&#x02014;can improve empathy, engagement, and cognitive performance in trainees (<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B20">20</xref>). The PANAS scale, widely used in music therapy studies, provides a validated tool for capturing emotional states and forms a key input to our modeling approach (<xref ref-type="bibr" rid="B13">13</xref>).</p></sec>
<sec>
<label>1.3</label>
<title>Open challenges</title>
<p>Despite these promising developments, several challenges remain. Many prior studies rely on small, localized samples, which limits generalizability to broader older adult populations (<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>). Moreover, few models integrate minute-level physiological signals with high-level behavioral summaries into a unified framework (<xref ref-type="bibr" rid="B12">12</xref>). Wearable devices also face resource constraints, requiring models that are both accurate and lightweight (<xref ref-type="bibr" rid="B15">15</xref>). Interpretability is another major concern, as black-box models may hinder clinical adoption without transparent mechanisms for decision-making (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B15">15</xref>). Lastly, user variability in music preferences and sensor engagement demands adaptive systems that remain robust across diverse use cases (<xref ref-type="bibr" rid="B16">16</xref>, <xref ref-type="bibr" rid="B18">18</xref>), while minimizing fatigue from self-reporting (<xref ref-type="bibr" rid="B13">13</xref>).</p></sec>
<sec>
<label>1.4</label>
<title>Research motivation</title>
<p>We aim to address these gaps by focusing specifically on community-dwelling older adults who can benefit from proactive, music-driven digital health interventions. While music therapy has proven beneficial, most existing digital health models ignore music behavior as a variable of interest (<xref ref-type="bibr" rid="B1">1</xref>). Likewise, although heart-rate monitors collect minute-level data, these signals are rarely fused with subjective and behavioral features. Our proposed FAT-Net model bridges this divide by integrating physiological dynamics with music listening patterns, enabling interpretable predictions of next-day health outcomes. This integration not only improves forecasting accuracy but also supports clinical decision-making and user engagement through attention-based explanations.</p></sec>
<sec>
<label>1.5</label>
<title>Hypothesis and contributions</title>
<p>We hypothesize that cross-modal attention in FAT-Net will significantly improve next-day health score prediction compared to unimodal baselines. Specifically, we expect reductions in RMSE and MAE greater than 15%, and improvements in <italic>R</italic><sup>2</sup>, with attention mechanisms highlighting meaningful interactions (e.g., HRV dips during high-tempo music). These interpretable insights are aligned with clinical understanding of stress responses and user engagement, and contribute to increased transparency and trust.</p>
<p>The key contributions of this paper are as follows:</p>
<list list-type="order">
<list-item><p>We propose FAT-Net, a dual-stream attention model that fuses physiological and behavioral data for daily health prediction.</p></list-item>
<list-item><p>We curate and augment a multimodal dataset combining minute-level heart-rate signals and self-reported music engagement in older adults.</p></list-item>
<list-item><p>We demonstrate significant improvements over baselines: 23% RMSE reduction and 17% <italic>R</italic><sup>2</sup> improvement.</p></list-item>
<list-item><p>We visualize attention weights to interpret model predictions, identifying critical time segments and feature contributions.</p></list-item>
</list></sec>
<sec>
<label>1.6</label>
<title>Glossary of terms and acronyms</title>
<p>To improve clarity for interdisciplinary readers, we provide a glossary of key technical terms and acronyms used throughout the manuscript. This glossary includes definitions for commonly used concepts such as FAT-Net, HRV, and PANAS. Readers unfamiliar with these terms may refer to <xref ref-type="table" rid="T1">Table 1</xref> for concise explanations.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Glossary of key terms and acronyms used in this paper.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Term/acronym</bold></th>
<th valign="top" align="left"><bold>Definition</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FAT-Net</td>
<td valign="top" align="left">Fusion-Attentive Temporal Network (proposed dual-stream predictive model)</td>
</tr>
<tr>
<td valign="top" align="left">HRV</td>
<td valign="top" align="left">Heart Rate Variability, a measure of autonomic nervous system activity</td>
</tr>
<tr>
<td valign="top" align="left">PANAS</td>
<td valign="top" align="left">Positive and Negative Affect Schedule, a validated self-report mood scale</td>
</tr>
<tr>
<td valign="top" align="left">RMSE</td>
<td valign="top" align="left">Root Mean Squared Error, a common regression evaluation metric</td>
</tr>
<tr>
<td valign="top" align="left">MAE</td>
<td valign="top" align="left">Mean Absolute Error, another regression performance measure</td>
</tr>
<tr>
<td valign="top" align="left"><italic>R</italic><sup>2</sup></td>
<td valign="top" align="left">Coefficient of Determination, indicating variance explained by the model</td>
</tr>
<tr>
<td valign="top" align="left">BiLSTM</td>
<td valign="top" align="left">Bidirectional Long Short-Term Memory, a recurrent neural network architecture</td>
</tr>
<tr>
<td valign="top" align="left">SMOTE</td>
<td valign="top" align="left">Synthetic Minority Over-sampling Technique, used for data augmentation</td>
</tr>
<tr>
<td valign="top" align="left">PPG</td>
<td valign="top" align="left">Photoplethysmography, a method of measuring heart rate via light absorption</td>
</tr></tbody>
</table>
</table-wrap>
</sec></sec>
<sec id="s2">
<label>2</label>
<title>Data collection</title>
<sec>
<label>2.1</label>
<title>Participants and recruitment</title>
<p>We recruited community-dwelling adults aged &#x02265;60 years through a multi-pronged outreach strategy, which included flyers at senior centers, announcements at local health clinics, and targeted invitations via online forums and email lists. Prospective participants accessed a secure Google Form link where they provided informed consent (IRB&#x00023;2025-065) before enrollment. Of the 714 individuals approached, 132 (18.5%) initiated the survey; 92 (70% of initiators) completed daily reporting for the entire study duration (45 &#x000B1; 10 days, mean &#x000B1; SD). Dropout reasons (<italic>n</italic> = 40) were categorized as technical difficulties (30%), loss to follow-up (45%), and withdrawal of consent (25%). Participant demographics included a mean age of 67.8 &#x000B1; 5.1 years, 58% female, and a baseline BMI of 26.4 &#x000B1; 3.8 kg/m<sup>2</sup>. This cohort size and adherence rate provided sufficient statistical power (&#x0003E;0.8) to detect moderate effect sizes (Cohen&#x00027;s <italic>d &#x0003D;</italic> 0.5) in health-score changes over time.</p>
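<p>As a quick check of the power claim above, the following sketch recomputes achieved power for <italic>n</italic> = 92 and <italic>d</italic> = 0.5; the <monospace>statsmodels</monospace> package and the one-sample/paired-contrast reading of the design are our assumptions.</p>
<code language="python"><![CDATA[
# Hedged sketch: post-hoc power for n = 92, Cohen's d = 0.5, alpha = 0.05,
# modeled as a one-sample/paired t-test (an assumption, not stated in text).
from statsmodels.stats.power import TTestPower

power = TTestPower().power(effect_size=0.5, nobs=92, alpha=0.05)
print(f"achieved power: {power:.3f}")  # comfortably above the 0.8 target
]]></code>
</sec>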
<sec>
<label>2.2</label>
<title>Instrumentation and measures</title>
<p>Data were captured via two complementary modalities:</p>
<list list-type="simple">
<list-item><p><bold>(a) Online Survey (Google Form):</bold></p></list-item></list>
<list list-type="bullet">
<list-item><p><italic>PANAS Positive Affect:</italic> Ten items rated on a 5-point Likert scale (1 = &#x0201C;very slightly&#x0201D; to 5 = &#x0201C;extremely&#x0201D;), validated for older populations (<xref ref-type="bibr" rid="B13">13</xref>).</p></list-item>
<list-item><p><italic>Sleep Quality:</italic> Participants logged bedtime and waketime, and rated perceived restfulness on a 5-point semantic scale.</p></list-item>
<list-item><p><italic>Music Listening Logs:</italic> For each listening session, participants reported track title, artist, start/end timestamps, and subjective enjoyment (1&#x02013;5).</p></list-item>
</list>
<list list-type="simple">
<list-item><p><bold>(b) Wearable Device:</bold></p></list-item></list>
<list list-type="bullet">
<list-item><p><italic>Model:</italic> Empatica E4 wristband (64 Hz PPG, validated against ECG for HRV metrics (<xref ref-type="bibr" rid="B6">6</xref>)).</p></list-item>
<list-item><p><italic>Physiological Metrics:</italic> Resting heart rate (RHR), heart rate variability (HRV) indices (RMSSD, SDNN), step count, and sedentary bout frequency.</p></list-item>
<list-item><p><italic>Sleep Metrics:</italic> Actigraphy-derived measures including total sleep time, sleep efficiency, and wake after sleep onset (WASO).</p></list-item>
</list></sec>
<sec>
<label>2.3</label>
<title>Feature specification</title>
<p>We engineered a comprehensive set of daily features spanning demographics, music intervention characteristics, physiological signals, and psychological outcomes. <xref ref-type="table" rid="T2">Table 2</xref> details each feature, its source, and collection frequency, showing that our dataset balances self-reported and sensor-derived measures to capture multidimensional aspects of participant health and behavior.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Dataset features, arranged by category and predictive importance.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Category</bold></th>
<th valign="top" align="left"><bold>Feature</bold></th>
<th valign="top" align="left"><bold>Description</bold></th>
<th valign="top" align="left"><bold>Type</bold></th>
<th valign="top" align="left"><bold>Source</bold></th>
<th valign="top" align="left"><bold>Frequency</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Psychological</td>
<td valign="top" align="left">PANAS positive affect</td>
<td valign="top" align="left">Sum score of positive-affect items (10&#x02014;50)</td>
<td valign="top" align="left">Integer</td>
<td valign="top" align="left">Survey</td>
<td valign="top" align="left">Daily</td>
</tr>
<tr>
<td valign="top" align="left">Music intervention</td>
<td valign="top" align="left">Listening Duration</td>
<td valign="top" align="left">Total minutes of music listened per day</td>
<td valign="top" align="left">Float (min)</td>
<td valign="top" align="left">Survey / logs</td>
<td valign="top" align="left">Daily</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">Average Tempo (BPM)</td>
<td valign="top" align="left">Mean beats per minute of tracks (MIR)</td>
<td valign="top" align="left">Float</td>
<td valign="top" align="left">API / MIR</td>
<td valign="top" align="left">Daily</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">Valence &#x00026; Arousal</td>
<td valign="top" align="left">Emotional ratings per track (1&#x02013;9 scale)</td>
<td valign="top" align="left">Float</td>
<td valign="top" align="left">API</td>
<td valign="top" align="left">Daily</td>
</tr>
<tr>
<td valign="top" align="left">Physiological</td>
<td valign="top" align="left">Resting heart rate</td>
<td valign="top" align="left">Lowest 5-min avg HR during waking hours</td>
<td valign="top" align="left">Float (bpm)</td>
<td valign="top" align="left">Empatica E4</td>
<td valign="top" align="left">Daily summary</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">HRV (RMSSD, SDNN)</td>
<td valign="top" align="left">Time-domain HRV metrics (ms)</td>
<td valign="top" align="left">Float (ms)</td>
<td valign="top" align="left">Empatica E4</td>
<td valign="top" align="left">Daily summary</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">Sleep Efficiency</td>
<td valign="top" align="left">% time in bed spent asleep</td>
<td valign="top" align="left">Float (%)</td>
<td valign="top" align="left">Actigraphy</td>
<td valign="top" align="left">Nightly summary</td>
</tr>
<tr>
<td valign="top" align="left">Baseline demographics</td>
<td valign="top" align="left">Age</td>
<td valign="top" align="left">Participant age in years</td>
<td valign="top" align="left">Integer</td>
<td valign="top" align="left">Survey</td>
<td valign="top" align="left">Baseline</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">Gender</td>
<td valign="top" align="left">Self-reported gender identity</td>
<td valign="top" align="left">Categorical</td>
<td valign="top" align="left">Survey</td>
<td valign="top" align="left">Baseline</td>
</tr>
 <tr>
<td/>
<td valign="top" align="left">BMI</td>
<td valign="top" align="left">Calculated from self-reported height/weight</td>
<td valign="top" align="left">Float</td>
<td valign="top" align="left">Survey</td>
<td valign="top" align="left">Baseline</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">Comorbidities</td>
<td valign="top" align="left">Hypertension, diabetes, etc. (yes/no)</td>
<td valign="top" align="left">Categorical</td>
<td valign="top" align="left">Survey</td>
<td valign="top" align="left">Baseline</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>2.4</label>
<title>Data refinement and pre-processing</title>
<p>To ensure data integrity and analytic validity, we applied the following refinement steps (a code sketch illustrating steps (c) and (d) appears after the list):</p>
<list list-type="simple">
<list-item><p><bold>(a) Outlier Detection:</bold> Data points outside physiologically plausible ranges (e.g., RHR &#x0003C; 30 bpm or &#x0003E; 120 bpm; HRV &#x0003E; 200 ms; sleep efficiency &#x0003E;100%) were flagged and removed, accounting for &#x0003C; 0.5% of total records.</p></list-item>
<list-item><p><bold>(b) Missing Data Handling:</bold> Days with &#x02264; 10% missing values were imputed via linear interpolation over time. Participants with &#x0003E;10% missing days (<italic>n</italic> = 8) were excluded to minimize bias.</p></list-item>
<list-item><p><bold>(c) Feature Engineering:</bold></p></list-item></list>
<list list-type="bullet">
<list-item><p>&#x00394;HRV: Day-to-day change in RMSSD.</p></list-item>
<list-item><p>Listening Intensity: BPM-weighted listening duration (min &#x000D7; BPM).</p></list-item>
<list-item><p>Sleep Fragmentation: WASO divided by total sleep time.</p></list-item>
</list>
<list list-type="simple">
<list-item><p><bold>(d) Normalization:</bold> All continuous features were standardized (zero mean, unit variance) across participants to facilitate model convergence and interpretability.</p></list-item>
</list>
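<p>A minimal sketch of steps (c) and (d) follows; it assumes a <monospace>pandas</monospace> DataFrame with one row per participant-day, and the column names are illustrative rather than the study&#x00027;s actual schema.</p>
<code language="python"><![CDATA[
# Hedged sketch of feature engineering (c) and normalization (d).
# Column names (participant_id, rmssd, listening_min, ...) are assumptions.
import pandas as pd

def engineer_features(df):
    """df: one row per participant-day, sorted by (participant_id, date)."""
    out = df.copy()
    # Delta-HRV: day-to-day change in RMSSD within each participant.
    out["delta_hrv"] = out.groupby("participant_id")["rmssd"].diff()
    # Listening intensity: BPM-weighted listening duration (min x BPM).
    out["listening_intensity"] = out["listening_min"] * out["avg_tempo_bpm"]
    # Sleep fragmentation: WASO divided by total sleep time.
    out["sleep_fragmentation"] = out["waso_min"] / out["total_sleep_min"]
    return out

def standardize(df, cols):
    """Zero-mean, unit-variance scaling of continuous features."""
    out = df.copy()
    out[cols] = (out[cols] - out[cols].mean()) / out[cols].std(ddof=0)
    return out
]]></code>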
<p>This multi-step pipeline yielded a clean, analysis-ready dataset, with balanced representation across key variables.</p></sec>
<sec>
<label>2.5</label>
<title>Data augmentation approaches</title>
<p>To augment the limited time-series data from 92 participants, we applied four augmentation techniques. <xref ref-type="table" rid="T3">Table 3</xref> summarizes these approaches. It shows how each method modifies feature distributions to enhance model generalizability.</p>
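<p>For concreteness, the sketch below implements the first three augmentations with the parameters listed in <xref ref-type="table" rid="T3">Table 3</xref>, assuming each sample is a one-dimensional NumPy array; SMOTE itself is available off the shelf (e.g., <monospace>imblearn.over_sampling.SMOTE</monospace> with <italic>k</italic> = 5 neighbors).</p>
<code language="python"><![CDATA[
# Hedged sketch of the Table 3 augmentations; function names are ours.
import numpy as np

rng = np.random.default_rng(0)

def jitter(x, sigma_frac=0.02):
    """Gaussian noise with sigma = 2% of the feature's observed range."""
    sigma = sigma_frac * (x.max() - x.min())
    return x + rng.normal(0.0, sigma, size=x.shape)

def time_warp(x, low=0.9, high=1.1):
    """Stretch/compress a series by a factor in [0.9, 1.1], then
    resample back to the original length."""
    factor = rng.uniform(low, high)
    warped_len = max(2, int(round(len(x) * factor)))
    warped_t = np.linspace(0, len(x) - 1, warped_len)
    warped = np.interp(warped_t, np.arange(len(x)), x)
    return np.interp(np.linspace(0, warped_len - 1, len(x)),
                     np.arange(warped_len), warped)

def magnitude_scale(x, low=0.9, high=1.1):
    """Scale an entire daily feature vector by a factor in [0.9, 1.1]."""
    return x * rng.uniform(low, high)
]]></code>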
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Overview of augmentation techniques.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Technique</bold></th>
<th valign="top" align="left"><bold>Description</bold></th>
<th valign="top" align="left"><bold>Parameters</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Jittering</td>
<td valign="top" align="left">Inject Gaussian noise into continuous features to simulate sensor variability</td>
<td valign="top" align="left">&#x003C3;<italic> &#x0003D;</italic> 2% of feature range</td>
</tr>
<tr>
<td valign="top" align="left">Time warping</td>
<td valign="top" align="left">Randomly stretch/compress listening-duration series to mimic temporal variability</td>
<td valign="top" align="left">Stretch factor &#x02208;[0.9, 1.1]</td>
</tr>
<tr>
<td valign="top" align="left">Magnitude scaling</td>
<td valign="top" align="left">Scale entire daily feature vectors to model physiological fluctuations</td>
<td valign="top" align="left">Scale factor &#x02208;[0.9, 1.1]</td>
</tr>
<tr>
<td valign="top" align="left">SMOTE</td>
<td valign="top" align="left">Generate synthetic samples in composite health-score space to balance distribution tails</td>
<td valign="top" align="left"><italic>k &#x0003D;</italic> 5 nearest neighbors</td>
</tr></tbody>
</table>
</table-wrap></sec>
<sec>
<label>2.6</label>
<title>Health score metric</title>
<p>The primary target variable of our predictive models is a day-level composite <italic>Health Score</italic>, integrating three key dimensions of wellbeing for each participant on day <italic>i</italic>:</p>
<disp-formula id="E1"><mml:math id="M1"><mml:mtable columnalign="right"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">PANAS positive-affect sum (10&#x02013;50)</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Sleep efficiency (%)</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">Resting HRV (RMSSD, ms)</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<sec>
<label>2.6.1</label>
<title>Normalization</title>
<p>Each component is standardized to zero mean and unit variance across the cohort:</p>
<disp-formula id="E2"><mml:math id="M2"><mml:mrow><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where &#x003BC; and &#x003C3; denote the overall mean and standard deviation of each measure.</p></sec>
<sec>
<label>2.6.2</label>
<title>Composite score calculation</title>
<list list-type="simple">
<list-item><p>(a) Equal-weight Sum:</p></list-item></list>
<disp-formula id="E3"><mml:math id="M3"><mml:mrow><mml:mtext>Healt</mml:mtext><mml:msub><mml:mrow><mml:mtext>h</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<list list-type="simple">
<list-item><p>(b) PCA-derived Score: Perform principal component analysis on the matrix [<italic>Z</italic><sub><italic>M</italic></sub>, <italic>Z</italic><sub><italic>S</italic></sub>, <italic>Z</italic><sub><italic>H</italic></sub>] across all days and participants, then set</p></list-item></list>
<disp-formula id="E4"><mml:math id="M4"><mml:mrow><mml:mtext>Healt</mml:mtext><mml:msub><mml:mrow><mml:mtext>h</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext class="textrm" mathvariant="normal">PC</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>Z</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>using the first principal component as a data-driven weighting.</p></sec>
<sec>
<label>2.6.3</label>
<title>Weight selection and reliability</title>
<list list-type="simple">
<list-item><p>(a) For the equal-weight method, we assessed internal consistency via Cronbach&#x00027;s &#x003B1;, targeting &#x003B1;&#x0003E;0.7.</p></list-item>
<list-item><p>(b) For PCA, we confirmed that the first component explained at least 60% of the total variance in [<italic>Z</italic><sub><italic>M</italic></sub>, <italic>Z</italic><sub><italic>S</italic></sub>, <italic>Z</italic><sub><italic>H</italic></sub>].</p></list-item>
</list></sec>
<sec>
<label>2.6.4</label>
<title>Validation and interpretation</title>
<list list-type="simple">
<list-item><p>(a) Distributional Check: Shapiro&#x02013;Wilk tests indicated approximate normality (<italic>p</italic> &#x0003E; 0.05), with skewness and kurtosis within &#x000B1;1.</p></list-item>
<list-item><p>(b) Responsiveness: Day-to-day Health Score changes correlated strongly with self-reported global health ratings (Spearman&#x00027;s &#x003C1;&#x0003E;0.6, <italic>p</italic> &#x0003C; 0.001).</p></list-item>
<list-item><p>(c) Optional Binary Labeling: A &#x0201C;Good&#x0201D; vs. &#x0201C;Not-Good&#x0201D; health classification uses the 75th percentile threshold of the continuous score, validated against clinical interviews (82% agreement).</p></list-item>
</list>
<p>This continuous Health Score serves as the regression target in all predictive modeling. The relationships among PANAS, sleep efficiency, HRV, and the composite Health Score are further illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>, which presents an annotated correlation heatmap highlighting the strongest associations among these variables. As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, the mean Health Score increases monotonically across PANAS positive-affect quartiles, indicating a clear association between emotional state and overall health status. The distribution of Health Scores across sleep efficiency quartiles is depicted in <xref ref-type="fig" rid="F3">Figure 3</xref>, showing progressively higher medians and reduced variability with improved sleep efficiency.</p>
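<p>To make the construction above concrete, the following sketch (function and variable names are ours) computes both Health Score variants from cohort-level arrays:</p>
<code language="python"><![CDATA[
# Hedged sketch of the normalization, equal-weight, and PCA-derived scores.
# panas, sleep_eff, rmssd: 1-D arrays over all participant-days.
import numpy as np
from sklearn.decomposition import PCA

def zscore(x):
    return (x - x.mean()) / x.std(ddof=0)

def health_scores(panas, sleep_eff, rmssd):
    Z = np.column_stack([zscore(panas), zscore(sleep_eff), zscore(rmssd)])
    equal_weight = Z.mean(axis=1)              # (Z_M + Z_S + Z_H) / 3
    pca = PCA(n_components=1).fit(Z)
    pc1 = pca.transform(Z).ravel()             # data-driven weighting
    var_explained = pca.explained_variance_ratio_[0]  # target: >= 0.60
    return equal_weight, pc1, var_explained
]]></code>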
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Figure shows the annotated correlation heatmap among PANAS, Sleep Efficiency, HRV, and the composite Health Score.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0001.tif">
<alt-text content-type="machine-generated">Correlation heatmap showing relationships among PANAS, SleepEff, HRV, and HealthScore. Strongest positive correlations are between HealthScore and SleepEff (0.68) and HealthScore and HRV (0.61). Color intensity increases with correlation strength.</alt-text>
</graphic>
</fig>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Figure shows the mean Health Score grouped by PANAS positive-affect quartiles.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0002.tif">
<alt-text content-type="machine-generated">Bar graph illustrating mean health scores by PANAS quartile. Q1 shows a negative score, Q2 near zero, Q3 slightly positive, and Q4 has the highest positive score.</alt-text>
</graphic>
</fig>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Figure shows the distribution of health score across sleep efficiency quartiles.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0003.tif">
<alt-text content-type="machine-generated">Box plot titled &#x0201C;Health Score by Sleep Efficiency Quartile&#x0201D; showing health scores across four sleep efficiency quartiles (Q1 to Q4). Health scores range from negative 1.5 to positive 1.5. Each quartile's median and interquartile range are depicted, with increasing medians from Q1 to Q4.</alt-text>
</graphic>
</fig></sec></sec>
<sec>
<label>2.7</label>
<title>Additional details on instrumentation and data collection</title>
<p>To improve the reproducibility of our methods, we provide further details on the instrumentation and procedures used in the study. All physiological data were collected using the Empatica E4 wristband, a medically validated wearable device equipped with a 64 Hz photoplethysmography (PPG) sensor. This sensor captures continuous heart-rate data and enables the extraction of heart rate variability (HRV) metrics such as RMSSD and SDNN. The E4 also records movement via a 3-axis accelerometer, which was used to compute sleep-related parameters including total sleep time and sleep efficiency. Participants were instructed to wear the device on their non-dominant wrist during waking hours, removing it only for charging or bathing. Before data collection began, each participant received a brief orientation on proper usage of the device and how to complete the daily online survey. The survey included self-reported measures of affect (PANAS), music listening logs (track title, listening time, enjoyment rating), and perceived sleep quality. Physiological data were preprocessed using Empatica&#x00027;s SDK to ensure consistency and accuracy. Raw signals were cleaned, and outliers (e.g., implausible heart-rate values) were removed prior to feature extraction. This multimodal framework allowed us to integrate objective minute-level signals with self-reported behavioral data, providing a comprehensive view of each participant&#x00027;s daily health status.</p></sec></sec>
<sec id="s3">
<label>3</label>
<title>Fusion-attentive temporal network (FAT-Net)</title>
<p>To capture both rapid physiological fluctuations and cumulative summary trends, FAT-Net integrates minute-level time-series encoding with day-level feature embeddings. <xref ref-type="fig" rid="F4">Figure 4</xref> illustrates the overall architecture and data flow of the proposed model. The temporal stream applies a stacked Conv1D front-end followed by a BiLSTM to model local and long-range heart-rate dynamics, while the summary stream encodes day-level behavioral features using a lightweight multilayer perceptron. Self-attention pooling is employed to emphasize salient temporal segments, and cross-modal attention enables bidirectional interaction between temporal and summary representations. The fused representation is finally passed through a regression head to predict the next-day Health Score.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Diagrammatic flow of FAT-Net: (a) TS Stream: minute-level Conv1D stack &#x02192; BiLSTM; (b) DS Stream: MLP summary encoder; (c) Self-Attention Pooling: MHAttn on TS features; (d) Cross-Modal Fusion: cross multi-head attention combining TS &#x00026; DS embeddings; (e) Prediction Head: FC layers for next-day Health Score.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0004.tif">
<alt-text content-type="machine-generated">Diagram of a dual-stream neural network architecture. The TS stream processes inputs through a Conv1D stack, producing outputs. The DS stream utilizes an MLP. Both streams undergo self-attention pooling and cross-modal fusion using cross multi-head attention. This feeds into a cross-modal fusion layer, followed by a prediction head, generating outputs V(i-1) and (V(i). Arrows indicate data flow between components.</alt-text>
</graphic>
</fig>
<sec>
<label>3.1</label>
<title>Model formulation</title>
<p>Let each participant-day <italic>i</italic> be represented by:</p>
<disp-formula id="E5"><mml:math id="M5"><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>DS</mml:mtext></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M6"><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> contains minute-resolution signals (e.g., heart rate, music on/off) of length <italic>T</italic>, and <inline-formula><mml:math id="M7"><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>DS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> comprises aggregated daily summaries (e.g., BPM, sleep efficiency).</p>
<sec>
<label>3.1.1</label>
<title>Temporal encoding</title>
<disp-formula id="E6"><mml:math id="M8"><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>H</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>Conv</mml:mtext><mml:mn>1</mml:mn><mml:msub><mml:mrow><mml:mtext>D</mml:mtext></mml:mrow><mml:mrow><mml:mtext>stack</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>H</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>BiLSTM</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>H</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>h</mml:mi></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>Here, Conv1D<sub>stack</sub> denotes three sequential Conv1D layers (filters: 32 &#x02192; 64 &#x02192; 128; kernel sizes: 5, 3, 3), each followed by LayerNorm, GELU activation, and dropout (0.1). The BiLSTM uses hidden size <italic>h</italic>/2 per direction, yielding a combined dimension <italic>h</italic>.</p>
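<p>A minimal PyTorch sketch of this temporal stream is given below; the layer widths and kernel sizes follow the text, while the module scaffolding and the placement of LayerNorm over the channel dimension are our own assumptions.</p>
<code language="python"><![CDATA[
# Hedged sketch of the temporal stream: Conv1D stack + BiLSTM.
import torch.nn as nn

class ConvBlock(nn.Module):
    def __init__(self, c_in, c_out, k, p_drop=0.1):
        super().__init__()
        self.conv = nn.Conv1d(c_in, c_out, k, padding=k // 2)
        self.norm = nn.LayerNorm(c_out)   # normalized over channels
        self.act = nn.GELU()
        self.drop = nn.Dropout(p_drop)

    def forward(self, x):                 # x: (B, C_in, T)
        x = self.conv(x)                  # (B, C_out, T)
        x = self.norm(x.transpose(1, 2)).transpose(1, 2)
        return self.drop(self.act(x))

class TemporalStream(nn.Module):
    """Conv1D stack (32 -> 64 -> 128, kernels 5/3/3) followed by a
    BiLSTM whose two directions concatenate to width h."""
    def __init__(self, d_ts, h=128):
        super().__init__()
        self.convs = nn.Sequential(ConvBlock(d_ts, 32, 5),
                                   ConvBlock(32, 64, 3),
                                   ConvBlock(64, 128, 3))
        self.bilstm = nn.LSTM(128, h // 2, batch_first=True,
                              bidirectional=True)

    def forward(self, x_ts):                  # x_ts: (B, T, d_ts)
        z = self.convs(x_ts.transpose(1, 2))  # (B, 128, T)
        out, _ = self.bilstm(z.transpose(1, 2))
        return out                            # H_ts: (B, T, h)
]]></code>
</sec></sec>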
<sec>
<label>3.2</label>
<title>Self-attention pooling</title>
<p>Multi-head self-attention highlights salient temporal segments:</p>
<disp-formula id="E7"><mml:math id="M9"><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>U</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>MHAttn</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>H</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>v</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>U</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>This mechanism allows the model to focus on critical heart-rate fluctuations during or after music sessions.</p>
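<p>A corresponding sketch is shown below; the head count and module wiring are our assumptions, while the attention-then-mean-pooling structure follows the equation above.</p>
<code language="python"><![CDATA[
# Hedged sketch of self-attention pooling over the BiLSTM features.
import torch.nn as nn

class AttnPool(nn.Module):
    """Multi-head self-attention over H_ts, then a uniform average
    over the time axis."""
    def __init__(self, h=128, n_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(h, n_heads, batch_first=True)

    def forward(self, H_ts):                  # H_ts: (B, T, h)
        U, _ = self.attn(H_ts, H_ts, H_ts)    # U: (B, T, h)
        return U.mean(dim=1)                  # v_ts: (B, h)
]]></code>
</sec>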
<sec>
<label>3.3</label>
<title>Summary feature encoder</title>
<p>To bring in high-level behavioral context, we encode daily summaries via a lightweight MLP:</p>
<disp-formula id="E8"><mml:math id="M10"><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>v</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>DS</mml:mtext></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mtext>ML</mml:mtext><mml:msub><mml:mrow><mml:mtext>P</mml:mtext></mml:mrow><mml:mrow><mml:mtext>DS</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>DS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<list list-type="simple">
<list-item><p>(a) The MLP consists of three fully connected layers (dimensions: <italic>d</italic><sub><italic>ds</italic></sub> &#x02192; 64 &#x02192; 128 &#x02192; <italic>h</italic>), each followed by BatchNorm, ReLU, and dropout (0.2); a sketch follows this list.</p></list-item>
<list-item><p>(b) This embedding captures aggregate effects such as total listening duration and sleep efficiency.</p></list-item>
</list>
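<p>A minimal sketch of this encoder, taking only the stated dimensions and regularization from the text:</p>
<code language="python"><![CDATA[
# Hedged sketch of the summary-stream MLP (d_ds -> 64 -> 128 -> h),
# each layer followed by BatchNorm, ReLU, and dropout(0.2).
import torch.nn as nn

def make_summary_mlp(d_ds, h=128, p_drop=0.2):
    dims = [d_ds, 64, 128, h]
    layers = []
    for c_in, c_out in zip(dims[:-1], dims[1:]):
        layers += [nn.Linear(c_in, c_out), nn.BatchNorm1d(c_out),
                   nn.ReLU(), nn.Dropout(p_drop)]
    return nn.Sequential(*layers)
]]></code>
</sec>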
<sec>
<label>3.4</label>
<title>Cross-modal fusion</title>
<p>Fusing modalities via attention enables bidirectional contextualization:</p>
<disp-formula id="E9"><mml:math id="M11"><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>C</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>v</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>TS</mml:mtext></mml:mrow></mml:msubsup><mml:mo>;</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>v</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>DS</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>h</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>CrossMHAttn</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>C</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>h</mml:mi></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<list list-type="simple">
<list-item><p>(a) CrossMHAttn uses separate query/key/value projections for TS &#x02192; DS and DS &#x02192; TS, emphasizing how summary features amplify temporal signals and vice versa.</p></list-item>
<list-item><p>(b) A two-layer feed-forward network (512 &#x02192; 512 &#x02192; 2<italic>h</italic>) applied after attention refines the fused representation (a sketch follows this list).</p></list-item>
</list>
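<p>The sketch below is one plausible reading of this fusion step: each embedding is treated as a length-1 sequence, the two cross-attentions run in opposite directions, we read the feed-forward dimensions as 2<italic>h</italic> &#x02192; 512 &#x02192; 2<italic>h</italic>, and the regression head described in Section 3.5 is appended for completeness.</p>
<code language="python"><![CDATA[
# Hedged sketch of cross-modal fusion plus the Section 3.5 regression head.
# Head count, the length-1-sequence trick, and the FFN reading
# (2h -> 512 -> 2h) are our assumptions.
import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    def __init__(self, h=128, n_heads=4):
        super().__init__()
        self.ts2ds = nn.MultiheadAttention(h, n_heads, batch_first=True)
        self.ds2ts = nn.MultiheadAttention(h, n_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(2 * h, 512), nn.ReLU(),
                                 nn.Linear(512, 2 * h))
        self.head = nn.Sequential(nn.Linear(2 * h, 256), nn.ReLU(),
                                  nn.Dropout(0.2), nn.Linear(256, 1))

    def forward(self, v_ts, v_ds):            # each: (B, h)
        q_ts, q_ds = v_ts.unsqueeze(1), v_ds.unsqueeze(1)  # (B, 1, h)
        a, _ = self.ts2ds(q_ts, q_ds, q_ds)   # TS queries attend to DS
        b, _ = self.ds2ts(q_ds, q_ts, q_ts)   # DS queries attend to TS
        f = torch.cat([a, b], dim=-1).squeeze(1)           # (B, 2h)
        f = self.ffn(f)                       # post-attention refinement
        return self.head(f).squeeze(-1)       # next-day Health Score
]]></code>
</sec>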
<sec>
<label>3.5</label>
<title>Prediction head</title>
<p>The final fused embedding <bold>F</bold><sub><italic>i</italic></sub> feeds into a regression head:</p>
<list list-type="simple">
<list-item><p>(a) Two fully connected layers (2<italic>h</italic> &#x02192; 256, then 256 &#x02192; 1), each with ReLU and dropout(0.2).</p></list-item>
<list-item><p>(b) Outputs &#x00177;<sub><italic>i</italic>&#x0002B;1</sub>, the predicted next-day Health Score.</p></list-item>
</list></sec>
<sec>
<label>3.6</label>
<title>Training objective</title>
<p>Model parameters &#x003B8; are optimized by minimizing:</p>
<disp-formula id="E10"><mml:math id="M12"><mml:mrow><mml:mi mathvariant="script">L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x003BB;</mml:mtext><mml:mo>||</mml:mo><mml:mi>&#x003B8;</mml:mi><mml:msubsup><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <italic>y</italic><sub><italic>i</italic>&#x0002B;1</sub> is the true Health Score, and &#x003BB; (set to 1 &#x000D7; 10<sup>&#x02212;5</sup>) controls weight decay. We train using AdamW (lr = 3e-4, batch size = 16) with early stopping on validation MAE.</p>
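<p>A compact training-loop sketch consistent with this objective is given below; the model and data loaders are assumed to exist, and the decoupled weight decay in AdamW serves as the counterpart of the &#x003BB; penalty.</p>
<code language="python"><![CDATA[
# Hedged sketch: MSE objective, AdamW (lr = 3e-4, weight_decay = 1e-5),
# early stopping on validation MAE. `model(x_ts, x_ds)` is assumed to
# return the predicted next-day Health Score for a batch.
import torch

def train(model, train_loader, val_loader, max_epochs=100, patience=10):
    opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-5)
    mse = torch.nn.MSELoss()
    best_mae, best_state, bad_epochs = float("inf"), None, 0
    for epoch in range(max_epochs):
        model.train()
        for x_ts, x_ds, y in train_loader:
            opt.zero_grad()
            loss = mse(model(x_ts, x_ds), y)
            loss.backward()
            opt.step()
        model.eval()
        with torch.no_grad():                 # validation MAE
            mae = sum(float((model(x_ts, x_ds) - y).abs().sum())
                      for x_ts, x_ds, y in val_loader)
            mae /= len(val_loader.dataset)
        if mae < best_mae:
            best_mae, bad_epochs = mae, 0
            best_state = {k: v.clone() for k, v in model.state_dict().items()}
        else:
            bad_epochs += 1
            if bad_epochs >= patience:        # early stopping
                break
    model.load_state_dict(best_state)
    return model
]]></code>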
<p>FAT-Net is built to address the complex, multimodal nature of predicting health status in older adults. It processes two complementary data streams: minute-level physiological signals such as heart rate and music on/off states, and daily-level summary features such as sleep efficiency and average music tempo. These are passed through separate encoders and later fused via cross-modal attention, enabling the model to learn both intra- and inter-modal interactions relevant to next-day health prediction. The time-series stream captures short-term temporal patterns using stacked convolutional layers and BiLSTM units, allowing the model to identify changes in physiological signals during or after music listening, while the summary stream captures behavioral context from features such as PANAS scores, sleep metrics, and music characteristics using a lightweight MLP. The fusion layer integrates both streams through bidirectional attention, followed by a regression head that outputs the predicted health score.</p>
<p>We evaluate FAT-Net&#x00027;s performance using standard regression metrics: Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), and the coefficient of determination (<italic>R</italic><sup>2</sup>). These metrics assess the accuracy and consistency of the model&#x00027;s predictions. We compare FAT-Net to a set of well-established baseline models: Random Forest and XGBoost represent strong classical methods suited for tabular features, while LSTM and TCN provide competitive deep learning alternatives for sequence data. Our experiments demonstrate that FAT-Net not only achieves lower error rates but also provides interpretable insights via attention mechanisms, linking music behavior to health outcomes.</p></sec></sec>
<sec id="s4">
<label>4</label>
<title>Performance analysis</title>
<sec>
<label>4.1</label>
<title>Experimental setup</title>
<p>All models were implemented in Python 3.8 using PyTorch 1.12 for deep networks and scikit-learn 1.1 for tree-based methods, running on an NVIDIA RTX 3090 GPU (24 GB VRAM), an Intel Core i9-11900K CPU, and 64 GB RAM. The dataset was split into 80% training, 10% validation, and 10% test sets. We trained for up to 100 epochs with early stopping (patience = 10) for LSTM (<xref ref-type="bibr" rid="B10">10</xref>), TCN (<xref ref-type="bibr" rid="B11">11</xref>), and FAT-Net; the batch size was 16, and the optimizer was AdamW with learning rate 3 &#x000D7; 10<sup>&#x02212;4</sup> and weight decay 1 &#x000D7; 10<sup>&#x02212;5</sup>. XGBoost (<xref ref-type="bibr" rid="B9">9</xref>) used 100 trees, max depth 6, and learning rate 0.1; Random Forest (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>) used 100 estimators and max depth 10.</p>
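<p>For reproducibility, the baseline configurations stated above translate directly into library calls (scikit-learn and the <monospace>xgboost</monospace> package; the fixed random seeds are our addition):</p>
<code language="python"><![CDATA[
# Baseline configurations as stated in the text; all other
# hyperparameters are library defaults.
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
xgb = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1,
                   random_state=0)
# Usage: rf.fit(X_train, y_train); xgb.fit(X_train, y_train)
]]></code>
</sec>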
<sec>
<label>4.2</label>
<title>Comparative analysis</title>
<p><xref ref-type="fig" rid="F5">Figure 5</xref> shows that FAT-Net achieves the lowest RMSE across 10 independent runs, significantly outperforming Random Forest (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>), XGBoost (<xref ref-type="bibr" rid="B9">9</xref>), LSTM (<xref ref-type="bibr" rid="B10">10</xref>), and TCN (<xref ref-type="bibr" rid="B11">11</xref>). <xref ref-type="fig" rid="F6">Figure 6</xref> illustrates tighter MAE distributions for FAT-Net, indicating more consistent prediction accuracy. <xref ref-type="fig" rid="F7">Figure 7</xref> demonstrates that FAT-Net explains more variance (higher <italic>R</italic><sup>2</sup>) in next-day Health Score than all baselines. <xref ref-type="fig" rid="F8">Figure 8</xref> highlights average percentage improvements of FAT-Net over each baseline across RMSE, MAE, and <italic>R</italic><sup>2</sup>, with the greatest gains against Random Forest. Finally, <xref ref-type="fig" rid="F9">Figure 9</xref> presents a strong alignment between predicted and actual Health Scores (Pearson&#x00027;s <italic>r &#x0003D;</italic> 0.93), confirming FAT-Net&#x00027;s calibration and generalizability.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Notched boxplots of the RMSE distribution across the five models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0005.tif">
<alt-text content-type="machine-generated">Box plot comparing Mean Absolute Error (MAE) of five models: Random Forest, XGBoost, LSTM, TCN, and FAT-Net. Random Forest shows the highest MAE and FAT-Net the lowest.</alt-text>
</graphic>
</fig>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Notched boxplots of the MAE distribution across the five models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0006.tif">
<alt-text content-type="machine-generated">Box plot comparing RMSE values for five models: Random Forest, XGBoost, LSTM, TCN, and FAT-Net. Random Forest has the highest error around 0.46, while FAT-Net shows the lowest around 0.35.</alt-text>
</graphic>
</fig>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Notched boxplots of the <italic>R</italic><sup>2</sup> distribution across the five models.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0007.tif">
<alt-text content-type="machine-generated">Box plot comparing R2 values for different models: Random Forest, XGBoost, LSTM, TCN, and FAT-Net. R2 values range from approximately 0.725 to 0.875, with FAT-Net showing the highest performance.</alt-text>
</graphic>
</fig>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Average percentage improvement of FAT-Net over each baseline for RMSE, MAE, and <italic>R</italic><sup>2</sup>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0008.tif">
<alt-text content-type="machine-generated">Heatmap showing percentage improvement across four models: LSTM, Random Forest, TCN, and XGBoost, on RMSE, MAE, and R2 metrics. Random Forest has the highest improvements, with 19.5% for MAE and 17.3% for R2. A color scale indicates improvement percentage from eight to twenty-two percent.</alt-text>
</graphic>
</fig>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Scatter plot of FAT-Net&#x00027;s predicted vs. actual health scores on the test set.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0009.tif">
<alt-text content-type="machine-generated">Scatter plot comparing actual health scores to predicted health scores. Yellow data points cluster closely along the diagonal line of best fit, ranging from negative three to four on both axes, indicating strong correlation.</alt-text>
</graphic>
</fig>
</sec></sec>
<sec id="s5">
<label>5</label>
<title>Attention visualization</title>
<p><xref ref-type="fig" rid="F10">Figure 10</xref> shows the attention each query time step gives to all key time steps. Darker cells correspond to low attention scores and brighter cells to high scores. Peaks often align with heart-rate spikes during music sessions. These patterns reveal which temporal segments the model finds most informative. Such insights help validate that the model focuses on meaningful physiological events. Overall, this map enhances interpretability and supports trust in our predictions.</p>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>Self-attention heatmap visualizing query-to-key attention weights over time.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0010.tif">
<alt-text content-type="machine-generated">Heatmap displaying attention weights with Query Time Steps on the vertical axis and Key Time Steps on the horizontal axis. Color intensity varies from dark purple to yellow, indicating weight values from 0 to 0.16.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F11">Figure 11</xref> illustrates which time steps each daily summary feature emphasizes. Rows represent summary feature queries and columns represent minute-level time steps. Brighter cells indicate strong influence of specific time steps on feature embeddings. These patterns identify which features drive predictions at particular times. This visualization clarifies how behavioral summaries and temporal data interact. It thereby deepens our understanding of cross-modal fusion in FAT-Net.</p>
<fig position="float" id="F11">
<label>Figure 11</label>
<caption><p>Cross-modal heatmap showing how summary feature queries attend to each time step.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpubh-13-1633924-g0011.tif">
<alt-text content-type="machine-generated">Heatmap showing attention weights for 10 features over 50 time steps. Darker purple indicates lower weights, while lighter pink to orange indicates higher weights. Weights range from 0.02 to 0.12.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s6">
<label>6</label>
<title>Discussion and future work</title>
<sec>
<label>6.1</label>
<title>Discussion</title>
<p>In this study, we demonstrated that the proposed Fusion-Attentive Temporal Network (FAT-Net) significantly outperforms conventional baselines such as Random Forest, XGBoost, LSTM, and TCN in predicting next-day composite health scores for older adults from minute-level physiological signals and music-listening behavior. The notched boxplots (<xref ref-type="fig" rid="F5">Figures 5</xref>&#x02013;<xref ref-type="fig" rid="F7">7</xref>) and the improvement heatmap (<xref ref-type="fig" rid="F8">Figure 8</xref>) confirm that FAT-Net reduces prediction error by up to 23% and increases explained variance by up to 17%. Our cross-modal attention mechanism enables the model to dynamically weight salient heart-rate fluctuations during music sessions alongside high-level summary features such as sleep efficiency, resulting in more robust and interpretable forecasts. The strong alignment of predicted vs. actual health scores (<xref ref-type="fig" rid="F9">Figure 9</xref>, Pearson&#x00027;s <italic>r</italic> &#x0003D; 0.93) further attests to FAT-Net&#x00027;s calibration and practical utility.</p>
<p>Beyond demonstrating model performance, our findings offer insights into the health effects of music-based interventions for older adults. The attention visualization results reveal that the model consistently attends to moments of elevated heart-rate variability and to specific music characteristics, such as increased tempo or valence, during and after listening sessions. These patterns are consistent with existing literature showing that upbeat or emotionally engaging music can elevate mood, reduce stress, and support autonomic regulation. For example, attention peaks often aligned with post-listening heart-rate stabilization or with periods of high-arousal music, suggesting potential physiological benefits of music exposure. This indicates that FAT-Net does not merely exploit surface-level time-series correlations but identifies semantically meaningful episodes in which music engagement appears to mediate health-related changes. In this sense, the model helps illuminate the dynamic relationship between music behavior and well-being, offering a computational pathway to validate and interpret real-world music therapy effects. By coupling prediction with explainability, FAT-Net thus serves both as a forecasting tool and as a mechanism to investigate the role of music in everyday health regulation. Future iterations of this work may further disentangle causal effects through controlled music intervention studies, but our current results already highlight the practical potential of integrating music behavior into digital health frameworks.</p></sec>
<sec>
<label>6.2</label>
<title>Practical implications</title>
<p>The superior performance of FAT-Net has several real-world implications. First, smartphone or wearable applications incorporating our model can deliver personalized music-therapy recommendations to older adults, adapting in real time to their physiological state and listening habits. Second, healthcare providers and caregivers could leverage daily health-score forecasts to monitor well-being remotely, triggering timely interventions, such as adjusting exercise regimens or recommending relaxation playlists, to prevent declines in mood or sleep quality. Finally, the interpretability afforded by the attention weights allows end-users and clinicians to understand which features (e.g., tempo, valence, HRV dips) most strongly influenced the prediction, fostering trust and facilitating shared decision-making in digital health platforms.</p></sec>
<sec>
<label>6.3</label>
<title>Limitations and future directions</title>
<p>While our augmented dataset (92 participants with 4 &#x000D7; synthetic expansion) enabled thorough model training, the relatively small cohort size and self-selected sample may limit generalizability. Moreover, the reliance on self-reported PANAS scores and Google Form logging introduces potential reporting bias. Our minute-level &#x0201C;music on/off&#x0201D; signal did not account for nuances such as multitasking or background noise, which could affect physiological responses.</p>
<p>Building on these results, future work should (a) validate FAT-Net on larger, more diverse cohorts, spanning different age groups and cultural backgrounds, to assess robustness; (b) integrate additional sensor modalities (e.g., skin conductance, accelerometry) to capture broader physiological and contextual cues; (c) explore online learning schemes that adapt model parameters as new user data arrive, supporting lifelong personalization; (d) implement real-time on-device inference for privacy-preserving mHealth deployments; and (e) conduct randomized controlled trials to measure the clinical efficacy of FAT-Net&#x02013;driven music-therapy interventions in improving long-term health outcomes.</p></sec></sec>
<sec sec-type="conclusions" id="s7">
<label>7</label>
<title>Conclusion</title>
<p>We introduced FAT-Net, a dual-stream model combining Conv1D, BiLSTM, and cross-modal attention to fuse minute-level signals with daily summaries. In experiments on an augmented cohort (<italic>N</italic>&#x02248;368 participant-days), FAT-Net reduced RMSE by 23% and improved <italic>R</italic><sup>2</sup> by 17% compared to leading baselines, while its attention weights highlighted music tempo, valence, and HRV fluctuations as key drivers of prediction. These findings demonstrate that cross-modal attention enhances both prediction accuracy and interpretability, offering a roadmap for data-driven music interventions, and the modular design extends naturally to additional health domains. By capturing temporal&#x02013;behavioral interactions, FAT-Net advances personalized digital therapeutics and supports scalable solutions for healthy aging.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="s9">
<title>Ethics statement</title>
<p>Ethical review and approval were not required for the study on human participants, in accordance with the local legislation and institutional requirements. Written informed consent from the participants (or their legal guardian/next of kin) was not required to participate in this study, in accordance with national legislation and institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s10">
<title>Author contributions</title>
<p>CM: Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. BH: Writing &#x02013; review &#x00026; editing. SC: Writing &#x02013; review &#x00026; editing. XM: Writing &#x02013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s12">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
 <ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Raglio</surname> <given-names>A</given-names></name></person-group>. <article-title>More music, more health! <italic>J Public Health</italic></article-title>. (<year>2021</year>) <volume>43</volume>:<fpage>742</fpage>&#x02013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1093/pubmed/fdaa123</pub-id></mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fu</surname> <given-names>MC</given-names></name> <name><surname>Belza</surname> <given-names>B</given-names></name> <name><surname>Nguyen</surname> <given-names>H</given-names></name> <name><surname>Logsdon</surname> <given-names>R</given-names></name> <name><surname>Demorest</surname> <given-names>S</given-names></name></person-group>. <article-title>Impact of group-singing on older adult health in senior living communities: a pilot study</article-title>. <source>Arch Gerontol Geriatr</source>. (<year>2018</year>) <volume>76</volume>:<fpage>138</fpage>&#x02013;<lpage>46</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.archger.2018.02.012</pub-id><pub-id pub-id-type="pmid">29518671</pub-id></mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Davidoff</surname> <given-names>F</given-names></name></person-group>. <article-title>Music lessons: what musicians can teach doctors (and other health professionals)</article-title>. <source>Ann Intern Med</source>. (<year>2011</year>) <volume>154</volume>:<fpage>426</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.7326/0003-4819-154-6-201103150-00009</pub-id><pub-id pub-id-type="pmid">21403078</pub-id></mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Faulkner</surname> <given-names>S</given-names></name></person-group>. <article-title>Rhythm2Recovery: a model of practice combining rhythmic music with cognitive reflection for social and emotional health within trauma recovery</article-title>. <source>Austral N Zeal J Fam Ther</source>. (<year>2017</year>) <volume>38</volume>:<fpage>627</fpage>&#x02013;<lpage>36</lpage>. doi: <pub-id pub-id-type="doi">10.1002/anzf.1268</pub-id></mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>T</given-names></name> <name><surname>Zhao</surname> <given-names>Y</given-names></name> <name><surname>Yin</surname> <given-names>M</given-names></name></person-group>. <article-title>Analysis and research on the influence of music on students&#x00027; mental health under the background of deep learning</article-title>. <source>Front Psychol</source>. (<year>2022</year>) <volume>13</volume>:<fpage>998451</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2022.998451</pub-id><pub-id pub-id-type="pmid">36312155</pub-id></mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sch&#x000E4;fer</surname> <given-names>A</given-names></name> <name><surname>Vagedes</surname> <given-names>J</given-names></name></person-group>. <article-title>How accurate is pulse rate variability as an estimate of heart rate variability?</article-title>: a review on studies comparing photoplethysmographic technology with an electrocardiogram. <source>Int J Cardiol</source>. (<year>2013</year>) <volume>166</volume>:<fpage>15</fpage>&#x02013;<lpage>29</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijcard.2012.03.119</pub-id></mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Singh</surname> <given-names>D</given-names></name> <name><surname>Kaur</surname> <given-names>M</given-names></name> <name><surname>Kumar</surname> <given-names>V</given-names></name> <name><surname>Jabarulla</surname> <given-names>MY</given-names></name> <name><surname>Lee</surname> <given-names>HN</given-names></name></person-group>. <article-title>Artificial intelligence-based cyber-physical system for severity classification of chikungunya disease</article-title>. <source>IEEE J Transl Eng Health Med</source>. (<year>2022</year>) <volume>10</volume>:<fpage>1</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JTEHM.2022.3171078</pub-id><pub-id pub-id-type="pmid">35769405</pub-id></mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Breiman</surname> <given-names>L</given-names></name></person-group>. <article-title>Random forests</article-title>. <source>Mach Learn</source>. (<year>2001</year>) <volume>45</volume>:<fpage>5</fpage>&#x02013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id></mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T</given-names></name> <name><surname>Guestrin</surname> <given-names>C</given-names></name></person-group>. <article-title>Xgboost: a scalable tree boosting system</article-title>. In: <source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source> (<year>2016</year>). p. <fpage>785</fpage>&#x02013;<lpage>94</lpage>. doi: <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Graves</surname> <given-names>A</given-names></name> <name><surname>Graves</surname> <given-names>A</given-names></name></person-group>. <article-title>Long short-term memory</article-title>. In: <source>Supervised Sequence Labelling With Recurrent Neural Networks</source>. Berlin; Heidelberg: Springer (<year>2012</year>). p. <fpage>37</fpage>&#x02013;<lpage>45</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-642-24797-2_4</pub-id></mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bai</surname> <given-names>S</given-names></name> <name><surname>Kolter</surname> <given-names>JZ</given-names></name> <name><surname>Koltun</surname> <given-names>V</given-names></name></person-group>. <article-title>An empirical evaluation of generic convolutional and recurrent networks for sequence modeling</article-title>. <source>arXiv Preprint arXiv:180301271</source>. (<year>2018</year>). doi: <pub-id pub-id-type="doi">10.48550/arXiv.1803.01271</pub-id></mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname> <given-names>Y</given-names></name> <name><surname>Wu</surname> <given-names>S</given-names></name></person-group>. <article-title>Applying the cloud intelligent classroom to the music curriculum design of the mental health education</article-title>. <source>Front Psychol</source>. (<year>2021</year>) <volume>12</volume>:<fpage>729213</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2021.729213</pub-id><pub-id pub-id-type="pmid">34867609</pub-id></mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Watson</surname> <given-names>D</given-names></name> <name><surname>Clark</surname> <given-names>LA</given-names></name> <name><surname>Tellegen</surname> <given-names>A</given-names></name></person-group>. <article-title>Development and validation of brief measures of positive and negative affect: the PANAS scales</article-title>. <source>J Pers Soc Psychol</source>. (<year>1988</year>) <volume>54</volume>:<fpage>1063</fpage>. doi: <pub-id pub-id-type="doi">10.1037//0022-3514.54.6.1063</pub-id><pub-id pub-id-type="pmid">3397865</pub-id></mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bulaj</surname> <given-names>G</given-names></name> <name><surname>Clark</surname> <given-names>J</given-names></name> <name><surname>Ebrahimi</surname> <given-names>M</given-names></name> <name><surname>Bald</surname> <given-names>E</given-names></name></person-group>. <article-title>From precision metapharmacology to patient empowerment: delivery of self-care practices for epilepsy, pain, depression and cancer using digital health technologies</article-title>. <source>Front Pharmacol</source>. (<year>2021</year>) <volume>12</volume>:<fpage>612602</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fphar.2021.612602</pub-id><pub-id pub-id-type="pmid">33972825</pub-id></mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Groh</surname> <given-names>R</given-names></name> <name><surname>Lei</surname> <given-names>Z</given-names></name> <name><surname>Martignetti</surname> <given-names>L</given-names></name> <name><surname>Li-Jessen</surname> <given-names>NYK</given-names></name> <name><surname>Kist</surname> <given-names>AM</given-names></name></person-group>. <article-title>Efficient and explainable deep neural networks for airway symptom detection in support of wearable health technology</article-title>. <source>Adv Intell Syst</source>. (<year>2022</year>) <volume>4</volume>:<fpage>2100284</fpage>. doi: <pub-id pub-id-type="doi">10.1002/aisy.202100284</pub-id></mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jia</surname> <given-names>Y</given-names></name></person-group>. <article-title>Impact of music teaching on student mental health using IoT, recurrent neural networks, and big data analytics</article-title>. <source>Mobile Netw Applic</source>. (<year>2024</year>) <fpage>16</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s11036-024-02366-0</pub-id></mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Na</surname> <given-names>H</given-names></name></person-group>. <article-title>assessing psychological health and emotional expression of musical education using Q-learning</article-title>. <source>Mobile Netw Applic</source>. (<year>2024</year>). doi: <pub-id pub-id-type="doi">10.1007/s11036-024-02401-0</pub-id></mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>H</given-names></name> <name><surname>Hu</surname> <given-names>J</given-names></name> <name><surname>Rauterberg</surname> <given-names>M</given-names></name></person-group>. <article-title>Bio-feedback Based In-flight Music System Design to Promote Heart Health</article-title>. In:<person-group person-group-type="editor"><name><surname>Mahadevan</surname> <given-names>V</given-names></name> <name><surname>Yu</surname> <given-names>W</given-names></name> <name><surname>Zhou</surname> <given-names>J</given-names></name></person-group>, editors. <source>Proceedings of 2009 International Conference on Machine Learning and Computing (IACSIT ICMLC 2009)</source>. <publisher-loc>Perth, WA</publisher-loc>: <publisher-name>Int Assoc Comp Sci &#x00026; Informat Technol; Singapore Inst Elect</publisher-name> (<year>2009</year>). p. <fpage>446</fpage>&#x02013;<lpage>50</lpage>.</mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Milligan</surname> <given-names>E</given-names></name> <name><surname>Woodley</surname> <given-names>E</given-names></name></person-group>. <article-title>Creative expressive encounters in health ethics education: teaching ethics as relational engagement</article-title>. <source>Teach Learn Med</source>. (<year>2009</year>) <volume>21</volume>:<fpage>131</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1080/10401330902791248</pub-id><pub-id pub-id-type="pmid">19330692</pub-id></mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Carter</surname> <given-names>J</given-names></name> <name><surname>Carey</surname> <given-names>M</given-names></name></person-group>. <article-title>Arts in health: using the arts in undergraduate nursing education to foster critical thinking skills</article-title>. In:<person-group person-group-type="editor"><name><surname>Chova</surname> <given-names>L</given-names></name> <name><surname>Belenguer</surname> <given-names>D</given-names></name> <name><surname>Torres</surname> <given-names>I</given-names></name></person-group>, editors. <source>4th International Technology, Education and Development Conference (INTED 2010).</source> <publisher-loc>Valencia</publisher-loc>: <publisher-name>International Association of Technology, Education and Development (IATED)</publisher-name> (<year>2010</year>). p. <fpage>914</fpage>&#x02013;<lpage>21</lpage>.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/200143/overview">Jie Chen</ext-link>, Hunan Normal University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1288781/overview">Victoria Ramos Gonzalez</ext-link>, Carlos III Health Institute (ISCIII), Spain</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1021549/overview">Alexander Grove Belden</ext-link>, Woodhall School, United States</p>
</fn>
</fn-group>
</back>
 </article>