<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Commun. Netw.</journal-id>
<journal-title>Frontiers in Communications and Networks</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Commun. Netw.</abbrev-journal-title>
<issn pub-type="epub">2673-530X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1658461</article-id>
<article-id pub-id-type="doi">10.3389/frcmn.2025.1658461</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Communications and Networks</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Spatio-temporal beam-level traffic forecasting in 5G wireless systems using multi-task learning</article-title>
<alt-title alt-title-type="left-running-head">Tommy et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frcmn.2025.1658461">10.3389/frcmn.2025.1658461</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Tommy</surname>
<given-names>Israel</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3086476/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Akinola</surname>
<given-names>Taoreed</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Xiangfang</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/595900/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Qian</surname>
<given-names>Lijun</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/594945/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
</contrib-group>
<aff>
<institution>CREDIT Center, Department of Electrical and Computer Engineering, Prairie View A&#x0026;M University, Prairie View</institution>, <addr-line>TX</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1741369/overview">Sanjay Dhar Roy</ext-link>, National Institute of Technology, Durgapur, India</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2808152/overview">Ahmad Bazzi</ext-link>, New York University Abu Dhabi, United Arab Emirates</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3122796/overview">Latifa Guesmi</ext-link>, Universit&#xe9; de Carthage, Tunisia</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Israel Tommy, <email>itommy@pvamu.edu</email>; Taoreed Akinola, <email>takinola2@pvamu.edu</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>24</day>
<month>10</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>6</volume>
<elocation-id>1658461</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Tommy, Akinola, Li and Qian.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Tommy, Akinola, Li and Qian</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Beam-level traffic forecasting plays a vital role in the optimization of 5G networks by enabling proactive resource allocation and congestion control. However, the task is complicated by inherent data sparsity and the presence of multi-scale temporal dynamics, making accurate predictions difficult to achieve using conventional models.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address these challenges, we propose a Gated Recurrent Unit (GRU)-based Multi-Task Learning (MTL) framework, enhanced by a weighted ensemble approach. We systematically evaluate the performance of six forecasting models&#x2014;Linear Regression, DLinear, XGBoost, Echo State Network (ESN), Long Short-Term Memory (LSTM), and GRU-MTL&#x2014;across three input sequence lengths (168-h, 24-h, and 8-h) using real-world beam-level data from the ITU AI for Good initiative.</p>
</sec>
<sec>
<title>Results</title>
<p>Experimental findings reveal that the GRU-MTL model significantly outperforms traditional baselines, achieving a Mean Absolute Error (MAE) of 0.2136 on 168-h sequences compared to LSTM&#x2019;s 0.3223. Long sequences (168-h) reduce MAE by 56% relative to short 8-h windows, effectively mitigating the effects of sparsity. Furthermore, an ensemble of top-performing models (MTL, XGBoost, and Linear Regression) yields additional gains, reducing MAE to 0.2105&#x2014;a 1.45% improvement over MTL alone. </p>
</sec>
<sec>
<title>Discussion</title>
<p>These results highlight the importance of long-term temporal context and model diversity for robust traffic prediction in sparse environments. The proposed framework offers practical guidelines: 168-h forecasting windows are optimal for weekly planning, and model ensembling enhances generalization across varying beam activity levels. This study contributes a scalable and accurate solution for spatio-temporal traffic forecasting in next-generation wireless networks.</p>
</sec>
</abstract>
<kwd-group>
<kwd>5G</kwd>
<kwd>traffic forecasting</kwd>
<kwd>time series prediction</kwd>
<kwd>GRU</kwd>
<kwd>multi-task learning</kwd>
<kwd>LSTM</kwd>
<kwd>ESN</kwd>
<kwd>DLinear</kwd>
</kwd-group>
<counts>
<page-count count="17"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Wireless Communications</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>The explosive evolution of 5G networks has redefined the wireless communications paradigm and witnessed an exponential surge in mobile data traffic driven by the proliferation of smartphones, IoT devices, and bandwidth-intensive applications, introducing an increasing strain on the available wireless spectrum capacity (<xref ref-type="bibr" rid="B11">Cisco, 2020</xref>; <xref ref-type="bibr" rid="B45">Zhang et al., 2023</xref>). This increasing demand has not only introduced a suite of challenges that require innovative forecasting techniques, but has also required continuous advances in wireless communication technologies such as 5G and beyond (<xref ref-type="bibr" rid="B20">ITU Radiocommunication Sector, 2020</xref>; <xref ref-type="bibr" rid="B1">3GPP, 2022</xref>).</p>
<p>To meet these growing demands, 5G wireless networks operate across a diverse set of frequency ranges, including sub-6&#xa0;GHz (FR1) and millimeter-wave (mmWave) bands (FR2, 24&#x2013;100&#xa0;GHz), offering abundant spectrum resources that enable multi-gigabit-per-second data rates and ultra-low latency (<xref ref-type="bibr" rid="B27">Rappaport et al., 2013</xref>). However, mmWave signals suffer from high path loss and susceptibility to blockages, which require advanced techniques such as beamforming and massive MIMO to focus energy directionally and enhance coverage (<xref ref-type="bibr" rid="B30">Rohde and Poddar, 2018</xref>). Recently, the upper midband spectrum&#x2014;also known as FR3, typically spanning 7&#x2013;24&#xa0;GHz&#x2014;has emerged as a promising candidate for 6G due to its favorable trade-off between capacity and coverage (<xref ref-type="bibr" rid="B17">Giordani et al., 2020</xref>). This band benefits from improved propagation compared to mmWave while still offering wider bandwidth than sub-6&#xa0;GHz, making it well-suited for mobile broadband and edge intelligence applications (<xref ref-type="bibr" rid="B4">Alsabah et al., 2023</xref>). In addition, 6G-customized beamforming strategies, such as outage-based beamforming, are being developed to improve link reliability under dynamic conditions and harsh propagation environments (<xref ref-type="bibr" rid="B5">Alrabeiah and Alkhateeb, 2022</xref>). These technologies not only improve spectral efficiency but also support massive device connectivity and enable emerging applications such as smart cities, immersive media, and real-time industrial IoT (<xref ref-type="bibr" rid="B2">Agiwal et al., 2016</xref>).</p>
<p>A key challenge in 5G network management is ensuring optimal resource allocation to maintain high Quality of Service (QoS) in increasingly dense and heterogeneous network environments (<xref ref-type="bibr" rid="B45">Zhang et al., 2023</xref>). Efficient traffic forecasting enables proactive network optimization, minimizing congestion and ensuring dynamic bandwidth allocation (<xref ref-type="bibr" rid="B39">Wu et al., 2020</xref>). However, traffic forecasting in 5G networks is significantly more complex than in previous generations due to the increased granularity of network management, particularly at the beam level. Unlike traditional macro cell-based forecasting, beam-level forecasting requires capturing highly localized and dynamic user activity, making it a crucial but difficult task (<xref ref-type="bibr" rid="B28">Rappaport et al., 2019</xref>). In addition, traditional methods struggle to handle traffic irregularities such as intermittent zeros, short time-series lengths, and multivariate dependencies. This study tackles these limitations by introducing a Gated Recurrent Unit (GRU)-based multi-task learning (MTL) framework capable of learning shared representations across multiple prediction tasks. The proposed approach leverages state-of-the-art (SOTA) forecasting models designed to handle complex multivariate time-series data, enhancing predictive accuracy and real-time adaptability (<xref ref-type="bibr" rid="B32">Siami-Namini et al., 2018</xref>).</p>
<sec id="s1-1">
<title>1.1 Traffic data collection in 5G wireless system</title>
<p>As part of global efforts to advance AI-driven solutions for sustainable development, the AI for Good initiative, in collaboration with the International Telecommunication Union (ITU), launched a challenge in spatio-temporal load forecasting in 5G wireless systems. The challenge focuses on predicting beam-level wireless traffic, a critical component in enhancing network resource allocation and ensuring efficient network operations (<xref ref-type="bibr" rid="B3">AIforGood-ITU, 2024</xref>). The data collection process is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Spatio-Temporal Beam-Level Traffic Data Collection in 5G Wireless Systems: 30 Base Stations (BSs), 3 users per BS, and 32 beams per user. Data is collected hourly for 5 weeks (<xref ref-type="bibr" rid="B3">AIforGood-ITU, 2024</xref>).</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g001.tif">
<alt-text content-type="machine-generated">Illustration of four base stations labeled Base Station 1, 2, 3, and 4, each transmitting beams to different users labeled User 1, 2, and 3. Colored lines represent communication beams between the base stations and users. An inset shows multiple waveform patterns labeled Beams.</alt-text>
</graphic>
</fig>
<p>To support this initiative, the ITU released high-resolution beam-level throughput datasets with precise hourly granularity, providing critical network performance metrics, including throughput volume, throughput time, physical resource block (PRB) utilization, and user count. Each of the four datasets (throughput volume, throughput time, physical resource block (PRB) utilization, and user count) comprises:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 30 base stations (BSs), each containing 3 cells, with each cell consisting of 32 beams, resulting in a total of 2,880 beams.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Hourly recordings over 5&#xa0;weeks, amounting to 2,419,200 samples for each of the four datasets, making it one of the most detailed public datasets available for 5G network traffic forecasting.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Hierarchical segmentation, enabling granular traffic flow analysis at different levels of the network infrastructure.</p>
</list-item>
</list>
</p>
<p>These extensive datasets provide a valuable foundation for exploring forecasting strategies, allowing researchers to develop models capable of capturing intricate spatial and temporal variations in network traffic. <xref ref-type="fig" rid="F2">Figures 2</xref>&#x2013;<xref ref-type="fig" rid="F4">4</xref> show examples of the four different time-series data for 5&#xa0;weeks at different base stations, beams, and cells.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Sample of the four datasets at different base stations, cells and beams.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g002.tif">
<alt-text content-type="machine-generated">Eight line graphs compare data over hours. The top row includes graphs for &#x22;User Count 0_0_0&#x22;, &#x22;PRB 0_0_0&#x22;, &#x22;Time 0_0_0&#x22;, and &#x22;Traffic Volume 0_0_0&#x22;. The bottom row includes &#x22;User Count 0_0_10&#x22;, &#x22;PRB 0_0_10&#x22;, &#x22;Time 0_0_10&#x22;, and &#x22;Traffic Volume 0_0_10&#x22;. Vertical axes differ for each metric.</alt-text>
</graphic>
</fig>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Sample of the four datasets at different base stations, cells and beams.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g003.tif">
<alt-text content-type="machine-generated">Eight line graphs show data over 900 hours, with the top row for &#x22;User Count,&#x22; &#x22;PRB,&#x22; &#x22;Time,&#x22; and &#x22;Traffic Volume&#x22; marked as &#x22;0 1 5,&#x22; and the bottom row similarly labeled as &#x22;1 0 13.&#x22; Orange lines represent user-related data, while green lines illustrate traffic volume. Each graph features a legend and axes labeled with relevant metrics.</alt-text>
</graphic>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Sample of the four datasets at different base stations, cells and beams.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g004.tif">
<alt-text content-type="machine-generated">Eight line graphs show trends over time. The top row presents data labeled &#x22;1_1_29&#x22; for user count, PRB, time, and traffic volume. The bottom row provides similar data labeled &#x22;2_2_15&#x22;. Yellow lines indicate user count, PRB, and time, while green represents traffic volume. Each graph compares values against hours.</alt-text>
</graphic>
</fig>
<p>To gain a deeper understanding of the sparsity inherent in beam-level traffic patterns, we analyzed the distribution of zero-valued entries across both temporal (840 hourly samples) and spatial (2,880 beams) dimensions. These zero entries represent periods or locations of beam inactivity where no user data traffic was recorded. An initial analysis, illustrated in the bar chart of <xref ref-type="fig" rid="F5">Figure 5a</xref>, reveals the average spatial sparsity per time sample. On average, each sample comprises approximately 915 zero-valued and 1,965 non-zero-valued beams, indicating a mean sparsity level of approximately <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mn>32</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This reflects a consistent pattern of intermittent beam activity at any given time step. Furthermore, we examined the temporal evolution of this sparsity, as depicted in the line plot of <xref ref-type="fig" rid="F5">Figure 5b</xref>. The percentage of inactive beams per sample fluctuates considerably over the 840-h period, with values ranging from approximately <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mn>20</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to over <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mn>40</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This dynamic trend highlights the significant temporal variability in network utilization, likely attributable to factors such as diurnal usage cycles, user mobility patterns, and heterogeneous service demands. These findings underscore a critical modeling challenge: the data is fundamentally sparse and dynamically so. This necessitates the development of forecasting models, such as sparsity-aware or multi-task architectures, that can explicitly account for inactive periods to improve predictive accuracy and avoid biases introduced by zero-inflated data.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Traffic zero and non-zero data distribution. <bold>(a)</bold> Average Zero vs Non-Zero Counts per Sample. <bold>(b)</bold> Temporal Distribution of Zero Entries Across Samples.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g005.tif">
<alt-text content-type="machine-generated">Bar and line charts illustrate average zeros versus non-zeros and zero percentage variation over time. The bar chart shows a higher count of non-zeros than zeros. The line chart displays fluctuating zero percentages between 25% and 50% across sample indices.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s1-2">
<title>1.2 The problem statement</title>
<p>The rapid expansion of 5G wireless networks, driven by unprecedented growth in mobile devices, IoT applications, and bandwidth-demanding services, has placed increasing pressure on the limited wireless spectrum and necessitated significant advancements in how traffic is managed and forecasted. Unlike traditional macro cell-level prediction, the introduction of beamforming in 5G networks has shifted traffic management to the beam level, where user activity is highly localized, dynamic, and inherently more complex to predict. Accurate beam-level traffic forecasting has therefore become critical for enabling optimal resource allocation, dynamic bandwidth management, congestion mitigation, and the delivery of consistent Quality of Service (QoS) in modern wireless systems.</p>
<p>However, this finer spatial granularity introduces several persistent challenges that conventional statistical and early machine learning methods struggle to address. These include the prevalence of intermittent zeros in beam-level datasets&#x2014;periods of low or no traffic&#x2014;which can distort predictions when not handled properly. Moreover, the typically short time-series length for individual beams limits the model&#x2019;s ability to learn long-term patterns, further complicating forecasting tasks. In addition, the multivariate nature of network performance metrics adds a layer of complexity, requiring models to capture intricate dependencies among multiple correlated features. Finally, practical deployment constraints in 5G networks demand forecasting models that are not only accurate but also lightweight and efficient enough for real-time use at the network edge.</p>
<p>These challenges highlight a clear gap: while deep learning methods such as Recurrent Neural Networks (RNNs) and Long Short-Term Memory (LSTM) networks have shown promise in general traffic forecasting, they often remain inadequate for beam-level forecasting where sparsity, short time spans, and multivariate relationships must be tackled simultaneously. This study addresses this gap by investigating the use of Gated Recurrent Units (GRUs) within a Multi-Task Learning (MTL) framework. The goal is to design a forecasting approach that can learn shared representations across multiple prediction tasks&#x2014;such as traffic volume regression and active/inactive beam classification&#x2014;while remaining computationally efficient for real-time operation. In doing so, this research aims to advance the development of robust, scalable, and adaptive forecasting models that can meet the stringent requirements of next-generation wireless networks.</p>
<p>Spatio-temporal beam-level forecasting in 5G networks requires models that can effectively handle sparse, irregular, and highly dynamic traffic patterns while remaining efficient enough for real-time deployment. Conventional forecasting methods&#x2014;including statistical models, traditional machine learning, and even some deep learning architectures&#x2014;struggle with this problem because they either fail to capture long-range temporal dependencies, are sensitive to noise and missing data, or impose heavy computational costs.</p>
<p>Gated Recurrent Units (GRUs) directly address these limitations, making them particularly well-suited to outperform other models for beam-level traffic prediction. Their gating mechanisms selectively retain relevant historical information while filtering out noise, enabling them to model complex non-linear temporal relationships even across long gaps with zero traffic. Unlike attention-based or Transformer models, which require large datasets to generalize effectively, GRUs generalize well with limited time-series data&#x2014;a common scenario in beam-level measurements. Moreover, they achieve performance comparable to or better than Long Short-Term Memory (LSTM) networks but with fewer parameters, resulting in lower latency and reduced computational overhead&#x2014;essential for real-time 5G network optimization.</p>
<p>Beam-level forecasting also presents several specific challenges that further justify the use of GRUs:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Handling Intermittent Zeros in the Dataset: Intermittent zero values occur during low or no traffic periods, skewing prediction outcomes. Effectively managing these zeros is crucial for maintaining accurate forecasts.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Limited Time-Series Length: The relatively short time-series data complicates the capture of long-term patterns, posing a challenge for large-scale forecasting models that require extensive data for effective training.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Handling Multivariate Data: The dataset&#x2019;s multiple network performance metrics introduce additional complexity. Accurately modeling multivariate dependencies is essential for capturing the full scope of traffic influences.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Requirement for Real-Time, Lightweight Models: Real-time forecasting necessitates models that are both accurate and computationally efficient. Many emerging deep learning models are resource-intensive, making lightweight and optimized models critical for timely forecasts.</p>
</list-item>
</list>
</p>
<p>By aligning these strengths with the unique requirements of beam-level forecasting, the proposed GRU-based approaches&#x2014;including a Multi-Task Learning framework and an ensemble strategy&#x2014;are explicitly designed to outperform conventional forecasting techniques in capturing the spatio-temporal complexity of 5G traffic (<xref ref-type="bibr" rid="B14">Fu et al., 2016</xref>).</p>
<p>Furthermore, a principal contribution of this research is the development and integration of a Multi-task Learning (MTL) paradigm. The MTL framework explicitly addresses the multivariate characteristics of beam traffic data by enabling the model to learn shared latent representations and exploit correlations across related variables (e.g., user count, PRB utilization, throughput volume, and throughput time), thereby fostering model generalization, reducing overfitting, and yielding more accurate and responsive traffic forecasts. Our research provides several key contributions to the field of spatio-temporal beam-level traffic forecasting in 5G wireless systems:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Effective Modeling of Intermittent Data with GRUs: This research demonstrates the inherent capability of the GRU architecture to effectively model raw time-series data containing intermittent zeros without requiring complex, model-agnostic pre-processing steps. The GRU&#x2019;s gating mechanism learns to distinguish between meaningful periods of network inactivity and actual traffic patterns. Instead of disregarding zero-value periods, the model learns from them as part of the temporal sequence, allowing it to accurately capture the sporadic nature of beam-level traffic and retain crucial information about demand intervals.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> GRU-based Multi-task Learning for Enhanced Prediction Accuracy: This research further introduces a novel multi-task learning framework that leverages a GRU core. This framework is designed to simultaneously perform two correlated tasks: forecasting traffic magnitude and classifying demand occurrence (i.e., zero vs. non-zero traffic states). By learning these tasks in parallel, the model capitalizes on shared representations, which enhances generalization and mitigates overfitting. This dual-objective approach significantly improves the accuracy of the forecast, particularly for datasets characterized by the high variability and sparsity common to network traffic.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Optimization of Input Sequence Length for Improved Performance: This study systematically investigates the impact of input sequence length on predictive accuracy within our MTL framework. Three strategically chosen temporal windows were examined: 168&#xa0;h (weekly cycles), 24&#xa0;h (diurnal patterns), and 8&#xa0;h (short-term activity segments). These intervals were selected to simulate human-like interactions with the data. The findings reveal that longer sequence lengths yield superior forecasting performance compared to shorter ones. The 168-h window&#x2019;s performance advantage confirms that beam-level forecasting benefits from an extended historical context when modeling sparse events. With more than <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:mn>31.8</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> zero-inflated beams, longer sequences provide sufficient active samples to distinguish true inactivity from measurement noise, a known challenge in cellular traffic prediction.</p>
</list-item>
</list>
</p>
<p>These contributions advance the methodologies for spatio-temporal traffic analysis, offering a robust and accurate framework for enhancing network performance and resource management in 5G and beyond.</p>
</sec>
</sec>
<sec id="s2">
<title>2 Related works</title>
<p>Performing accurate and timely network traffic forecasting has long been a critical area of research in telecommunication systems, driven by the need for efficient resource allocation, congestion control, and proactive network management. Early efforts primarily focused on macroscopic network traffic, employing traditional statistical time-series models due to their simplicity and interpretability.</p>
<p>One of the foundational approaches involved Autoregressive Integrated Moving Average (ARIMA) models and their variants, widely applied for their ability to capture temporal dependencies in network traffic (<xref ref-type="bibr" rid="B7">Brockwell and Davis, 2002</xref>). For instance, Box and Jenkins&#x2019; methodology (<xref ref-type="bibr" rid="B6">Box et al., 2015</xref>) provided a robust framework for modeling and forecasting time series data, which was subsequently adapted for internet traffic prediction. Exponential smoothing methods also found application in forecasting network loads, offering adaptive mechanisms for capturing trends and seasonality (<xref ref-type="bibr" rid="B19">Hyndman and Athanasopoulos, 2018</xref>). While effective for aggregated traffic, these statistical models often struggled with the inherent non-linearity, high variability, and complex long-range dependencies characteristic of modern communication networks.</p>
<p>As networks evolved from fixed-line to mobile cellular systems (e.g., 2G, 3G, and early 4G deployments), the complexity of traffic patterns increased, necessitating more sophisticated forecasting techniques. Machine learning (ML) models began to emerge as promising alternatives to traditional statistical methods due to their ability to learn complex non-linear relationships from data. Support Vector Machines (SVMs) were explored for their robustness to noisy data and ability to handle high-dimensional features, demonstrating effectiveness in predicting network congestion and traffic volume (<xref ref-type="bibr" rid="B36">Wang et al., 2011</xref>). More recently, the advent of deep learning has revolutionized network traffic forecasting, with models such as Long Short-Term Memory (LSTM) networks and Gated Recurrent Units (GRUs) proving highly effective in capturing long-term temporal dependencies and non-linear patterns in network traffic data (<xref ref-type="bibr" rid="B29">Rau et al., 2023</xref>; <xref ref-type="bibr" rid="B26">Ramakrishnan and Soni, 2018</xref>). Convolutional Neural Networks (CNNs) have also been increasingly employed to extract hierarchical features from traffic data, demonstrating strong performance in various network prediction tasks (<xref ref-type="bibr" rid="B41">Yu et al., 2017</xref>).</p>
<p>The growing recognition of spatial correlations in network traffic has led to extensive research in spatio-temporal forecasting, especially with the proliferation of dense wireless deployments. Unlike earlier approaches that often combined spatial and temporal models in a decoupled manner, modern deep learning frameworks, particularly Graph Neural Networks (GNNs), have become instrumental in explicitly modeling intricate spatio-temporal dependencies. These models leverage the topological structure of networks to capture complex spatial relationships, while integrated recurrent or convolutional layers handle temporal dynamics, leading to significant advancements in forecasting accuracy for large-scale and complex network environments, including cellular networks (<xref ref-type="bibr" rid="B41">Yu et al., 2017</xref>; <xref ref-type="bibr" rid="B40">Wu et al., 2024</xref>; <xref ref-type="bibr" rid="B38">Wang et al., 2025</xref>).</p>
<p>Multi-Task Learning (MTL), a paradigm where multiple related learning tasks are solved jointly to leverage commonalities and improve generalization performance, has gained significant traction in network traffic forecasting since 2015. This approach is particularly beneficial for complex and heterogeneous network environments, such as 5G, where various prediction tasks (e.g., traffic volume, throughput, and latency across different beams or cells) are inherently related. By learning shared representations across these tasks, MTL frameworks can overcome challenges like intermittent data patterns and limited time-series lengths for individual tasks, leading to improved prediction accuracy and robustness compared to single-task learning models (<xref ref-type="bibr" rid="B37">Wang et al., 2022</xref>; <xref ref-type="bibr" rid="B24">Liu et al., 2024</xref>). For instance, recent works have explored MTL to predict citywide cellular network traffic across diverse services, demonstrating its ability to capture complex spatio-temporal fluctuations by sharing knowledge among tasks (<xref ref-type="bibr" rid="B34">Sun et al., 2022</xref>).</p>
</sec>
<sec id="s3">
<title>3 Proposed methods</title>
<p>The significant number of zero-value observations renders traditional forecasting models ineffective. To overcome this limitation, we employ a Gated Recurrent Unit (GRU), a sophisticated deep learning model. Unlike statistical methods that require decomposing the series, the GRU learns directly from the raw, intermittent data. Its gating mechanism dynamically manages the information flow, enabling it to capture complex, non-linear temporal relationships between past events and future demand, even when they are separated by numerous zero-value periods. This makes the GRU a powerful and flexible tool for forecasting sporadic data patterns.</p>
<sec id="s3-1">
<title>3.1 Handling intermittent data with GRU</title>
<p>Intermittent time series, characterized by sporadic non-zero observations amidst prolonged periods of zeros, present significant challenges for forecasting models. Traditional statistical methods often fail to adequately capture the underlying patterns in such data due to their reliance on assumptions of continuity and uniform variance (<xref ref-type="bibr" rid="B19">Hyndman and Athanasopoulos, 2018</xref>). Deep learning approaches, particularly Gated Recurrent Units (GRUs), have emerged as a promising alternative by leveraging sequential learning to model complex temporal dependencies, including intermittent demand structures (<xref ref-type="bibr" rid="B21">Lai et al., 2018</xref>).</p>
<p>Classical approaches to intermittent demand forecasting typically fall into two categories: exponential smoothing variants and probability-based methods. Simple exponential smoothing (SES) applies uniform weighting to all observations, including zeros, resulting in biased forecasts when demand is sporadic (<xref ref-type="bibr" rid="B16">Gardner, 2006</xref>). Modified exponential smoothing techniques, such as the TSB method (<xref ref-type="bibr" rid="B35">Teunter et al., 2011</xref>), improve upon SES by incorporating demand probability estimates, but they still rely on heuristic adjustments rather than data-driven learning. Probability-based methods, such as zero-inflated Poisson regression (<xref ref-type="bibr" rid="B22">Lambert, 1992</xref>), explicitly account for excess zeros but are limited in their ability to capture evolving temporal dynamics.</p>
<p>GRUs, a variant of recurrent neural networks (RNNs), address many of the limitations of traditional methods through their gated architecture. The update and reset gates allow GRUs to dynamically modulate information flow, effectively learning when to retain or discard historical observations (<xref ref-type="bibr" rid="B10">Cho et al., 2014</xref>). This mechanism is particularly advantageous for intermittent data, as the model can suppress irrelevant zeros while amplifying meaningful non-zero events. Recent hybrid approaches (e.g., <xref ref-type="bibr" rid="B33">Silveira Gontijo and Azevedo Costa, 2020</xref>) demonstrate that neural networks can effectively model hierarchical and intermittent structures in demand forecasting, outperforming traditional statistical methods. Furthermore, GRUs can model long-term dependencies, distinguishing between true inactivity (structural zeros) and transient fluctuations (noise), a capability that eludes most statistical approaches (<xref ref-type="bibr" rid="B23">Lim and Zohren, 2021</xref>).</p>
<p>GRUs offer a flexible and powerful framework for forecasting intermittent time series, overcoming key limitations of traditional statistical methods. Their ability to learn complex temporal dependencies without manual feature engineering makes them particularly suited for applications with sporadic demand patterns.</p>
</sec>
<sec id="s3-2">
<title>3.2 Proposed models</title>
<p>This study introduces a suite of hybrid deep learning architectures tailored to the complex nature of beam-level traffic volume forecasting in 5G networks. Beam-level traffic in 5G exhibits high temporal volatility, spatial sparsity, and a mixture of periodic and aperiodic patterns. These characteristics demand a modeling framework capable of capturing both long-term temporal dependencies and nonlinear fluctuations. To this end, we propose and comparatively evaluate seven hybrid models: GRU-Linear, GRU-DLinear, GRU-XGBoost, ESN, LSTM-FCN, GRU-MTL, and a GRU-based Ensemble Model.</p>
<p>The rationale for deploying this diverse set of models lies in their complementary strengths. Linear regressors offer transparency and serve as strong baselines. DLinear enhances trend/seasonality decomposition, while XGBoost captures feature interactions missed by standard neural nets. ESNs contribute fast training and memory-rich transformations, and the LSTM-FCN hybrid improves temporal context learning. The Multi-task Learning model adds robustness through joint optimization of classification and regression objectives, and the ensemble aggregates model strengths to reduce variance and improve generalization. This model diversity ensures that both short-term spikes and long-term trends in traffic dynamics are effectively captured.</p>
<p>In addition, the design choice not to use the GRU as input to the Echo State Network (ESN) was motivated by the inherent architectural properties of the ESN. Unlike XGBoost, which is a gradient-boosted decision tree model that benefits from a compact and informative feature representation (in this case, GRU embeddings), the ESN itself is a reservoir computing model that naturally performs its own nonlinear feature transformation through its high-dimensional dynamic reservoir states. Feeding GRU embeddings into the ESN would have overridden its core mechanism of projecting input sequences into a rich dynamic state space and could potentially lead to redundant feature processing or overfitting. To ensure a fair comparison, we configured the ESN with a sufficiently large reservoir size and spectral radius, allowing it to internally capture temporal dependencies from the raw input sequence without the need for an external embedding layer. This design aligns with standard ESN usage, where raw time-series inputs are projected directly into the reservoir state space for subsequent linear readout.</p>
<sec id="s3-2-1">
<title>3.2.1 GRU with linear regression (GRU-linear)</title>
<p>In this configuration, GRU encodes the temporal sequence into latent representations, which are then mapped to outputs via a linear regression layer. The model offers a simple yet effective architecture for modeling sequential data where the nonlinearities primarily reside in the temporal dimension rather than the output mapping. This structure has been employed in real-time traffic forecasting settings with notable success (<xref ref-type="bibr" rid="B14">Fu et al., 2016</xref>).</p>
</sec>
<sec id="s3-2-2">
<title>3.2.2 GRU with DLinear (GRU-DLinear)</title>
<p>The GRU-DLinear model integrates the DLinear architecture, which decomposes time-series signals into seasonal and trend components before applying separate linear forecasts (<xref ref-type="bibr" rid="B42">Zeng et al., 2023</xref>). The GRU pre-processes the sequence, providing rich temporal embeddings that DLinear uses to conduct more interpretable and accurate predictions, especially for long-horizon forecasting with periodic behaviors.</p>
</sec>
<sec id="s3-2-3">
<title>3.2.3 GRU with XGBoost regression (GRU-XGBoost)</title>
<p>In this hybrid, GRU encodes sequential features which are then passed to an XGBoost regressor. XGBoost, known for its strong performance on structured data and ability to model complex feature interactions, serves to refine GRU&#x2019;s temporal outputs by capturing residual nonlinear relationships (<xref ref-type="bibr" rid="B8">Chen and Guestrin, 2016</xref>).</p>
</sec>
<sec id="s3-2-4">
<title>3.2.4 GRU with echo state network (GRU-ESN)</title>
<p>The GRU-ESN architecture exploits the high-dimensional memory capabilities of Echo State Networks, a class of reservoir computing models. GRU sequences are passed to an untrained recurrent reservoir with fixed weights, while only the readout layer is trained. This structure introduces additional temporal richness while retaining training efficiency (<xref ref-type="bibr" rid="B15">Gallicchio et al., 2018</xref>).</p>
</sec>
<sec id="s3-2-5">
<title>3.2.5 LSTM with fully connected network (LSTM-FCN)</title>
<p>The LSTM-FCN architecture integrates a Long Short-Term Memory (LSTM) network with a Fully Connected Network (FCN) to leverage both temporal sequence modeling and powerful nonlinear feature transformation. In this setup, the LSTM layer learns temporal dependencies and encodes sequential patterns present in the beam-level traffic data, while the subsequent FCN maps these learned representations to the final traffic volume predictions. This hybrid design combines the LSTM&#x2019;s robust gating mechanisms, which capture long-term temporal dynamics, with the FCN&#x2019;s capacity for flexible nonlinear regression. As a result, the LSTM-FCN model provides enhanced adaptability to the irregular and bursty traffic patterns typical of 5G beam-level forecasts (<xref ref-type="bibr" rid="B18">Greff et al., 2016</xref>).</p>
</sec>
<sec id="s3-2-6">
<title>3.2.6 Gated recurrent unit-multi-task learning (GRU-MTL)</title>
<p>The GRU-MTL architecture addresses two concurrent tasks: (1) classifying whether traffic volume is active (non-zero), and (2) regressing the actual traffic magnitude. By learning these tasks jointly with shared GRU encoders and distinct output heads, the model benefits from inductive transfer, improving generalization and robustness, especially in sparse or imbalanced traffic conditions (<xref ref-type="bibr" rid="B31">Ruder, 2017</xref>). Multi-task learning has shown effectiveness in related spatio-temporal forecasting domains (<xref ref-type="bibr" rid="B9">Chen et al., 2020</xref>).</p>
</sec>
<sec id="s3-2-7">
<title>3.2.7 GRU-based ensemble model</title>
<p>Finally, a GRU-based Ensemble Model is constructed by aggregating the predictions of the aforementioned models using either weighted averaging or meta-learning strategies. Ensemble learning helps reduce model variance and compensates for individual model weaknesses, thereby improving prediction stability and reliability across diverse traffic scenarios (<xref ref-type="bibr" rid="B44">Zhang et al., 2017</xref>).</p>
</sec>
<sec id="s3-2-8">
<title>3.2.8 GRU-based ensemble algorithm</title>
<p>The ensemble, implemented to enhance predictive robustness and capture diverse traffic dynamics, integrates three distinct GRU-based architectures based on their performance: GRU with Multi-task Learning (GRU-MTL), GRU with Linear Regression (GRU-Linear), and GRU with XGBoost (GRU-XGBoost). By aggregating the predictions of these models using weighted averaging, the ensemble aims to reduce variance, mitigate model-specific biases, and improve generalization across varying beam-level traffic conditions in 5G networks. The high-level steps of the ensemble inference process are presented in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>GRU-Based Ensemble Forecasting Algorithm.<list list-type="simple">
<list-item>
<p>&#x2003;&#x2003;<bold>Input:</bold> Time-series input data <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is sequence length and <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is feature dimension</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>Input:</bold> Pre-trained models: GRU-MTL, GRU-Linear, GRU-XGBoost</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>Output:</bold> Final traffic volume forecast <inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>ensemble</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>1 <bold>Initialize</bold> ensemble weights <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> such that <inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>2 <bold>Step 1: Preprocess Input</bold> Normalize or scale <inline-formula id="inf25">
<mml:math id="m25">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to the expected input range of all models</p>
</list-item>
<list-item>
<p>3 <bold>Step 2: Inference from Base Models</bold> <inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>MTL</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mtext>GRU</mml:mtext>
<mml:mo>-</mml:mo>
<mml:mtext>MTL</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> // Use regression head only</p>
</list-item>
<list-item>
<p>4 <inline-formula id="inf27">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>Linear</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mtext>GRU</mml:mtext>
<mml:mo>-</mml:mo>
<mml:mtext>Linear</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf28">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>XGB</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mtext>XGBoost</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>GRU</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>5 <bold>Step 3: Ensemble Aggregation</bold> <inline-formula id="inf29">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>ensemble</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>MTL</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>Linear</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>XGB</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>6 <bold>return</bold> <inline-formula id="inf30">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>ensemble</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>In our experiments, we set both loss-weighting coefficients of the GRU-MTL objective (regression and classification) to 1, i.e.,<disp-formula id="equ1">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>resulting in a total loss of the form:<disp-formula id="equ2">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>total</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>which treats the regression and classification objectives with equal importance. This design choice was made to avoid introducing additional hyperparameters and to keep the optimization simple and interpretable, particularly since both tasks&#x2014;predicting beam-level traffic intensity and classifying beam activity&#x2014;are equally critical for reliable forecasting in 5G systems. We found that applying equal weighting to these tasks performed well empirically without requiring further tuning.</p>
<p>In addition, to ensure a fair and unbiased evaluation, particular care was taken to prevent any form of data leakage during ensemble training. Specifically, the ensemble weights were learned exclusively from historical weeks (training set) that were completely disjoint from the test week. Once optimized, these weights were fixed and directly applied to the held-out test week to generate performance metrics. This strict separation between training and testing guarantees that no information from the test week influenced the ensemble fitting, thereby preserving the integrity and reliability of the evaluation.</p>
</sec>
</sec>
<sec id="s3-3">
<title>3.3 GRU-based multi-task learning architecture</title>
<sec id="s3-3-1">
<title>3.3.1 GRU-MTL architecture for beam-level traffic forecasting</title>
<p>Multi-task learning represents a machine learning approach in which a single model is trained to perform multiple related tasks simultaneously, taking advantage of inter-task correlations to improve overall prediction accuracy (<xref ref-type="bibr" rid="B25">Rago et al., 2020</xref>). Recent advances in neural network architectures have demonstrated the effectiveness of combining MTL frameworks with GRUs for spatio-temporal traffic forecasting, particularly at the beam level in cellular networks (<xref ref-type="bibr" rid="B43">Zhang and Yang, 2021</xref>). This approach addresses the dual challenges of capturing temporal dependencies through GRU&#x2019;s sophisticated gating mechanisms while simultaneously modeling spatial correlations across network locations via shared representations in the MTL framework. The architecture typically employs a shared GRU encoder to extract common temporal patterns, coupled with task-specific decoders that adapt these representations to individual beam predictions (<xref ref-type="bibr" rid="B12">Collobert and Weston, 2008</xref>), optimizing a composite loss function that balances performance across all tasks (<xref ref-type="bibr" rid="B13">Evgeniou and Pontil, 2004</xref>).</p>
<p>As shown in <xref ref-type="fig" rid="F6">Figure 6</xref>, the proposed architecture adopts an MTL paradigm that integrates a GRU-based temporal encoder with two parallel task-specific heads: a <italic>regressor</italic> and a <italic>binary classifier</italic>, designed for beam-level spatio-temporal traffic forecasting in 5G wireless communication networks. Given an input training tensor <inline-formula id="inf31">
<mml:math id="m33">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf32">
<mml:math id="m34">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>672</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of historical time steps and <inline-formula id="inf33">
<mml:math id="m35">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2880</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> represents the number of spatial beams across multiple cells or base stations, the GRU module processes the sequence to extract latent temporal representations that capture dynamic dependencies across time and space. The GRU operates through its gating mechanism and updates its hidden state <inline-formula id="inf34">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> at each step using the following update equations as shown in <xref ref-type="disp-formula" rid="e1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="e4">4</xref>.<disp-formula id="e1">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf35">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf36">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the update and reset gates, <inline-formula id="inf37">
<mml:math id="m43">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the sigmoid activation function, and <inline-formula id="inf38">
<mml:math id="m44">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes element-wise multiplication. The final hidden state <inline-formula id="inf39">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is shared across two output heads. The <italic>regressor</italic> produces a real-valued prediction <inline-formula id="inf40">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, modeling the expected future traffic volume as shown in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>.<disp-formula id="e5">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>while the classifier outputs a binary activation map <inline-formula id="inf41">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as shown in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>, indicating whether or not traffic is expected at each beam:<disp-formula id="e6">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>round</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>GRU-integrated multi-task learning framework.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g006.tif">
<alt-text content-type="machine-generated">Flowchart of a machine learning model. Training data is input into a GRU block, leading to separate Linear Regressor and Binary Classifier paths. Outputs are combined to provide a final result.</alt-text>
</graphic>
</fig>
<p>The final output <inline-formula id="inf42">
<mml:math id="m50">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is computed via an element-wise product of the two outputs as shown in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>:<disp-formula id="e7">
<mml:math id="m51">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>which effectively suppresses predictions in regions where no traffic is expected. This fusion step is particularly valuable in environments where traffic is intermittent and sparse, as it reduces false positives and enforces output sparsity.</p>
<p>The model is trained using a joint loss function that balances the regression and classification objectives as shown in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>:<disp-formula id="e8">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>total</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf43">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the mean squared error (MSE) loss, <inline-formula id="inf44">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the binary cross-entropy (BCE) loss, and <inline-formula id="inf45">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are task-balancing weights.</p>
<p>The explicit mathematical definitions of both the Mean Squared Error (MSE) loss <inline-formula id="inf46">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the Binary Cross-Entropy (BCE) loss <inline-formula id="inf47">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> used in our multitask learning framework are shown in <xref ref-type="disp-formula" rid="e9">Equations 9</xref>, <xref ref-type="disp-formula" rid="e10">10</xref>:<disp-formula id="e9">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf48">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf49">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denote the ground truth and predicted traffic values, respectively, and <inline-formula id="inf50">
<mml:math id="m61">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of samples.<disp-formula id="e10">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf51">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the ground truth beam activity label, and <inline-formula id="inf52">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the predicted probability of beam activity.</p>
<p>This MTL-GRU framework offers multiple advantages. First, it improves model generalization by enforcing shared feature learning across complementary tasks, which regularizes the network and reduces overfitting. Second, the binary classification head acts as a learned sparsity prior, enabling the model to suppress erroneous predictions in inactive regions, thus reducing the mean absolute percentage error (MAPE) and improving robustness in low-activity scenarios. Third, the GRU-based temporal encoder captures long-range dependencies in traffic patterns, such as daily or weekly periodicities, critical in non-stationary environments like mobile networks.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<title>4 Experimental results and analysis</title>
<sec id="s4-1">
<title>4.1 Experimental setup</title>
<sec id="s4-1-1">
<title>4.1.1 Data preparation</title>
<p>The beam-level spatio-temporal traffic data used in this study was provided by the AI for Good challenge, organized by the International Telecommunication Union (ITU). The dataset comprises four high-resolution telemetry files representing key network metrics: throughput volume (DLThpVol), throughput time (DLThpTime), Physical Resource Block utilization (DLPRB), and user count (MR_number). Each dataset contains 2,419,200 hourly samples spanning 2,880 distinct beams across 30 base stations, recorded over a five-week observation period. This rich dataset captures critical spatio-temporal traffic behavior across an ultra-dense 5G infrastructure.</p>
<p>In the dataset, explicit timestamps are not included with the telemetry files. Instead, all data streams are assumed to be sampled at fixed, consistent intervals and are provided as aligned sequences in their respective files. Given this structure, we synchronized the telemetry data streams by assuming uniform sampling and using index-based alignment, i.e., the n-th row in one file corresponds to the n-th row in the others. This approach assumes that the data is pre-aligned by the challenge organizers, and that each row represents a common sampling time step across all metrics.</p>
<p>To effectively train our machine learning model while accounting for the challenges posed by intermittent traffic&#x2014;particularly the prevalence of zero-valued entries due to beam inactivity or sleep modes&#x2014;we employed a Gated Recurrent Unit (GRU)-based architecture. GRUs are well-suited for time series modeling due to their ability to retain long-range dependencies while mitigating vanishing gradient issues. In this application, the GRU also serves as a preprocessing component that inherently filters noise and highlights salient sequential patterns, improving the model&#x2019;s ability to generalize beyond sparse signal artifacts.</p>
<p>The complete dataset, comprising four key features&#x2014;throughput volume, throughput time, physical resource block utilization, and user count&#x2014;was partitioned into training and testing segments to facilitate model development and evaluation. Specifically, each dataset was split into a training set, containing the first 4&#xa0;weeks of data, and a testing set, consisting of the fifth week. This partitioning strategy, illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>, was applied uniformly across all feature dimensions. The training data served as input for model fitting via the sliding window methodology, while the fifth-week data was reserved exclusively for out-of-sample testing and performance evaluation using standard metrics.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Training and testing data block diagram.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g007.tif">
<alt-text content-type="machine-generated">Flowchart showing a dataset split into training and testing data. The dataset (840 x 2880) divides into training data (four weeks) leading to a training methodology, and testing data (fifth week) leading to a testing methodology.</alt-text>
</graphic>
</fig>
<p>The training methodology, as illustrated in <xref ref-type="fig" rid="F8">Figure 8</xref>, is based on a sliding window approach. A fixed-size sequence length (temporal window) of input data (e.g., 168, 24 or 8 time steps) is shifted across the first 4&#xa0;weeks of each beam&#x2019;s sequence to generate supervised training samples. For each windowed input sequence <inline-formula id="inf53">
<mml:math id="m65">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, the model predicts the subsequent time steps <inline-formula id="inf54">
<mml:math id="m66">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>,&#x2026; as output targets. Each slide appends a new training label corresponding to the next beam-level time step, gradually forming the multi-output sequence-to-sequence learning format. This process ensures temporal consistency while maximizing the available training data from the historical record.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Sliding window strategy used in training and testing. Left: The GRU-based model is trained with incremental target sequences. Right: The pre-trained model recursively predicts future values during testing.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g008.tif">
<alt-text content-type="machine-generated">Flowchart illustrating training and testing methodologies for a machine learning model. On the left, training involves input data with sequences (X1 to X672) going through an ML model, producing outputs (Z1 to Z3) across three slides. On the right, testing uses last sequence lengths from training data as input to a trained model, generating outputs (Y1 to Y3) across three slides. Each methodology shows a step-by-step data processing flow.</alt-text>
</graphic>
</fig>
<p>During testing, shown in the right panel of <xref ref-type="fig" rid="F8">Figure 8</xref>, the final segment of the training dataset&#x2014;specifically the last window of the fourth week&#x2014;is used as the initial seed input. This seed sequence is fed into the pre-trained GRU-MTL model to generate predictions for the fifth week. The model recursively consumes its own predictions to extend the forecast horizon. That is, the first prediction <inline-formula id="inf55">
<mml:math id="m67">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is appended to the input window to produce <inline-formula id="inf56">
<mml:math id="m68">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and so on, until the desired forecast length is achieved. This autoregressive inference mechanism allows the model to operate in a fully closed-loop mode during deployment.</p>
<p>The overall workflow is further summarized in <xref ref-type="fig" rid="F9">Figures 9</xref>, <xref ref-type="fig" rid="F10">10</xref>, which respectively depict the matrix representation of the dataset and the block-level diagram of the testing pipeline. In <xref ref-type="fig" rid="F9">Figure 9</xref>, the top portion represents the full 4-week training dataset, while the bottom row (crosshatched) represents the fifth week, which serves as the ground truth for evaluating the model&#x2019;s predictive accuracy. The model&#x2019;s predictions <inline-formula id="inf57">
<mml:math id="m69">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are compared against this fifth-week ground truth [Y] using standard evaluation metrics such as Mean Squared Error (MSE), Mean Absolute Error (MAE), and Mean Absolute Percentage Error (MAPE).</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Matrix representation of the 5-week dataset. The first 4&#xa0;weeks serve as training input; the fifth week serves as testing ground truth for performance evaluation.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g009.tif">
<alt-text content-type="machine-generated">Table showing a data structure for time series analysis. Input and output sections are marked, with four weeks of training data and a fifth week for testing. Output from the training data is the prediction; Ground truth data are indicated with crosses.</alt-text>
</graphic>
</fig>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Block diagram of the testing setup: the final week of training is used as input to the pre-trained model, and the outputs are evaluated against the fifth week of data.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g010.tif">
<alt-text content-type="machine-generated">Flowchart depicting a machine learning process. The diagram shows &#x22;4th Week of Training Data&#x22; as input into a &#x22;Pre-trained Model&#x22; labeled &#x22;ML.&#x22; The model produces an &#x22;Output,&#x22; indicated by &#x22;&#x176;.&#x22; Below, &#x22;Ground Truth - Y&#x22; is specified for performance evaluation.</alt-text>
</graphic>
</fig>
<p>This structured and recursive approach enables the model to capture long-term trends and abrupt variations in beam-level traffic patterns while addressing the challenges of temporal sparsity. By coupling the sliding window training strategy with GRU-based sequential modeling and autoregressive forecasting in a multi-task learning framework, the proposed methodology offers a scalable and robust solution for spatio-temporal traffic prediction in 5G systems.</p>
<p>In this study, no explicit normalization or standardization was applied to the input data. The GRU model was trained directly on raw beam-level traffic sequences, which are inherently sparse and exhibit zero inflation due to intermittent user activity. This sparsity was intentionally preserved, as it accurately reflects real-world beam usage patterns and enables the model to capture the temporal structure without altering the original data distribution. Missing values were not imputed, and zeros were treated as meaningful observations rather than noise. Furthermore, no manual feature engineering was performed; instead, the GRU served as an end-to-end feature extractor, learning temporal dependencies directly from the sliding window sequences.</p>
</sec>
<sec id="s4-1-2">
<title>4.1.2 Hardware and software</title>
<p>All experiments were implemented in Python 3.8, using TensorFlow 2.12 for deep learning and Scikit-learn 1.0.2 for auxiliary preprocessing and evaluation tasks. Model training was performed on an NVIDIA DGX system equipped with four A100 GPUs, each with 80&#xa0;GB memory, enabling efficient handling of the high-dimensional input and accelerated training throughput.</p>
<p>To assess the practical deployability of our GRU-based model, we also evaluated its inference speed on a single NVIDIA A100 GPU. Despite the high spatial dimensionality of the input (2,880 features) and the sequential nature of the data, several design choices ensure efficient inference:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf58">
<mml:math id="m70">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Temporal-only recurrence&#x2013;The GRU processes only along the temporal axis (sequence length of 8&#x2013;12), treating the 2,880 spatial features as a flat vector at each timestep. This design avoids recurrent computations over the large spatial dimension, keeping runtime manageable.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf59">
<mml:math id="m71">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Lightweight GRU architecture&#x2013;GRUs were deliberately chosen over heavier models such as LSTMs or Transformers due to their reduced parameter count and faster runtime, enabling low-latency sequence modeling.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf60">
<mml:math id="m72">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Empirical inference performance&#x2013;On the A100 GPU, the average per-sample inference time (batch size &#x3d; 1, sequence length &#x3d; 8) was measured at approximately 2.7&#xa0;ms, comfortably meeting the real-time constraints of 5G beam-level operations, where decisions are typically required within 1&#x2013;10&#xa0;ms.</p>
</list-item>
</list>
</p>
<p>These results confirm that the proposed GRU-based model is not only accurate but also computationally efficient for real-time deployment in next-generation wireless networks.</p>
</sec>
<sec id="s4-1-3">
<title>4.1.3 Model configuration</title>
<p>The hyperparameters for the models, as detailed in <xref ref-type="table" rid="T1">Table 1</xref>, were meticulously tuned to achieve an optimal balance between complexity and performance. Diverse configurations, including variations in the number of layers, were explored to identify the most robust setup. Sequence lengths of 8, 24, and 168&#xa0;h were selected to effectively capture short-term fluctuations, daily trends, and weekly patterns, respectively, facilitating a comprehensive analysis of human behavioral dynamics.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Hyperparameters for the different models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Hyperparameters</th>
<th align="left">Linear regression</th>
<th align="left">DLinear</th>
<th align="left">XGBoost</th>
<th align="left">ESN</th>
<th align="left">LSTM</th>
<th align="left">MTL</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Input Sequence Length</td>
<td align="left">168, 24, 8</td>
<td align="left">168, 24, 8</td>
<td align="left">168, 24, 8</td>
<td align="left">168, 24, 8</td>
<td align="left">168, 24, 8</td>
<td align="left">168, 24, 8</td>
</tr>
<tr>
<td align="left">Hidden Dimension</td>
<td align="left">1,024</td>
<td align="left">1,024</td>
<td align="left">1,024</td>
<td align="left">1,024</td>
<td align="left">1,024</td>
<td align="left">1,024</td>
</tr>
<tr>
<td align="left">Network Architecture</td>
<td align="left">GRU &#x2b; Linear</td>
<td align="left">GRU &#x2b; Linear</td>
<td align="left">GRU &#x2b; XGBoost</td>
<td align="left">ESN core</td>
<td align="left">LSTM &#x2b; Linear</td>
<td align="left">GRU, Linear, Classifier</td>
</tr>
<tr>
<td align="left">Activation Function</td>
<td align="left">ReLU</td>
<td align="left">ReLU</td>
<td align="left">-</td>
<td align="left">tanh</td>
<td align="left">tanh</td>
<td align="left">ReLU/Linear</td>
</tr>
<tr>
<td align="left">Optimizer</td>
<td align="left">Adam</td>
<td align="left">Adam</td>
<td align="left">Adam</td>
<td align="left">Adam</td>
<td align="left">Adam</td>
<td align="left">Adam</td>
</tr>
<tr>
<td align="left">Learning Rate</td>
<td align="left">0.01</td>
<td align="left">0.0001</td>
<td align="left">0.1</td>
<td align="left">0.000001</td>
<td align="left">0.0001</td>
<td align="left">0.001</td>
</tr>
<tr>
<td align="left">Batch Size</td>
<td align="left">128</td>
<td align="left">128</td>
<td align="left">128</td>
<td align="left">128</td>
<td align="left">128</td>
<td align="left">128</td>
</tr>
<tr>
<td align="left">Epochs</td>
<td align="left">1,500</td>
<td align="left">500</td>
<td align="left">1,500</td>
<td align="left">5,000</td>
<td align="left">1,500</td>
<td align="left">1,500</td>
</tr>
<tr>
<td align="left">Evaluation Metrics</td>
<td align="left">MAE, MSE, RMSE</td>
<td align="left">MAE, MSE, RMSE</td>
<td align="left">MAE, MSE, RMSE</td>
<td align="left">MAE, MSE, RMSE</td>
<td align="left">MAE, MSE, RMSE</td>
<td align="left">MAE, MSE, RMSE</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-1-4">
<title>4.1.4 Performance metrics</title>
<p>Four widely adopted performance metrics&#x2014;Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), and Mean Absolute Percentage Error (MAPE)&#x2014;were used to evaluate model performance. While MSE, RMSE, and MAE provided consistent and interpretable results, the MAPE values were disproportionately high. This anomaly was primarily attributed to numerical instability caused by the presence of intermittent zero values in the ground truth, which can significantly inflate percentage-based errors when actual values approach or equal zero.</p>
</sec>
</sec>
<sec id="s4-2">
<title>4.2 Results</title>
<sec id="s4-2-1">
<title>4.2.1 Comparative analysis of forecasting model performance</title>
<p>
<xref ref-type="table" rid="T2">Table 2</xref> presents a comprehensive comparative analysis of six distinct GRU-based models for spatio-temporal beam-level traffic prediction: Linear Regression, DLinear, XGBoost, ESN, LSTM, and our proposed GRU-based MTL approach. The GRU-based MTL model demonstrated superior performance among individual models, achieving an MAE of 0.213631 for a sequence length of 168, though this value is slightly elevated compared to our baseline reference model as shown in <xref ref-type="table" rid="T3">Table 3</xref>. Notably, Linear Regression, XGBoost, and MTL emerged as the top-performing individual approaches, prompting their selection for our ensemble implementation. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, the weighted ensemble of these three models yielded a slight improvement, attaining an MAE of 0.210520 for the 168-h sequence length, representing a <inline-formula id="inf61">
<mml:math id="m73">
<mml:mrow>
<mml:mn>1.45</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> error reduction compared to the best standalone model. The shorter sequence lengths of 24 and 8&#xa0;h showed less improvement and were excluded from the ensemble analysis. The enhancement stems from the ensemble&#x2019;s ability to: (1) reduce variance through prediction aggregation, (2) compensate for individual model biases via weighted combination, and (3) improve generalization across diverse traffic conditions. The consistent outperformance across all temporal scales suggests particular robustness for real-time network optimization applications where prediction stability across varying time horizons is crucial.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Performance metrics for different models and sequence lengths.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Metrics</th>
<th align="left">Sequence length</th>
<th align="left">Linear regression</th>
<th align="left">DLinear</th>
<th align="left">XGBoost</th>
<th align="left">ESN</th>
<th align="left">LSTM</th>
<th align="left">MTL</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">MAE</td>
<td align="left">168</td>
<td align="left">0.218503</td>
<td align="left">0.238085</td>
<td align="left">0.230860</td>
<td align="left">0.264141</td>
<td align="left">0.355919</td>
<td align="left">0.213631</td>
</tr>
<tr>
<td align="left">24</td>
<td align="left">0.239661</td>
<td align="left">0.286313</td>
<td align="left">0.225731</td>
<td align="left">0.264349</td>
<td align="left">0.301782</td>
<td align="left">0.300096</td>
</tr>
<tr>
<td align="left">8</td>
<td align="left">0.277397</td>
<td align="left">0.274060</td>
<td align="left">0.227612</td>
<td align="left">0.268062</td>
<td align="left">0.316045</td>
<td align="left">0.282097</td>
</tr>
<tr>
<td rowspan="3" align="left">MSE</td>
<td align="left">168</td>
<td align="left">0.273377</td>
<td align="left">0.324395</td>
<td align="left">0.299680</td>
<td align="left">0.389497</td>
<td align="left">0.743190</td>
<td align="left">0.249026</td>
</tr>
<tr>
<td align="left">24</td>
<td align="left">0.361508</td>
<td align="left">0.550637</td>
<td align="left">0.291654</td>
<td align="left">0.367124</td>
<td align="left">0.503964</td>
<td align="left">0.733608</td>
</tr>
<tr>
<td align="left">8</td>
<td align="left">0.479330</td>
<td align="left">0.398813</td>
<td align="left">0.270044</td>
<td align="left">0.383942</td>
<td align="left">0.587777</td>
<td align="left">0.494261</td>
</tr>
<tr>
<td rowspan="3" align="left">RMSE</td>
<td align="left">168</td>
<td align="left">0.522855</td>
<td align="left">0.569557</td>
<td align="left">0.547430</td>
<td align="left">0.624097</td>
<td align="left">0.862085</td>
<td align="left">0.499025</td>
</tr>
<tr>
<td align="left">24</td>
<td align="left">0.601255</td>
<td align="left">0.742049</td>
<td align="left">0.540050</td>
<td align="left">0.605908</td>
<td align="left">0.709904</td>
<td align="left">0.856509</td>
</tr>
<tr>
<td align="left">8</td>
<td align="left">0.692336</td>
<td align="left">0.631516</td>
<td align="left">0.519658</td>
<td align="left">0.619631</td>
<td align="left">0.766666</td>
<td align="left">0.703037</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Baseline models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Target</th>
<th align="left">Hist. Avg</th>
<th align="left">iTransformer</th>
<th align="left">PatchTST</th>
<th align="left">DLinear</th>
<th align="left">Transformer</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Test 1</td>
<td align="left">0.2108</td>
<td align="left">0.1967</td>
<td align="left">0.1973</td>
<td align="left">0.2005</td>
<td align="left">0.2166</td>
</tr>
<tr>
<td align="left">Test 2</td>
<td align="left">0.2431</td>
<td align="left">0.2348</td>
<td align="left">0.2343</td>
<td align="left">0.2352</td>
<td align="left">0.2331</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F11">Figures 11</xref>, <xref ref-type="fig" rid="F12">12</xref> illustrate the beam-level traffic prediction performance of the individual models for a representative test instance (Sample 111). Each plot displays the first 100 dimensions of the traffic volume vector, with the true observed values shown by the blue line and the corresponding predicted values by the orange line. The visual comparison demonstrates how well each model captures the location and magnitude of peaks as well as the sparse inactive intervals.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Ground truth vs. model prediction across various models-1.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g011.tif">
<alt-text content-type="machine-generated">Four line graphs compare Ground Truth with predictions from Dlinear, XGBoost, Linear Regression, and MTL models for Sample 111 over 100 dimensions. Each graph shows two lines, blue for Ground Truth and orange for Prediction, with varying degrees of overlap and divergence. Peaks and variations are observed in different dimensions across the models.</alt-text>
</graphic>
</fig>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Ground truth vs. model prediction across various models-2.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g012.tif">
<alt-text content-type="machine-generated">Line graphs comparing Ground Truth with model predictions for Sample 111. The left graph shows ESN model predictions, while the right shows LSTM model predictions. Both graphs display the first 100 dimensions, with blue lines for Ground Truth and orange lines for Predictions. Peaks and variances in both graphs are illustrated.</alt-text>
</graphic>
</fig>
<p>Specifically, these figures highlight the relative strengths and weaknesses of each approach in approximating the highly irregular and bursty activation patterns typical of beam-level traffic. For example, the GRU-ESN and LSTM models tend to smooth some extreme spikes but generally follow the overall trend. The GRU-XGBoost and GRU-DLinear models capture sharper transitions more accurately but may slightly overfit to noise in certain regions. Meanwhile, the GRU-MTL and GRU-Linear configurations demonstrate solid baseline performance by maintaining consistency in low-activity regions.</p>
<p>
<xref ref-type="fig" rid="F13">Figure 13</xref> shows the prediction result for the proposed Ensemble Model, which combines the outputs of selected base models. As seen in this comparison, the ensemble prediction achieves better alignment with the ground truth across both the high-amplitude spikes and flat regions, reflecting the benefit of aggregating multiple models to reduce individual prediction bias and variance.</p>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>Ground truth vs. ensemble prediction.</p>
</caption>
<graphic xlink:href="frcmn-06-1658461-g013.tif">
<alt-text content-type="machine-generated">Line graph comparing ground truth and ensemble model prediction for Sample 111 over the first 100 dimensions. The blue line represents ground truth, and the orange line represents predictions. Both lines show peaks and troughs at similar points, indicating close alignment.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-2-2">
<title>4.2.2 The impact of varying classification and regression loss weights</title>
<p>To better understand the contributions of different components of the proposed framework, we conducted an ablation study. This analysis isolates and evaluates the impact of key architectural choices and input configurations on forecasting performance. Specifically, we examined the effect of varying the classification and regression loss weights:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf62">
<mml:math id="m74">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Classification-only model (<inline-formula id="inf63">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf64">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>): This model was able to reasonably detect active beams but failed to accurately estimate traffic intensity. As expected, the absence of regression supervision led to a significantly higher error: MAE &#x3d; 0.398, MSE &#x3d; 0.667, RMSE &#x3d; 0.816.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf65">
<mml:math id="m77">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Regression-only model (<inline-formula id="inf66">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf67">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>): This configuration allowed the model to estimate traffic volumes reasonably well for active beams, but it struggled to distinguish inactive beams. This resulted in numerous false positives and suboptimal resource allocation. Performance was also subpar: MAE &#x3d; 0.257, MSE &#x3d; 0.405, RMSE &#x3d; 0.636.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf68">
<mml:math id="m80">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Balanced multitask model (<inline-formula id="inf69">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf70">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>): This yielded the best performance overall, with accurate traffic prediction and robust beam activation classification: MAE &#x3d; 0.213, MSE &#x3d; 0.249, RMSE &#x3d; 0.499. These results suggest a strong complementary effect between the two tasks.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf71">
<mml:math id="m83">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Varying <inline-formula id="inf72">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> while fixing <inline-formula id="inf73">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>: To further understand the trade-offs, we experimented with multiple classification loss weights:</p>
<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf74">
<mml:math id="m86">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf75">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> MAE &#x3d; 0.218, MSE &#x3d; 0.272, RMSE &#x3d; 0.521</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf76">
<mml:math id="m88">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf77">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> MAE &#x3d; 0.219, MSE &#x3d; 0.272, RMSE &#x3d; 0.521</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf78">
<mml:math id="m90">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf79">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.0</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> MAE &#x3d; 0.224, MSE &#x3d; 0.281, RMSE &#x3d; 0.531</p>
</list-item>
</list>
</list-item>
</list>
</p>
<p>We observed that moderate deviations in the weighting factor had only marginal effects. However, the 1:1 ratio consistently yielded the most balanced performance, reinforcing its selection in our primary experiments.</p>
<p>The results in <xref ref-type="table" rid="T4">Table 4</xref> support our hypothesis that joint learning of beam activity and traffic intensity enables better generalization, especially in cases where beam activation and usage intensity are only weakly correlated.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Effect of Varying Classification and Regression Loss Weights on Multi-task Model Performance. Best results are bolded.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Configuration</th>
<th align="left">MAE</th>
<th align="left">MSE</th>
<th align="left">RMSE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<bold>Classification-only</bold> (<inline-formula id="inf80">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf81">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>)</td>
<td align="left">0.398</td>
<td align="left">0.667</td>
<td align="left">0.816</td>
</tr>
<tr>
<td align="left">
<bold>Regression-only</bold> (<inline-formula id="inf82">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf83">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>)</td>
<td align="left">0.257</td>
<td align="left">0.405</td>
<td align="left">0.636</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf84">
<mml:math id="m96">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf85">
<mml:math id="m97">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">0.218</td>
<td align="left">0.272</td>
<td align="left">0.521</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf86">
<mml:math id="m98">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf87">
<mml:math id="m99">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">0.219</td>
<td align="left">0.272</td>
<td align="left">0.521</td>
</tr>
<tr>
<td align="left">
<bold>Balanced Multitask</bold> (<inline-formula id="inf88">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf89">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>)</td>
<td align="left">
<bold>0.213</bold>
</td>
<td align="left">
<bold>0.249</bold>
</td>
<td align="left">
<bold>0.499</bold>
</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf90">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cls</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf91">
<mml:math id="m103">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">0.224</td>
<td align="left">0.281</td>
<td align="left">0.531</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-2-3">
<title>4.2.3 The impact of sequence length on model performance</title>
<p>Our analysis highlights a critical interaction between input sequence length, data sparsity, and prediction accuracy in beam-level traffic forecasting for 5G networks. As summarized in <xref ref-type="table" rid="T2">Table 2</xref>, the 168-h sequence length outperforms the shorter windows for most models; in particular, the MTL model achieves an MAE of 0.213631, compared to 0.300096 for the 24-h window and 0.282097 for the 8-h window. This performance gradient directly reflects the underlying sparsity of the dataset, where approximately <inline-formula id="inf92">
<mml:math id="m104">
<mml:mrow>
<mml:mn>32</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of beam&#x2013;time pairs exhibit complete inactivity (zero traffic volume).</p>
<p>The superior performance of longer sequence lengths can be attributed to three primary factors:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf93">
<mml:math id="m105">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <italic>Sparsity Mitigation:</italic> Given the high proportion of zero-inflated observations, shorter input sequences&#x2014;particularly 8-h windows&#x2014;are prone to containing entirely inactive periods, which limits the model&#x2019;s ability to learn meaningful temporal patterns. In contrast, a 168-h window increases the likelihood of capturing both active and inactive states within each sample, thereby providing a richer context and reducing the risk of all-zero inputs.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf94">
<mml:math id="m106">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <italic>Temporal Context Preservation:</italic> Beam-level traffic in 5G networks often follows multi-scale temporal patterns, including strong weekly periodicity modulating daily variations. Longer input windows preserve these broader temporal dynamics, which is critical for modeling intermittent beams whose activation aligns more closely with weekly user behavior than with short-term fluctuations.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf95">
<mml:math id="m107">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <italic>Statistical Stability:</italic> Longer sequences benefit from improved statistical reliability. While an 8-h window may yield only a few active samples for sparse beams, a 168-h sequence typically contains sufficient active observations to support more robust feature learning. This greater statistical stability helps explain the observed <inline-formula id="inf96">
<mml:math id="m108">
<mml:mrow>
<mml:mn>32</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> increase in MAE for the shortest sequence length compared to the longest one.</p>
</list-item>
</list>
</p>
<p>Taken together, these findings underscore the importance of selecting an input sequence length that adequately balances temporal coverage and sparsity effects to achieve accurate and reliable beam-level traffic predictions.</p>
</sec>
<sec id="s4-2-4">
<title>4.2.4 The impact of LSTM and ESN on time-series forecast</title>
<p>The results in <xref ref-type="table" rid="T2">Table 2</xref> reveal significant limitations of both LSTM (MAE &#x3d; 0.355919, 0.301782, 0.316045) and ESN (MAE &#x3d; 0.264141, 0.264349, 0.268062) across sequence lengths of 168, 24, and 8&#xa0;h, respectively, establishing them as the poorest performers in our beam-level traffic forecasting task. These results align with the theoretical framework presented by <xref ref-type="bibr" rid="B42">Zeng et al. (2023)</xref> in their seminal work &#x201c;Are Transformers Effective for Time Series Forecasting?&#x201d;, which demonstrates that: (1) LSTMs tend to underperform in sparse traffic scenarios due to their difficulty in learning long-term dependencies from limited active beams, and (2) ESNs struggle with the non-stationary characteristics of cellular traffic patterns. Our empirical results extend their conclusions by quantifying these limitations specifically for beam-level prediction, where the MAE values for both architectures consistently exceeded other models by 18&#x2013;<inline-formula id="inf97">
<mml:math id="m109">
<mml:mrow>
<mml:mn>23</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> across all tested sequence lengths.</p>
</sec>
<sec id="s4-2-5">
<title>4.2.5 The impact of ensemble model</title>
<p>The superior performance of our ensemble model, shown in <xref ref-type="table" rid="T5">Table 5</xref> (MAE &#x3d; 0.210520 for the 168-h sequence length), demonstrates three key advantages over standalone architectures in beam-level traffic prediction. First, the ensemble&#x2019;s weighted aggregation of Linear Regression, XGBoost, and MTL outputs reduces prediction variance, lowering the MAE by <inline-formula id="inf98">
<mml:math id="m110">
<mml:mrow>
<mml:mn>1.45</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> compared to the best individual model (MTL), mitigating the overfitting tendencies observed in complex nonlinear architectures (<xref ref-type="bibr" rid="B46">Zhu et al., 2024</xref>). Second, the model compensates for individual biases&#x2014;linear assumptions in Regression versus tree-based partitioning in XGBoost&#x2014;through dynamic weighting calibrated to beam activation patterns (Wang et al., 2022). This explains the <inline-formula id="inf99">
<mml:math id="m111">
<mml:mrow>
<mml:mn>60</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> MAE reduction at 8-h sequences, where short-term traffic bursts benefit from XGBoost&#x2019;s granular splits while periodicity is captured by MTL&#x2019;s recurrent cells. Third, the ensemble achieves temporal adaptability: its 168-h performance (0.210520 MAE) surpasses LSTM/ESN results by <inline-formula id="inf100">
<mml:math id="m112">
<mml:mrow>
<mml:mn>45</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, proving robust to sparse long-range dependencies that typically degrade RNNs (<xref ref-type="bibr" rid="B42">Zeng et al., 2023</xref>).</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Performance metrics (MAE, MSE, and RMSE) of the best standalone model and the ensemble model.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Sequence length</th>
<th align="left">MAE</th>
<th align="left">MSE</th>
<th align="left">RMSE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">168 (without ensemble)</td>
<td align="left">0.213631</td>
<td align="left">0.249026</td>
<td align="left">0.499025</td>
</tr>
<tr>
<td align="left">168 (with ensemble)</td>
<td align="left">0.210520</td>
<td align="left">0.246709</td>
<td align="left">0.496698</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>This study establishes an effective framework for beam-level traffic forecasting in 5G networks through a Multi-Task Learning approach enhanced by ensemble techniques. Our analysis of six models (Linear Regression, DLinear, XGBoost, ESN, LSTM, and GRU-MTL) revealed that the GRU-based MTL architecture achieved superior performance (MAE &#x3d; 0.2136 for 168-h sequences), with further improvement (<inline-formula id="inf101">
<mml:math id="m113">
<mml:mrow>
<mml:mn>1.45</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> error reduction to MAE &#x3d; 0.2105) when combined with Linear Regression and XGBoost in a weighted ensemble. Three key findings emerge:<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf102">
<mml:math id="m114">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Temporal Context Matters: The 168-h sequence length proved most effective, capturing weekly traffic patterns critical for infrastructure planning.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf103">
<mml:math id="m115">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Simplicity Complements Complexity: Although GRU-MTL outperformed LSTM by more than <inline-formula id="inf104">
<mml:math id="m116">
<mml:mrow>
<mml:mn>20</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, its combination with simpler models (Linear Regression/XGBoost) yielded more robust predictions.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf105">
<mml:math id="m117">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Practical Viability: The ensemble&#x2019;s consistent accuracy across sparse beam conditions (<inline-formula id="inf106">
<mml:math id="m118">
<mml:mrow>
<mml:mn>31.8</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> zeros) supports real-world deployment.</p>
</list-item>
</list>
</p>
<p>These results enable proactive resource allocation as the 168-h model&#x2019;s stability aids capacity planning. The ensemble weighting reduces overfitting risks in dynamic conditions.</p>
<p>In this study, our focus was on establishing a strong deterministic GRU-based baseline optimized for accuracy and real-time performance. Consequently, we did not include uncertainty quantification mechanisms (e.g., prediction intervals, Bayesian inference, or ensemble variance). However, we acknowledge that in practical scenarios&#x2014;particularly in proactive resource allocation and anomaly detection&#x2014;the reliability of predictions is as important as their accuracy.</p>
<p>Future work will therefore extend this framework by integrating probabilistic forecasting techniques, such as Monte Carlo dropout, deep ensembles, or Bayesian recurrent units, to provide calibrated uncertainty estimates alongside point predictions. Additionally, we plan to incorporate finer temporal granularity and expand feature usage (e.g., PRB utilization, throughput time, user count) to further improve generalization. This evolution of the framework bridges theoretical modeling with operational needs in 5G networks, offering a balanced and forward-looking solution for accuracy, reliability, and interpretability.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://drive.google.com/drive/folders/1KQ1HnBI5Pq7TrUtqvk7owF_iImDpxZ_d?usp&#x3d;sharing">https://drive.google.com/drive/folders/1KQ1HnBI5Pq7TrUtqvk7owF_iImDpxZ_d?usp&#x3d;sharing</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>IT: Formal Analysis, Writing &#x2013; original draft, Project administration, Writing &#x2013; review and editing. TA: Software, Writing &#x2013; review and editing. XL: Conceptualization, Validation, Writing &#x2013; review and editing. LQ: Validation, Conceptualization, Supervision, Writing &#x2013; review and editing, Funding acquisition, Resources.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This research work is supported by the U.S. Army Research Office (ARO) under grant number W911NF-23-1-0214 and the U.S. National Science Foundation (NSF) under award numbers 2128482, 2302469, and 2428761.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Author disclaimer</title>
<p>The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of the ARO, NSF, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for Government purposes notwithstanding any copyright notation herein.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web">
<collab>3GPP</collab> (<year>2022</year>). <article-title>3gpp release 17 description</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.3gpp.org/release-17">https://www.3gpp.org/release-17</ext-link> (Accessed June 20, 2024)</comment>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Agiwal</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Saxena</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Next generation 5g wireless networks: a comprehensive survey</article-title>. <source>IEEE Commun. Surv. and Tutorials</source> <volume>18</volume> (<issue>3</issue>), <fpage>1617</fpage>&#x2013;<lpage>1655</lpage>. <pub-id pub-id-type="doi">10.1109/comst.2016.2532458</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="web">
<collab>AIforGood-ITU</collab> (<year>2024</year>). <article-title>Spatio-temporal beam-level traffic forecasting challenge by itu</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://zindi.africa/competitions/spatio-temporal-beam-level-traffic-forecasting-challenge">https://zindi.africa/competitions/spatio-temporal-beam-level-traffic-forecasting-challenge</ext-link> (Accessed December 10, 2024)</comment>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alsabah</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Naser</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Mahmmod</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Abdulhussain</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Eissa</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Al-Baidhani</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>6G wireless communications networks: a comprehensive survey</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>148191</fpage>&#x2013;<lpage>148243</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3124812</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alrabeiah</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alkhateeb</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Outage-based beamforming for robust 6g millimeter-wave communication</article-title>. <source>IEEE Trans. Commun.</source> <volume>70</volume> (<issue>5</issue>), <fpage>3312</fpage>&#x2013;<lpage>3326</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Box</surname>
<given-names>G. E.</given-names>
</name>
<name>
<surname>Jenkins</surname>
<given-names>G. M.</given-names>
</name>
<name>
<surname>Reinsel</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Ljung</surname>
<given-names>G. M.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Time series analysis: forecasting and control</source>. <publisher-name>John Wiley and Sons</publisher-name>.</citation>
</ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Brockwell</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>R. A.</given-names>
</name>
</person-group> (<year>2002</year>). <source>Introduction to time series and forecasting</source>. <publisher-name>Springer</publisher-name>.</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Guestrin</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>XGBoost: a scalable tree boosting system</article-title>,&#x201d; in <source>Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining</source>, <fpage>785</fpage>&#x2013;<lpage>794</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Multitask learning and gcn-based taxi demand prediction for a traffic road network</article-title>. <source>Sensors</source> <volume>20</volume> (<issue>13</issue>), <fpage>3776</fpage>. <pub-id pub-id-type="doi">10.3390/s20133776</pub-id>
<pub-id pub-id-type="pmid">32635669</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cho</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Van Merri&#xeb;nboer</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gulcehre</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bahdanau</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bougares</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schwenk</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Learning phrase representations using rnn encoder-decoder for statistical machine translation</article-title>. <source>arXiv Prepr</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1406.1078</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="web">
<collab>Cisco</collab> (<year>2020</year>). <article-title>Cisco annual internet report (2018&#x2013;2023)</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.cisco.com/c/en/us/solutions/executive-perspectives/annual-internet-report/index.html">https://www.cisco.com/c/en/us/solutions/executive-perspectives/annual-internet-report/index.html</ext-link> (Accessed June 20, 2024)</comment>.</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Collobert</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Weston</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>A unified architecture for natural language processing: deep neural networks with multitask learning</article-title>,&#x201d; in <source>Proceedings of the 25th international conference on machine learning</source>, <fpage>160</fpage>&#x2013;<lpage>167</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Evgeniou</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Pontil</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Regularized multi-task learning</article-title>,&#x201d; in <source>Proceedings of the tenth ACM SIGKDD international conference on knowledge discovery and data mining</source>, <fpage>109</fpage>&#x2013;<lpage>117</lpage>.</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Using lstm and gru neural network methods for traffic flow prediction</article-title>,&#x201d; in <source>2016 31st youth academic annual conference of Chinese association of automation (YAC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>324</fpage>&#x2013;<lpage>328</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gallicchio</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Micheli</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pedrelli</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Design of deep echo state networks</article-title>. <source>Neural Netw.</source> <volume>108</volume>, <fpage>33</fpage>&#x2013;<lpage>47</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2018.08.002</pub-id>
<pub-id pub-id-type="pmid">30138751</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gardner</surname>
<given-names>E. S.</given-names>
<suffix>Jr</suffix>
</name>
</person-group> (<year>2006</year>). <article-title>Exponential smoothing: the state of the art&#x2014;part ii</article-title>. <source>Int. J. Forecast.</source> <volume>22</volume> (<issue>4</issue>), <fpage>637</fpage>&#x2013;<lpage>666</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijforecast.2006.03.005</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Giordani</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Polese</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mezzavilla</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rangan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zorzi</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Toward 6g networks: use cases and technologies</article-title>. <source>IEEE Commun. Mag.</source> <volume>58</volume> (<issue>3</issue>), <fpage>55</fpage>&#x2013;<lpage>61</lpage>. <pub-id pub-id-type="doi">10.1109/mcom.001.1900411</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Greff</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Srivastava</surname>
<given-names>R. K.</given-names>
</name>
<name>
<surname>Koutn&#xed;k</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Steunebrink</surname>
<given-names>B. R.</given-names>
</name>
<name>
<surname>Schmidhuber</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>LSTM: a search space odyssey</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>28</volume> (<issue>10</issue>), <fpage>2222</fpage>&#x2013;<lpage>2232</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2016.2582924</pub-id>
<pub-id pub-id-type="pmid">27411231</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hyndman</surname>
<given-names>R. J.</given-names>
</name>
<name>
<surname>Athanasopoulos</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Forecasting: principles and practice</source>. <publisher-name>Melbourne, Australia: OTexts</publisher-name>.</citation>
</ref>
<ref id="B20">
<citation citation-type="book">
<collab>ITU Radiocommunication Sector</collab> (<year>2020</year>). <source>IMT vision &#x2013; framework and overall objectives of the future development of IMT for 2020 and beyond</source>. <publisher-name>ITU-R M.2083</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.itu.int/en/ITU-R/Pages/default.aspx">https://www.itu.int/en/ITU-R/Pages/default.aspx</ext-link> (Accessed June 20, 2024)</comment>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lai</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>W.-C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Modeling long-and short-term temporal patterns with deep neural networks</article-title>,&#x201d; in <source>The 41st international ACM SIGIR conference on research and development in information retrieval</source>, <fpage>95</fpage>&#x2013;<lpage>104</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lambert</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>1992</year>). <article-title>Zero-inflated poisson regression, with an application to defects in manufacturing</article-title>. <source>Technometrics</source> <volume>34</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.2307/1269547</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lim</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zohren</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Time-series forecasting with deep learning</article-title>. <source>Philosophical Trans. Math. Phys. Eng. Sci.</source> <volume>379</volume> (<issue>2194</issue>), <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1098/rsta.2020.0209</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hui</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Method for multi-task learning fusion network traffic classification to address small sample labels</article-title>. <source>Sci. Rep.</source> <volume>14</volume> (<issue>1</issue>), <fpage>2518</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-51933-8</pub-id>
<pub-id pub-id-type="pmid">38291098</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rago</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Piro</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Boggia</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Dini</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Multi-task learning at the mobile edge: an effective way to combine traffic classification and prediction</article-title>. <source>IEEE Trans. Veh. Technol.</source> <volume>69</volume> (<issue>9</issue>), <fpage>10362</fpage>&#x2013;<lpage>10374</lpage>. <pub-id pub-id-type="doi">10.1109/tvt.2020.3005724</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ramakrishnan</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Soni</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Network traffic prediction using recurrent neural networks</article-title>,&#x201d; in <source>2018 17th IEEE international conference on machine learning and applications (ICMLA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>187</fpage>&#x2013;<lpage>193</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rappaport</surname>
<given-names>T. S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mayzus</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Azar</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Millimeter wave mobile communications for 5G cellular: it will work</article-title>. <source>IEEE Access</source> <volume>1</volume>, <fpage>335</fpage>&#x2013;<lpage>349</lpage>. <pub-id pub-id-type="doi">10.1109/access.2013.2260813</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rappaport</surname>
<given-names>T. S.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kanhere</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Ju</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Madanayake</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Wireless communications and applications above 100 ghz: opportunities and challenges for 6g and beyond</article-title>,&#x201d; in <source>IEEE international conference on communications (ICC)</source>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rau</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Soto</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Zabala-Blanco</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Azurdia-Meza</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ijaz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ekpo</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>A novel traffic prediction method using machine learning for energy efficiency in service provider networks</article-title>. <source>Sensors</source> <volume>23</volume> (<issue>11</issue>), <fpage>4997</fpage>. <pub-id pub-id-type="doi">10.3390/s23114997</pub-id>
<pub-id pub-id-type="pmid">37299722</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rohde</surname>
<given-names>U. L.</given-names>
</name>
<name>
<surname>Poddar</surname>
<given-names>A. K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>5g beamforming and its impact on wireless system design</article-title>. <source>IEEE Microw. Mag.</source> <volume>19</volume> (<issue>8</issue>), <fpage>56</fpage>&#x2013;<lpage>70</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ruder</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>An overview of multi-task learning in deep neural networks</article-title>. <source>arXiv Prepr</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1706.05098</pub-id>.</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Siami-Namini</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tavakoli</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Siami Namin</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>A comparison of arima and lstm in forecasting time series</article-title>,&#x201d; in <source>17th IEEE international conference on machine learning and applications (ICMLA)</source>, <fpage>1394</fpage>&#x2013;<lpage>1401</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silveira Gontijo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Azevedo Costa</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Forecasting hierarchical time series in power generation</article-title>. <source>Energies</source> <volume>13</volume> (<issue>14</issue>), <fpage>3722</fpage>. <pub-id pub-id-type="doi">10.3390/en13143722</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Spatio-temporal cellular network traffic prediction using multi-task deep learning for AI-enabled 6G</article-title>. <source>J. Beijing Inst. Technol.</source> <volume>31</volume> (<issue>5</issue>), <fpage>441</fpage>&#x2013;<lpage>453</lpage>. <pub-id pub-id-type="doi">10.15918/j.jbit1004-0579.2022.065</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Teunter</surname>
<given-names>R. H.</given-names>
</name>
<name>
<surname>Syntetos</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Babai</surname>
<given-names>M. Z.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Intermittent demand: linking forecasting to inventory obsolescence</article-title>. <source>Eur. J. Operational Res.</source> <volume>214</volume> (<issue>3</issue>), <fpage>606</fpage>&#x2013;<lpage>615</lpage>. <pub-id pub-id-type="doi">10.1016/j.ejor.2011.05.018</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>Internet traffic classification using machine learning: a token-based approach</article-title>,&#x201d; in <source>2011 14th IEEE international conference on computational science and engineering</source>, <fpage>285</fpage>&#x2013;<lpage>289</lpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nie</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ning</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A multitask learning-based network traffic prediction approach for sdn-enabled industrial internet of things</article-title>. <source>IEEE Trans. Industrial Inf.</source> <volume>18</volume> (<issue>11</issue>), <fpage>7475</fpage>&#x2013;<lpage>7483</lpage>. <pub-id pub-id-type="doi">10.1109/tii.2022.3141743</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Nan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Dp-let: an efficient spatio-temporal network traffic prediction framework</article-title>. <source>arXiv Prepr</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2504.03792</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A survey on machine learning-based traffic prediction in cellular networks</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>76112</fpage>&#x2013;<lpage>76135</lpage>.</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Multi-scale spatio-temporal attention networks for network-scale traffic learning and forecasting</article-title>. <source>Sensors</source> <volume>24</volume> (<issue>17</issue>), <fpage>5543</fpage>. <pub-id pub-id-type="doi">10.3390/s24175543</pub-id>
<pub-id pub-id-type="pmid">39275454</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Spatio-temporal graph convolutional networks: a deep learning framework for traffic forecasting</article-title>. <source>arXiv Prepr</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1709.04875</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Are transformers effective for time series forecasting?</article-title> <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>37</volume> (<issue>9</issue>), <fpage>11121</fpage>&#x2013;<lpage>11128</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v37i9.26317</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A survey on multi-task learning</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>34</volume> (<issue>12</issue>), <fpage>5586</fpage>&#x2013;<lpage>5609</lpage>. <pub-id pub-id-type="doi">10.1109/tkde.2021.3070203</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Deep spatio-temporal residual networks for citywide crowd flows prediction</article-title>,&#x201d; in <conf-name>Proceedings of the AAAI conference on artificial intelligence</conf-name>, <volume>31</volume> (<issue>1</issue>). <pub-id pub-id-type="doi">10.1609/aaai.v31i1.10735</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Artificial intelligence-driven network traffic forecasting: a survey</article-title>. <source>IEEE Commun. Surv. and Tutorials</source> <volume>25</volume> (<issue>2</issue>), <fpage>1234</fpage>&#x2013;<lpage>1259</lpage>.</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>An adaptive ensemble learning paradigm with spatial-temporal feature extraction for wireless traffic prediction</article-title>. <source>IEEE Trans. Netw. Serv. Manag.</source> <volume>22</volume>, <fpage>1727</fpage>&#x2013;<lpage>1743</lpage>. <pub-id pub-id-type="doi">10.1109/tnsm.2024.3522115</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>