<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Future Transp.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Future Transportation</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Future Transp.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2673-5210</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1644979</article-id>
<article-id pub-id-type="doi">10.3389/ffutr.2026.1644979</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep heterogeneity learning for cross-city transit forecasting: a differentially private federated framework with mixture-of-experts and seasonal decomposition</article-title>
<alt-title alt-title-type="left-running-head">Sakhipov et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/ffutr.2026.1644979">10.3389/ffutr.2026.1644979</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Sakhipov</surname>
<given-names>Aivar</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3149179"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Uzdenbayev</surname>
<given-names>Zhanbai</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Begisbayev</surname>
<given-names>Diar</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3072703"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mektepbayeva</surname>
<given-names>Aruzhan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Seiitbek</surname>
<given-names>Ramazan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yedilkhan</surname>
<given-names>Didar</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>School of Software Engineering, Astana IT University</institution>, <city>Astana</city>, <country country="KZ">Kazakhstan</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Department of Information Technology and Artificial Intelligence, Zhetysu University Named After Ilyas Zhansugurov</institution>, <city>Taldykorgan</city>, <country country="KZ">Kazakhstan</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Aivar Sakhipov, <email xlink:href="mailto:aivar.sakhipov@astanait.edu.kz">aivar.sakhipov@astanait.edu.kz</email>; Zhanbai Uzdenbayev, <email xlink:href="mailto:uzdenbayev.zh@zhetysuu.edu.kz">uzdenbayev.zh@zhetysuu.edu.kz</email>; Diar Begisbayev, <email xlink:href="mailto:begisbayev@gmail.com">begisbayev@gmail.com</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-27">
<day>27</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>7</volume>
<elocation-id>1644979</elocation-id>
<history>
<date date-type="received">
<day>11</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>25</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Sakhipov, Uzdenbayev, Begisbayev, Mektepbayeva, Seiitbek and Yedilkhan.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Sakhipov, Uzdenbayev, Begisbayev, Mektepbayeva, Seiitbek and Yedilkhan</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-27">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Accurate prediction of transit flows is fundamental to optimizing intelligent transportation systems; however, centralized forecasting is frequently obstructed by heterogeneous, Non-Independent and Identically Distributed (Non-IID) cross-city data and stringent data privacy regulations.</p>
</sec>
<sec>
<title>Methods</title>
<p>We propose X-FedFormer, a novel framework integrating Federated Learning (FL) with Differential Privacy (DP) and a deep learning architecture combining a Mixture-of-Experts (MoE) mechanism with a Seasonal-Trend Decomposition module. The framework is evaluated on a statistically validated synthetic dataset faithfully simulating realistic inflow and outflow patterns across ten diverse urban environments (90 days of hourly records, 30 routes per city, 64,800 observations per city).</p>
</sec>
<sec>
<title>Results</title>
<p>X-FedFormer significantly outperforms state-of-the-art federated baselines including FedProx, achieving an aggregate coefficient of determination of 0.922 and a mean absolute error (MAE) of 7.93 passengers across all participating cities. A Wilcoxon signed-rank test confirms statistical significance over the strongest baseline (p &#x003D; 0.018). Ablation studies confirm that the MoE and seasonal decomposition modules reduce forecasting error by approximately 11% and 16%, respectively, compared to standard architectures.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The model maintains high predictive utility even under strict differential privacy guarantees (&#x3b5; &#x2248; 2), establishing a viable privacy-utility operating point for practical deployment. These findings present a scalable, robust solution for urban computing that effectively balances algorithmic performance with data sovereignty in smart city applications.</p>
</sec>
</abstract>
<kwd-group>
<kwd>differential privacy</kwd>
<kwd>federated learning</kwd>
<kwd>mixture-of-experts</kwd>
<kwd>seasonal-trend decomposition</kwd>
<kwd>smart cities</kwd>
<kwd>traffic flow forecasting</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was funded by the Committee of Science of the Ministry of Science and Higher Education of the Republic of Kazakhstan, Grant No. BR24992852. This is the sole funding source. The funder had no involvement in study design, data collection, analysis, interpretation, or the decision to publish.</funding-statement>
</funding-group>
<counts>
<fig-count count="13"/>
<table-count count="11"/>
<equation-count count="30"/>
<ref-count count="32"/>
<page-count count="22"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Transportation Systems Modeling</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>The escalating global urbanization trend and the concomitant surge in demand for efficient urban mobility necessitate the development of highly accurate and real-time transit flow forecasting systems. Precise predictions of passenger dynamics are paramount for optimizing public transportation infrastructure, streamlining route planning, effectively managing fleet allocation, and mitigating urban congestion, thereby enhancing the overall service quality within smart city ecosystems <xref ref-type="bibr" rid="B10">Li et al. (2022)</xref>. Historically, traditional time series forecasting methodologies, such as Autoregressive Integrated Moving Average (ARIMA) models, often prove insufficient in capturing the intricate non-linear dynamics, multi-scale temporal dependencies, and multifactorial external influences inherent in contemporary urban transit data. The advent of deep learning, particularly architectures like Recurrent Neural Networks (RNNs) and Transformer-based models, has advanced time series forecasting, demonstrating superior capabilities in modeling complex spatio-temporal patterns <xref ref-type="bibr" rid="B31">Zhang et al. (2023)</xref>; <xref ref-type="bibr" rid="B30">Zhang and Gu (2023)</xref>.</p>
<p>Despite these notable advancements, a formidable challenge persists in developing robust forecasting models that can generalize effectively across multiple, geographically disparate urban environments. Each city uniquely manifests distinct transit network topologies, socio-economic factors, daily commuting patterns, and prevailing external conditions. This variability gives rise to highly heterogeneous, or Non-Independent and Identically Distributed (Non-IID), data distributions among urban centers <xref ref-type="bibr" rid="B26">Xia et al. (2023)</xref>; <xref ref-type="bibr" rid="B7">Feng et al. (2024)</xref>. Consequently, training a singular centralized model on such aggregated heterogeneous data frequently leads to suboptimal performance, as the model struggles to adapt to the specific nuances and unique characteristics of each city. Conversely, deploying separate, independently trained models for every city is computationally resource-intensive, economically impractical, and foregoes the substantial benefits of collaborative learning from a broader, diverse data spectrum.</p>
<p>Furthermore, a paramount concern in any contemporary urban data analytics endeavor is the safeguarding of data privacy. Transit agencies, as custodians of sensitive mobility data, are bound by stringent regulatory frameworks and ethical imperatives that prohibit the direct sharing or centralization of raw, proprietary data for machine learning model training. This pervasive privacy constraint severely curtails the feasibility of conventional centralized machine learning approaches for cross-city analytical applications, creating a critical bottleneck for data-driven urban development.</p>
<p>To address these multifaceted and critical challenges, this paper proposes a <italic>Federated Learning (FL)</italic> framework for cross-city transit flow forecasting. Federated Learning inherently enables multiple urban entities to collaboratively train a powerful global prediction model without ever sharing their raw, sensitive transit data, thereby preserving local data privacy <xref ref-type="bibr" rid="B7">Feng et al. (2024)</xref>; <xref ref-type="bibr" rid="B26">Xia et al. (2023)</xref>; <xref ref-type="bibr" rid="B12">Liu et al. (2023)</xref>. While FL offers substantial intrinsic privacy benefits, potential vulnerabilities to inference or reconstruction attacks from shared model updates necessitate even more robust safeguards. To this end, we integrate <italic>Differential Privacy (DP)</italic> mechanisms into our federated framework. By strategically injecting calibrated noise into the model updates, DP provides strong, mathematically provable guarantees for safeguarding sensitive information during the collaborative training process, ensuring robust privacy protection <xref ref-type="bibr" rid="B8">Gupta and Torra (2023)</xref>; <xref ref-type="bibr" rid="B19">Qi et al. (2021)</xref>; <xref ref-type="bibr" rid="B24">Tang et al. (2022)</xref>.</p>
<p>A pivotal innovation of our proposed approach lies in the development of a sophisticated deep learning architecture specifically engineered to handle the pronounced heterogeneity and multi-scale temporal dependencies characteristic of urban transit data within the federated paradigm. Unlike standard federated averaging (FedAvg), which naively aggregates diverse local models into a single global centroid&#x2014;often leading to &#x201c;performance averaging&#x201d; where the model sub-optimally fits all clients&#x2014;our model leverages a <italic>Mixture-of-Experts (MoE)</italic> framework. This structural design enables the global model to preserve specialized knowledge by dynamically routing input samples to the most relevant &#x201c;expert&#x201d; sub-networks. Consequently, the model fosters an adaptive response to each city&#x2019;s distinct data distribution, effectively mitigating the performance degradation caused by the &#x201c;one-size-fits-all&#x201d; limitation of traditional FL. Complementing this, we incorporate principles of <italic>Seasonal-Trend Decomposition</italic> directly within the deep learning architecture. By explicitly disentangling the time series into trend and seasonal components, the model achieves superior interpretability and robustness, effectively isolating recurring mobility signatures from transient fluctuations.</p>
<p>This synergistic combination of MoE-based heterogeneity modelling and decomposition-based temporal reasoning distinguishes our work from recent contributions. While <xref ref-type="bibr" rid="B18">Pang and Li (2024)</xref> focuses on decomposition in centralized regimes, and FedGODE <xref ref-type="bibr" rid="B3">Al-Huthaifi et al. (2024a)</xref> utilizes graph ODEs, neither addresses the dual challenge of privacy-preserving adaptation and multi-component temporal extraction in a unified framework. Furthermore, we advance beyond recent heterogeneity-aware methods like Fed-TREND <xref ref-type="bibr" rid="B27">Xu et al. (2024)</xref> and Multi-Head FL <xref ref-type="bibr" rid="B13">Syu et al. (2024)</xref> by strictly integrating Differential Privacy, quantifying the privacy-utility trade-off explicitly. We quantify the performance gains attributable to these architectural innovations&#x2014;distinguishing them from the benefits of the federated setup itself&#x2014;through comprehensive ablation studies detailed in <xref ref-type="sec" rid="s4">Section 4</xref>.</p>
<p>To facilitate a rigorous and reproducible evaluation, and to circumvent the pervasive scarcity of readily available, real-world cross-city transit datasets (a direct consequence of privacy concerns and data sharing restrictions), we further introduce a statistically validated <italic>synthetic data generation framework</italic>. This framework is meticulously designed to accurately simulate realistic transit inflow and outflow patterns across multiple cities, capturing their distinct characteristics and complex temporal dynamics. This synthetic dataset serves as a robust and controlled benchmark environment, enabling comprehensive testing of our proposed architecture&#x2019;s generalization capabilities and privacy-preserving properties across diverse urban contexts.</p>
<p>The main contributions of this paper are summarized as follows:<list list-type="bullet">
<list-item>
<p>We propose a Federated Learning framework for cross-city transit flow forecasting, explicitly designed to overcome pervasive data privacy concerns and effectively leverage distributed urban mobility data.</p>
</list-item>
<list-item>
<p>We integrate Differential Privacy into the federated learning process, providing strong, quantifiable privacy guarantees for sensitive transit data throughout collaborative model training.</p>
</list-item>
<list-item>
<p>We develop a sophisticated and adaptable deep learning architecture that uniquely combines a Mixture-of-Experts model with Seasonal-Trend Decomposition principles, specifically tailored to capture and model the heterogeneous and multi-scale temporal dynamics inherent in cross-city transit data.</p>
</list-item>
<list-item>
<p>We introduce a statistically validated synthetic data generation framework that faithfully replicates complex real-world urban transit flow patterns, establishing a robust and ethical environment for benchmarking and evaluating federated learning solutions in privacy-sensitive scenarios.</p>
</list-item>
<list-item>
<p>Through extensive experimentation on a meticulously generated multi-city synthetic dataset, we demonstrate the superior performance, enhanced robustness, and generalization capabilities of our proposed approach compared to relevant baseline models, highlighting its practical applicability for advanced smart urban planning and intelligent transportation systems.</p>
</list-item>
</list>
</p>
<p>The remainder of this paper is organized as follows: <xref ref-type="sec" rid="s2">Section 2</xref> provides a comprehensive review of relevant literature in time series forecasting, federated learning, and advanced deep learning architectures. <xref ref-type="sec" rid="s3">Section 3</xref> details our proposed methodology, including the synthetic data generation framework, the federated learning setup, and the deep learning model architecture. <xref ref-type="sec" rid="s4">Section 4</xref> presents the experimental setup, performance metrics, and detailed results of our cross-city forecasting experiments. Finally, <xref ref-type="sec" rid="s5">Section 5</xref> discusses the implications of our findings, explores broader applications, and outlines directions for future work.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Literature review</title>
<p>The literature on traffic flow prediction has evolved significantly, driven by the increasing complexity of urban mobility and the availability of advanced computational techniques. This section reviews key developments in traffic forecasting, the emergence of federated learning as a privacy-preserving paradigm, and various deep learning architectures employed in this domain, ultimately highlighting the existing research gaps addressed by this paper.</p>
<sec id="s2-1">
<label>2.1</label>
<title>Advances in traffic flow prediction</title>
<p>Traffic flow prediction is a cornerstone of Intelligent Transportation Systems (ITS), enabling proactive traffic management, congestion mitigation, and optimized resource allocation <xref ref-type="bibr" rid="B10">Li et al. (2022)</xref>. Early approaches primarily relied on statistical models such as Autoregressive Integrated Moving Average (ARIMA) and Kalman filters, which, while effective for stationary time series, often struggle to capture the complex non-linear and dynamic nature of real-world traffic data. The advent of machine learning marked a significant shift, with methods like Support Vector Regression (SVR) and Artificial Neural Networks (ANNs) demonstrating improved performance by learning non-linear relationships.</p>
<p>More recently, deep learning has revolutionized traffic forecasting due to its capacity to automatically extract intricate spatio-temporal features from large-scale data. Recurrent Neural Networks (RNNs) and their variants, such as Long Short-Term Memory (LSTM) and Gated Recurrent Units (GRUs), have been widely adopted for their ability to model sequential dependencies in time series. Beyond sequential models, convolutional neural networks (CNNs) have been employed to capture spatial correlations in traffic networks. The advent of Transformer-based architectures, initially prominent in natural language processing, has also shown promising results in time series forecasting due to their attention mechanisms, which effectively capture long-range dependencies <xref ref-type="bibr" rid="B30">Zhang and Gu (2023)</xref>. For instance, <xref ref-type="bibr" rid="B30">Zhang and Gu (2023)</xref> proposed a Transformer with a multi-spatial-temporal encoder-decoder for traffic flow prediction, underscoring the shift towards more sophisticated deep learning models <xref ref-type="bibr" rid="B30">Zhang and Gu (2023)</xref>. Classical seasonal-trend decomposition approaches such as STL remain a foundational tool for interpreting periodic time series and motivate explicit decomposition in modern models <xref ref-type="bibr" rid="B5">Cleveland et al. (1990)</xref>. Despite these advancements, a significant challenge remains in developing models that can generalize effectively across diverse urban environments, each characterized by unique traffic patterns and infrastructure.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Federated learning in traffic forecasting</title>
<p>The increasing awareness of data privacy and sovereignty, coupled with the distributed nature of urban data generation, has spurred significant interest in Federated Learning (FL). FL enables multiple clients (e.g., cities, traffic departments) to collaboratively train a shared global model without directly exchanging their raw, sensitive local data, thereby preserving privacy <xref ref-type="bibr" rid="B32">Zhang et al. (2024)</xref>. This paradigm is particularly well-suited for Intelligent Transportation Systems (ITS) where centralized data collection is often impractical due to regulatory constraints, ownership issues, and the sheer volume of data generated at the edge.</p>
<p>Numerous studies have explored the application of FL for traffic flow prediction. <xref ref-type="bibr" rid="B26">Xia et al. (2023)</xref> introduced an FL framework integrated with Graph Convolutional Networks (GCNs) to leverage spatial relationships in traffic networks while maintaining data locality <xref ref-type="bibr" rid="B26">Xia et al. (2023)</xref>. Similarly, <xref ref-type="bibr" rid="B20">Qi et al. (2023)</xref> proposed FedAGCN, an FL framework utilizing asynchronous GCNs to address non-IID data distributions and improve prediction accuracy in a distributed setting <xref ref-type="bibr" rid="B20">Qi et al. (2023)</xref>. <xref ref-type="bibr" rid="B11">Lin et al. (2024)</xref> proposed ST-TPFL, a topology-protected federated learning framework for spatio-temporal traffic flow prediction that further addresses privacy and structural data heterogeneity in distributed urban environments. Al-Huthaifi et al. have made multiple contributions, including FedGODE, which combines FL with Graph Ordinary Differential Equation networks for secure traffic flow prediction <xref ref-type="bibr" rid="B4">Al-Huthaifi et al. (2024b)</xref>, and FedAGAT, employing FL with adaptive graph attention networks for real-time traffic flow prediction <xref ref-type="bibr" rid="B3">Al-Huthaifi et al. (2024a)</xref>. These works underscore the trend of combining FL with graph-based neural networks to model the complex spatio-temporal dependencies inherent in traffic networks.</p>
<p>Beyond graph-based approaches, FedAvg McMahan et al. (2017) and FedProx <xref ref-type="bibr" rid="B9">Li et al. (2020)</xref> remain canonical baselines for aggregation and heterogeneity-aware optimization in federated forecasting.</p>
<p>Further extending FL capabilities, <xref ref-type="bibr" rid="B7">Feng et al. (2024)</xref> investigated federated meta-learning on graphs for traffic flow prediction, aiming for faster adaptation to new or heterogeneous environments <xref ref-type="bibr" rid="B7">Feng et al. (2024)</xref>. Multilevel FL approaches have also emerged, such as that proposed by <xref ref-type="bibr" rid="B12">Liu et al. (2023)</xref>, for intelligent traffic flow forecasting in transportation network management, addressing hierarchical data structures <xref ref-type="bibr" rid="B12">Liu et al. (2023)</xref>. <xref ref-type="bibr" rid="B16">Nidhi and Grover (2024)</xref> provided an analysis of FL for vehicular traffic flow prediction, evaluating various learning algorithms and aggregation approaches, highlighting the diversity in FL strategies <xref ref-type="bibr" rid="B16">Nidhi and Grover (2024)</xref>. <xref ref-type="bibr" rid="B29">Ye et al. (2024)</xref> explored federated generative artificial intelligence for traffic flow prediction under vehicular computing power networks, indicating the integration of advanced AI paradigms within FL for traffic <xref ref-type="bibr" rid="B29">Ye et al. (2024)</xref>. <xref ref-type="bibr" rid="B28">Yaqub et al. (2024)</xref> proposed a federated learning approach combined with graph neural networks and asynchronous computations to enhance scalability and prediction accuracy under heterogeneous distributed settings. Despite these advancements, handling extreme heterogeneity (Non-IID data) efficiently and robustly remains a significant challenge within FL paradigms, particularly when diverse urban characteristics lead to vastly different local data distributions.</p>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Privacy-preserving mechanisms in federated traffic forecasting</title>
<p>While FL inherently offers privacy benefits by keeping raw data local, the exchange of model updates (e.g., gradients) can still be susceptible to various privacy attacks, including inference and reconstruction attacks. To bolster privacy guarantees, several mechanisms have been integrated with FL in the context of traffic forecasting.</p>
<p>Differential Privacy (DP) is a widely adopted technique that adds calibrated noise to model updates, providing strong, mathematically provable privacy guarantees against individual data point inference. <xref ref-type="bibr" rid="B8">Gupta and Torra (2023)</xref> demonstrated the application of differentially private FL using Transformer architectures for traffic flow prediction, showcasing the feasibility of high accuracy under strong privacy constraints <xref ref-type="bibr" rid="B8">Gupta and Torra (2023)</xref>. <xref ref-type="bibr" rid="B24">Tang et al. (2022)</xref> proposed a differentially private decentralized traffic flow prediction approach based on FL, further emphasizing the importance of DP in distributed traffic systems <xref ref-type="bibr" rid="B24">Tang et al. (2022)</xref>. <xref ref-type="bibr" rid="B2">Akallouch et al. (2022)</xref> also explored prediction and privacy schemes for traffic flow estimation on highway networks, highlighting the necessity of integrated privacy measures <xref ref-type="bibr" rid="B2">Akallouch et al. (2022)</xref>.</p>
<p>Beyond DP, other privacy-enhancing technologies (PETs) have been explored. Blockchain technology has been leveraged to provide secure and transparent data sharing frameworks in FL, ensuring data integrity and traceability while maintaining privacy. <xref ref-type="bibr" rid="B19">Qi et al. (2021)</xref> proposed a privacy-preserving blockchain-based FL framework for traffic flow prediction, showcasing its potential for secure data management <xref ref-type="bibr" rid="B19">Qi et al. (2021)</xref>. Split Learning, an alternative privacy-preserving paradigm where different layers of a neural network are trained on different devices, has also been investigated. <xref ref-type="bibr" rid="B25">Tran et al. (2023)</xref> presented a privacy-preserving traffic flow prediction approach using split learning, offering another avenue for secure collaborative AI <xref ref-type="bibr" rid="B25">Tran et al. (2023)</xref>. <xref ref-type="bibr" rid="B15">Meese et al. (2022)</xref> explored a blockchained federated learning approach for real-time traffic flow prediction, further solidifying the use of distributed ledger technologies for secure and real-time operations <xref ref-type="bibr" rid="B15">Meese et al. (2022)</xref>. While these methods enhance privacy, optimizing the trade-off between privacy, utility, and computational overhead remains an active area of research.</p>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>Research gaps and contributions</title>
<p>Despite the significant advancements in deep learning for traffic flow prediction and the growing adoption of federated learning for privacy preservation, several critical research gaps persist, particularly concerning cross-city applications with extreme data heterogeneity:<list list-type="bullet">
<list-item>
<p>Adaptive Learning for Extreme Non-IID: While various FL approaches address non-IID data, few explicitly integrate highly adaptive deep learning architectures like Mixture-of-Experts (MoE) within a privacy-preserving FL framework specifically for multi-city transit forecasting. Current FL solutions often aim to improve generalization through robust aggregation or personalization, but rarely employ an architecture designed for dynamic expert specialization across heterogeneous clients.</p>
</list-item>
<list-item>
<p>Explicit Temporal Decomposition in FL: Many deep learning models capture temporal patterns implicitly. However, the explicit decomposition of time series into trend and seasonal components can offer more robust and interpretable learning, especially for highly periodic data like transit flows. The integration of such decomposition principles directly into a federated deep learning architecture for heterogeneous data, combined with adaptive learning, remains underexplored. While <xref ref-type="bibr" rid="B18">Pang and Li (2024)</xref> recently proposed a decomposition modeling framework for seasonal time series, its application within a federated, adaptive setting for urban transit has not been thoroughly investigated <xref ref-type="bibr" rid="B18">Pang and Li (2024)</xref>.</p>
</list-item>
<list-item>
<p>Rigorous Evaluation with Controlled Heterogeneity: The development and validation of comprehensive synthetic multi-city transit datasets that accurately capture diverse urban characteristics and privacy concerns are crucial for benchmarking novel FL solutions. Existing studies often rely on limited real-world datasets or simpler synthetic scenarios that may not fully represent the complexity of cross-city heterogeneity under privacy constraints.</p>
</list-item>
</list>
</p>
<p>
<xref ref-type="table" rid="T1">Table 1</xref> summarizes the key distinctions between X-FedFormer and the most closely related methods in terms of training paradigm, heterogeneity handling, decomposition, and differential privacy. This paper directly addresses these gaps by proposing a novel Federated Adaptive Decomposed Learning framework that uniquely combines Differential Privacy with an MoE-based architecture and seasonal decomposition principles, evaluated on a statistically validated synthetic dataset designed for extreme cross-city heterogeneity. This integrated approach aims to significantly enhance the accuracy, privacy, and generalizability of cross-city transit flow forecasting.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Comparison with closely related methods.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Method</th>
<th align="center">Training</th>
<th align="left">Heterogeneity handling</th>
<th align="center">Decomposition</th>
<th align="center">DP</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<xref ref-type="bibr" rid="B18">Pang and Li (2024)</xref>
</td>
<td align="center">Centralized</td>
<td align="left">None</td>
<td align="center">Yes</td>
<td align="center">No</td>
</tr>
<tr>
<td align="left">FedGODE <xref ref-type="bibr" rid="B4">Al-Huthaifi et al. (2024b)</xref>
</td>
<td align="center">Federated</td>
<td align="left">Graph ODE</td>
<td align="center">No</td>
<td align="center">No</td>
</tr>
<tr>
<td align="left">Fed-TREND <xref ref-type="bibr" rid="B27">Xu et al. (2024)</xref>
</td>
<td align="center">Federated</td>
<td align="left">Robust aggregation</td>
<td align="center">No</td>
<td align="center">No</td>
</tr>
<tr>
<td align="left">X-FedFormer (Ours)</td>
<td align="center">Federated</td>
<td align="left">MoE routing</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<label>3</label>
<title>Methodology</title>
<sec id="s3-1">
<label>3.1</label>
<title>Synthetic cross-city dataset and pre-processing</title>
<p>To overcome the paucity of publicly available multi-city transit records and to rigorously evaluate cross-city generalisation, a synthetic dataset was generated by extrapolating a sample of Astana bus-transportation history. The original sample consisted of hourly records; <xref ref-type="table" rid="T2">Table 2</xref> provides a detailed description of the dataset attributes.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Description of fields in the transit dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Field name</th>
<th align="left">Data type</th>
<th align="left">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<monospace>datetime</monospace>
</td>
<td align="left">Timestamp</td>
<td align="left">The specific date and hour of the observation (YYYY-MM-DD HH:MM)</td>
</tr>
<tr>
<td align="left">
<monospace>route_id</monospace>
</td>
<td align="left">String</td>
<td align="left">Unique identifier for the transit route</td>
</tr>
<tr>
<td align="left">
<monospace>inflow_count</monospace>
</td>
<td align="left">Integer</td>
<td align="left">Number of passengers boarding the vehicle/station</td>
</tr>
<tr>
<td align="left">
<monospace>outflow_count</monospace>
</td>
<td align="left">Integer</td>
<td align="left">Number of passengers alighting</td>
</tr>
<tr>
<td align="left">
<monospace>temperature</monospace>
</td>
<td align="left">Float</td>
<td align="left">Ambient temperature in degrees Celsius (<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>C)</td>
</tr>
<tr>
<td align="left">
<monospace>precip_flag</monospace>
</td>
<td align="left">Binary</td>
<td align="left">Indicator of precipitation (0: None, 1: Rain/Snow)</td>
</tr>
<tr>
<td align="left">
<monospace>route_length_km</monospace>
</td>
<td align="left">Float</td>
<td align="left">Total length of the route in kilometers</td>
</tr>
<tr>
<td align="left">
<monospace>num_stops</monospace>
</td>
<td align="left">Integer</td>
<td align="left">Number of stops along the route</td>
</tr>
<tr>
<td align="left">
<monospace>route_type</monospace>
</td>
<td align="left">Categorical</td>
<td align="left">Classification of the route (e.g., urban_core, suburban_feeder)</td>
</tr>
<tr>
<td align="left"><monospace>zone</monospace></td>
<td align="left">Categorical</td>
<td align="left">Urban zone identifier (e.g., zone_1, zone_2)</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s3-1-1">
<label>3.1.1</label>
<title>Pre-processing and forecasting setup</title>
<p>Categorical attributes (<monospace>route_type</monospace>, <monospace>zone</monospace>) are one-hot encoded and concatenated with numeric covariates; <monospace>route_id</monospace> is used only for grouping and does not enter the model input. Continuous variables (<monospace>inflow_count</monospace>, <monospace>outflow_count</monospace>, <monospace>temperature</monospace>, <monospace>route_length_km</monospace>, <monospace>num_stops</monospace>) are standardised per city using train-split statistics to prevent leakage. We use a sliding window with input length <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>24</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> hours and forecast horizon <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> hours. The target is the future inflow sequence; outflow is treated as an auxiliary input feature.</p>
</sec>
<sec id="s3-1-2">
<label>3.1.2</label>
<title>Rationale for synthetic generation</title>
<p>The sample history covered fewer than 30 days and only 3 routes, insufficient to stress-test a federated forecasting model. Therefore, a controlled synthetic generator was implemented to produce tens of thousands of hourly observations per city, while preserving key real-world patterns (diurnal peaks, weekday/weekend variation, weather effects, holidays, random events).</p>
</sec>
<sec id="s3-1-3">
<label>3.1.3</label>
<title>Algorithm overview</title>
<p>Let <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> be the number of cities, <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> the number of routes per city, <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> the number of stops, and <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> the route length (km). For each city <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and each of its <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> routes, the following steps are applied:<list list-type="order">
<list-item>
<p>Time index.</p>
</list-item>
</list>
<disp-formula id="equ1">
<mml:math id="m10">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0,1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>24</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mtext>with</mml:mtext>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>number&#x2009;of&#x2009;days&#x2009;for&#x2009;city&#x2009;</mml:mtext>
<mml:mi>c</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>The corresponding datetime is <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="sans-serif">s</mml:mi>
<mml:mi mathvariant="sans-serif">t</mml:mi>
<mml:mi mathvariant="sans-serif">a</mml:mi>
<mml:mi mathvariant="sans-serif">r</mml:mi>
<mml:mi mathvariant="sans-serif">t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="normal">h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>doy</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the day-of-year index.<list list-type="simple">
<list-item>
<p>2. Base daily profile. Two Gaussian peaks at typical rush hours are defined by <xref ref-type="disp-formula" rid="e1">Equation 1</xref>:</p>
</list-item>
</list>
<disp-formula id="e1">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>base</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>50</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mspace width="0.1em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mi>mod</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>80</mml:mn>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mspace width="0.1em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>18</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mi>mod</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>23</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the hour-of-day and <inline-formula id="inf13">
<mml:math id="m15">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the route index.<list list-type="simple">
<list-item>
<p>3. Popularity scaling is formulated in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>.</p>
</list-item>
</list>
<disp-formula id="e2">
<mml:math id="m16">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>15</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>15</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>1.2</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;urban_core</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>0.8</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;suburban_feeder</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>base</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<list list-type="simple">
<list-item>
<p>4. Day-of-week adjustment is defined in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>.</p>
</list-item>
</list>
<disp-formula id="e3">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>dow</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>1.0</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>Mon&#x2013;Fri</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>0.8</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>Sat</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>0.7</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>Sun</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mtext>dow</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<list list-type="simple">
<list-item>
<p>5. Holiday effect. Define the Kazakh holiday set <inline-formula id="inf14">
<mml:math id="m18">
<mml:mrow>
<mml:mi mathvariant="script">H</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,21</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,22</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,23</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>12,16</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The holiday multiplier is then applied as defined in <xref ref-type="disp-formula" rid="e4">Equation 4</xref>:</p>
</list-item>
</list>
<disp-formula id="e4">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>0.5</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="sans-serif">m</mml:mi>
<mml:mi mathvariant="sans-serif">o</mml:mi>
<mml:mi mathvariant="sans-serif">n</mml:mi>
<mml:mi mathvariant="sans-serif">t</mml:mi>
<mml:mi mathvariant="sans-serif">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="sans-serif">d</mml:mi>
<mml:mi mathvariant="sans-serif">a</mml:mi>
<mml:mi mathvariant="sans-serif">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">H</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>1.0</mml:mn>
<mml:mo>,</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>otherwise</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
<list list-type="simple">
<list-item>
<p>6. Random events. Approximately one event per 10 days introduces multiplicative factors <inline-formula id="inf15">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0.4</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2.5</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> over random intervals of 6&#x2013;24&#x2009;h, yielding <inline-formula id="inf16">
<mml:math id="m21">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p>7. Weather simulation.</p>
</list-item>
</list>
<disp-formula id="equ2">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>sin</mml:mi>
<mml:mspace width="0.1em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>doy</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>80</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>365</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>10</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mi>mod</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ3">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.05</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>0.1</mml:mn>
<mml:mo>&#x2062;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>sin</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mspace width="0.1em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mtext>doy</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>365</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="normal">B</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">u</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>Temperature and precipitation modulate flow as given in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>. Temperature <inline-formula id="inf17">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and precipitation flag <inline-formula id="inf18">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> modulate flow:<disp-formula id="e5">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.2</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="double-struck">I</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.1</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="double-struck">I</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>30</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.15</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="double-struck">I</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<list list-type="simple">
<list-item>
<p>9. Final counts are computed as given in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>.</p>
</list-item>
</list>
<disp-formula id="e6">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi mathvariant="italic">fl</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>profile</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bd;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x230b;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>Outflow is derived by applying a one-step lag and a scaling ratio as given in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>.<disp-formula id="e7">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi mathvariant="italic">fl</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">h</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi mathvariant="italic">fl</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>&#x3ba;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mi>&#x3ba;</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="normal">U</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0.85</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.95</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>Each generated record <inline-formula id="inf20">
<mml:math id="m30">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="sans-serif">d</mml:mi>
<mml:mi mathvariant="sans-serif">a</mml:mi>
<mml:mi mathvariant="sans-serif">t</mml:mi>
<mml:mi mathvariant="sans-serif">e</mml:mi>
<mml:mi mathvariant="sans-serif">t</mml:mi>
<mml:mi mathvariant="sans-serif">i</mml:mi>
<mml:mi mathvariant="sans-serif">m</mml:mi>
<mml:mi mathvariant="sans-serif">e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mtext>route_id</mml:mtext>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi mathvariant="italic">fl</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi mathvariant="italic">fl</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x2113;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>route_type</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is stored in CSV format. This procedure produces per-city DataFrames of size <inline-formula id="inf21">
<mml:math id="m31">
<mml:mrow>
<mml:mn>24</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> rows, capturing realistic multi-modal passenger-flow dynamics suitable for federated forecasting studies.</p>
</sec>
<sec id="s3-1-4">
<label>3.1.4</label>
<title>Statistical validation of the synthetic dataset</title>
<p>To substantiate the realism of the synthetic data, we compared the real Astana sample (30 days, 3 observed routes) against a 30-day synthetic counterpart. We compute Kolmogorov&#x2013;Smirnov (KS) and 1-Wasserstein distances using SciPy&#x2019;s <monospace>ks_2samp</monospace> and <monospace>wasserstein_distance</monospace> on each route independently, then report mean<inline-formula id="inf22">
<mml:math id="m32">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>std across routes. We report effect sizes (KS <inline-formula id="inf23">
<mml:math id="m33">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) rather than relying solely on <inline-formula id="inf24">
<mml:math id="m34">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-values; all slices yield <inline-formula id="inf25">
<mml:math id="m35">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. We additionally verify peak/off-peak slices (7&#x2013;10 and 17&#x2013;20 vs. the rest), which follow the same ranges as <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Distributional similarity between real and synthetic data (mean <inline-formula id="inf26">
<mml:math id="m36">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> std over routes).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Variable</th>
<th align="left">Slice</th>
<th align="center">KS <inline-formula id="inf27">
<mml:math id="m37">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Wasserstein <inline-formula id="inf28">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf29">
<mml:math id="m39">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> per route</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Inflow</td>
<td align="left">All hours</td>
<td align="center">
<inline-formula id="inf30">
<mml:math id="m40">
<mml:mrow>
<mml:mn>0.048</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.010</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf31">
<mml:math id="m41">
<mml:mrow>
<mml:mn>1.60</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.22</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">720</td>
</tr>
<tr>
<td align="left">Inflow</td>
<td align="left">Weekday</td>
<td align="center">
<inline-formula id="inf32">
<mml:math id="m42">
<mml:mrow>
<mml:mn>0.050</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.012</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf33">
<mml:math id="m43">
<mml:mrow>
<mml:mn>1.67</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.24</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">504</td>
</tr>
<tr>
<td align="left">Inflow</td>
<td align="left">Weekend</td>
<td align="center">
<inline-formula id="inf34">
<mml:math id="m44">
<mml:mrow>
<mml:mn>0.056</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.015</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf35">
<mml:math id="m45">
<mml:mrow>
<mml:mn>1.74</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.27</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">216</td>
</tr>
<tr>
<td align="left">Outflow</td>
<td align="left">All hours</td>
<td align="center">
<inline-formula id="inf36">
<mml:math id="m46">
<mml:mrow>
<mml:mn>0.052</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.011</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf37">
<mml:math id="m47">
<mml:mrow>
<mml:mn>1.69</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.23</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">720</td>
</tr>
<tr>
<td align="left">Outflow</td>
<td align="left">Weekday</td>
<td align="center">
<inline-formula id="inf38">
<mml:math id="m48">
<mml:mrow>
<mml:mn>0.053</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.013</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf39">
<mml:math id="m49">
<mml:mrow>
<mml:mn>1.76</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.25</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">504</td>
</tr>
<tr>
<td align="left">Outflow</td>
<td align="left">Weekend</td>
<td align="center">
<inline-formula id="inf40">
<mml:math id="m50">
<mml:mrow>
<mml:mn>0.059</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.016</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf41">
<mml:math id="m51">
<mml:mrow>
<mml:mn>1.82</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.28</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">216</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Moment matching (mean/variance/skewness/kurtosis) deviates by less than <inline-formula id="inf42">
<mml:math id="m52">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the ACF/PACF curves align closely (<xref ref-type="fig" rid="F1">Figure 1</xref>), with mean absolute deviation across lags 1&#x2013;24 below 0.05. Visual consistency is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>ACF and PACF comparison between real and synthetic inflow series (lags 1&#x2013;24).</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g001.tif">
<alt-text content-type="machine-generated">Side-by-side line graphs compare the autocorrelation function (ACF) and partial autocorrelation function (PACF) for real data (solid blue) and synthetic data (dashed orange) over lags zero to twenty-four, showing close alignment between the two series in both plots.</alt-text>
</graphic>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Validation of synthetic data. Top: comparison of real vs. synthetic inflow counts for a 14-day period, illustrating the replication of diurnal and weekly patterns. Bottom: kernel density estimation (KDE) showing the overlap of probability distributions between real and synthetic datasets.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g002.tif">
<alt-text content-type="machine-generated">Two data visualizations compare real and synthetic transit passenger inflow data: the top line graph shows hourly inflow patterns over two days, and the bottom density plot illustrates close overlap between real and synthetic passenger inflow distributions, with minimal statistical difference.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-1-5">
<label>3.1.5</label>
<title>Model architecture and layer-wise details</title>
<p>X-FedFormer is structured as a sequence of specialized modules that transform raw multi-modal inputs into accurate multi-step forecasts. <xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the data flow through these layers, and <xref ref-type="table" rid="T4">Table 4</xref> summarizes their dimensions and functions.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Block diagram of X-FedFormer layers.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g003.tif">
<alt-text content-type="machine-generated">Flowchart showing a prediction model pipeline with five stages: inputs (dynamic such as inflow, weather, calendar, and static route features), embedding and static encoder, fusion and encoding using cross-attention and transformer layers, optional mixture-of-experts with gating, and decoder producing forecast output for multiple time steps.</alt-text>
</graphic>
</fig>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Layer-wise summary of X-FedFormer.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Layer</th>
<th align="center">Input shape</th>
<th align="center">Output shape</th>
<th align="left">Function</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Input Projection</td>
<td align="center">
<inline-formula id="inf43">
<mml:math id="m53">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf44">
<mml:math id="m54">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Projects raw features into <inline-formula id="inf45">
<mml:math id="m55">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dim latent space</td>
</tr>
<tr>
<td align="left">Positional Encoding</td>
<td align="center">
<inline-formula id="inf46">
<mml:math id="m56">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf47">
<mml:math id="m57">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Injects temporal order via learnable embeddings</td>
</tr>
<tr>
<td align="left">Seasonal&#x2013;Trend Decomp</td>
<td align="center">
<inline-formula id="inf48">
<mml:math id="m58">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf49">
<mml:math id="m59">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Separates trend and seasonal components</td>
</tr>
<tr>
<td align="left">Transformer Encoder <inline-formula id="inf50">
<mml:math id="m60">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf51">
<mml:math id="m61">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf52">
<mml:math id="m62">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Multi-head attention &#x2b; FFN for temporal context</td>
</tr>
<tr>
<td align="left">Mixture-of-Experts Block</td>
<td align="center">
<inline-formula id="inf53">
<mml:math id="m63">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf54">
<mml:math id="m64">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Dynamically routes tokens through expert MLPs</td>
</tr>
<tr>
<td align="left">Decoder</td>
<td align="center">
<inline-formula id="inf55">
<mml:math id="m65">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf56">
<mml:math id="m66">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Maps final hidden state to <inline-formula id="inf57">
<mml:math id="m67">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-step forecasts</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-1-6">
<label>3.1.6</label>
<title>Input projection</title>
<p>The concatenated input vector at time t is defined in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>.<disp-formula id="e8">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">fl</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">w</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">w</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">h</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>This vector is linearly mapped into a d-dimensional latent space as in <xref ref-type="disp-formula" rid="e9">Equation 9</xref>.<disp-formula id="e9">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>proj</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>proj</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf82">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>proj</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. This reduces input heterogeneity and aligns all modalities into a common embedding for downstream processing. Calendar features include hour-of-day and day-of-week encoded via sine/cosine pairs, a holiday indicator, and a day-of-year sine/cosine pair.</p>
</sec>
<sec id="s3-1-7">
<label>3.1.7</label>
<title>Positional encoding</title>
<p>To enable the model to distinguish positions within the sequence of length <inline-formula id="inf83">
<mml:math id="m95">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, a learnable positional embedding is added, as given in <xref ref-type="disp-formula" rid="e10">Equation 10</xref>.<disp-formula id="e10">
<mml:math id="m97">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf85">
<mml:math id="m98">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the <inline-formula id="inf86">
<mml:math id="m99">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th row of <inline-formula id="inf87">
<mml:math id="m100">
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This mechanism preserves temporal ordering without relying on recurrence.</p>
</sec>
<sec id="s3-1-8">
<label>3.1.8</label>
<title>Seasonal&#x2013;trend decomposition</title>
<p>A multi-scale gated pooling decomposition module separates long-term trend from the high-frequency seasonal component, inspired by STL-style seasonal-trend decomposition <xref ref-type="bibr" rid="B5">Cleveland et al. (1990)</xref>. Given the sequence of projected embeddings, the decomposition is computed via <xref ref-type="disp-formula" rid="e11">Equations 11</xref>&#x2013;<xref ref-type="disp-formula" rid="e13">13</xref> <inline-formula id="inf88">
<mml:math id="m101">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e11">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
<disp-formula id="e12">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
<disp-formula id="e13">
<mml:math id="m104">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf89">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes average pooling with window size <inline-formula id="inf90">
<mml:math id="m106">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf91">
<mml:math id="m107">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pooling scales capture multiple seasonal periods, and the gating network <inline-formula id="inf92">
<mml:math id="m108">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> learns dynamic combination weights.</p>
</sec>
<sec id="s3-1-9">
<label>3.1.9</label>
<title>Transformer encoder layers</title>
<p>Each of the two stacked encoder layers applies:<list list-type="order">
<list-item>
<p>Multi-Head Self-Attention is defined in <xref ref-type="disp-formula" rid="e14">Equation 14</xref>.</p>
</list-item>
</list>
<disp-formula id="e14">
<mml:math id="m109">
<mml:mrow>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mi mathvariant="normal">t</mml:mi>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where <inline-formula id="inf93">
<mml:math id="m110">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are linear projections of the input, <inline-formula id="inf94">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> per head, and 4 heads enable diverse temporal pattern extraction.<list list-type="simple">
<list-item>
<p>2. The Position-Wise Feed-Forward Network is given in <xref ref-type="disp-formula" rid="e15">Equation 15</xref>.</p>
</list-item>
</list>
<disp-formula id="e15">
<mml:math id="m112">
<mml:mrow>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi mathvariant="normal">U</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>which enhances non-linear modelling capacity.<list list-type="simple">
<list-item>
<p>3. Residual Connections and Layer Normalisation are applied after each sublayer to stabilise training.</p>
</list-item>
</list>
</p>
<p>This encoder stack refines both trend and seasonal streams jointly, yielding contextualised representations <inline-formula id="inf95">
<mml:math id="m113">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3-1-10">
<label>3.1.10</label>
<title>Mixture-of-experts block</title>
<p>To increase model capacity without a proportional parameter increase, an MoE block routes each time-step representation through the top-k of the E<sub>MoE</sub> = 4 expert MLPs <xref ref-type="bibr" rid="B21">Shazeer et al. (2017)</xref> as given in <xref ref-type="disp-formula" rid="e16">Equation 16</xref>:<disp-formula id="e16">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>expert</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf98">
<mml:math id="m117">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the encoder output at time <inline-formula id="inf99">
<mml:math id="m118">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf100">
<mml:math id="m119">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>expert</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are individual MLPs, and gating scores <inline-formula id="inf101">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are computed via a softmax over learned logits. Choosing <inline-formula id="inf102">
<mml:math id="m121">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> balances specialisation and computational cost because only the selected experts are evaluated per token. We did not use auxiliary load-balancing or entropy regularizers; routing is learned solely from the forecasting loss.</p>
</sec>
<sec id="s3-1-11">
<label>3.1.11</label>
<title>Decoder</title>
<p>The final hidden state <inline-formula id="inf103">
<mml:math id="m122">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (combining trend and seasonal channels) is used to predict the next T steps as in <xref ref-type="disp-formula" rid="e17">Equation 17</xref>:
<disp-formula id="e17">
<mml:math id="m124">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dec</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dec</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf105">
<mml:math id="m125">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dec</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. This direct linear mapping simplifies multi-step forecasting and reduces inference latency.</p>
</sec>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Optimization and federated integration</title>
<p>This subsection describes the optimisation routines used at the client and server, the federated training protocol, and the integration of differential privacy. All procedures are chosen to balance convergence speed, model generalisation, and privacy guarantees in a non-IID cross-city setting.</p>
<sec id="s3-2-1">
<label>3.2.1</label>
<title>Local optimizer: AdamW</title>
<p>Each client performs local updates using the AdamW optimiser, which decouples weight decay from gradient-based parameter updates, leading to improved generalisation:<disp-formula id="e18">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
<disp-formula id="e19">
<mml:math id="m127">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
<disp-formula id="e20">
<mml:math id="m128">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>&#x3bb;</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>
</p>
<p>where:<list list-type="bullet">
<list-item>
<p>
<inline-formula id="inf106">
<mml:math id="m129">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are the gradients at step <inline-formula id="inf107">
<mml:math id="m130">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf108">
<mml:math id="m131">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0.9</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.999</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are exponential decay rates,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf109">
<mml:math id="m132">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the learning rate,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf110">
<mml:math id="m133">
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> ensures numerical stability,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf111">
<mml:math id="m134">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the weight-decay coefficient (decoupled).</p>
</list-item>
</list>
</p>
<p>AdamW is selected for its ability to adapt per-parameter learning rates via first and second moment estimates given by <xref ref-type="disp-formula" rid="e18">Equations 18</xref>, <xref ref-type="disp-formula" rid="e19">19</xref>, while applying weight decay directly to model weights&#x2014;improving convergence robustness on small batches and heterogeneous data.</p>
</sec>
<sec id="s3-2-2">
<label>3.2.2</label>
<title>Federated objective with FedProx</title>
<p>To mitigate client drift arising from non-IID local data, the FedProx algorithm introduces a proximal term into each client&#x2019;s loss as defined in <xref ref-type="disp-formula" rid="e21">Equation 21</xref>, <xref ref-type="bibr" rid="B9">Li et al. (2020)</xref>:<disp-formula id="e21">
<mml:math id="m135">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>FedProx</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>
</p>
<p>where:<list list-type="bullet">
<list-item>
<p>
<inline-formula id="inf112">
<mml:math id="m136">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the local empirical loss (e.g., MSE) on client <inline-formula id="inf113">
<mml:math id="m137">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf114">
<mml:math id="m138">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the global parameters from round <inline-formula id="inf115">
<mml:math id="m139">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf116">
<mml:math id="m140">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> controls the strength of the proximal penalty.</p>
</list-item>
</list>
</p>
<p>This additional term constrains local updates to remain close to the global model, improving stability and convergence in heterogeneous environments.</p>
<p>Following local training for <inline-formula id="inf117">
<mml:math id="m141">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> epochs with AdamW (<xref ref-type="disp-formula" rid="e20">Equation S20</xref>), each client computes parameter delta <inline-formula id="inf118">
<mml:math id="m142">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and securely transmits it to the server.</p>
</sec>
<sec id="s3-2-3">
<label>3.2.3</label>
<title>Server aggregation: Weighted averaging</title>
<p>Upon collecting updates <inline-formula id="inf119">
<mml:math id="m143">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from a subset <inline-formula id="inf120">
<mml:math id="m144">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of clients, the server updates the global model via weighted FedAvg as in <xref ref-type="disp-formula" rid="e22">Equation 22</xref>, <xref ref-type="bibr" rid="B14">McMahan et al. (2017)</xref>:<disp-formula id="e22">
<mml:math id="m145">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf121">
<mml:math id="m146">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of local samples on client <inline-formula id="inf122">
<mml:math id="m147">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This data-weighted aggregation ensures that clients with larger datasets have proportionally greater influence on the global model.</p>
</sec>
<sec id="s3-2-4">
<label>3.2.4</label>
<title>Differential privacy via per-sample clipping &#x2b; Gaussian noise (Opacus)</title>
<p>To prevent information leakage from model updates, each client applies per-sample gradient clipping and Gaussian noise injection before the optimizer step using Opacus <xref ref-type="bibr" rid="B1">Abadi et al. (2016)</xref>; <xref ref-type="bibr" rid="B17">Opacus Team (2021)</xref>. The per-sample clipping and Gaussian noise injection mechanism is defined in <xref ref-type="disp-formula" rid="e23">Equations 23</xref>, <xref ref-type="disp-formula" rid="e24">24</xref>.<disp-formula id="e23">
<mml:math id="m148">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2113;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mtext>clip&#x2009;</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>
<disp-formula id="e24">
<mml:math id="m149">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>
</p>
<p>where:<list list-type="bullet">
<list-item>
<p>
<inline-formula id="inf123">
<mml:math id="m150">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the current mini-batch of size <inline-formula id="inf124">
<mml:math id="m151">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf125">
<mml:math id="m152">
<mml:mrow>
<mml:mtext>clip&#x2009;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>/</mml:mo>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> bounds per-sample gradient norm,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf126">
<mml:math id="m153">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the noise multiplier,</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf127">
<mml:math id="m154">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the clipping norm,</p>
</list-item>
<list-item>
<p>The Gaussian mechanism ensures <inline-formula id="inf128">
<mml:math id="m155">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>-DP.</p>
</list-item>
</list>
</p>
<p>The noised gradients <inline-formula id="inf129">
<mml:math id="m156">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> are then used by AdamW for the local update. We provide record-level DP within each city, where one hourly route record corresponds to one example (not client-level DP). A privacy accountant tracks the cumulative <inline-formula id="inf130">
<mml:math id="m157">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> across rounds. We use an RDP accountant as implemented in Opacus to calibrate <inline-formula id="inf131">
<mml:math id="m158">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for target <inline-formula id="inf132">
<mml:math id="m159">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, with <inline-formula id="inf133">
<mml:math id="m160">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and fixed sampling rate across rounds <xref ref-type="bibr" rid="B17">Opacus Team (2021)</xref>. The accountant inputs are the sampling rate <inline-formula id="inf134">
<mml:math id="m161">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>train</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, steps per round <inline-formula id="inf135">
<mml:math id="m162">
<mml:mrow>
<mml:mo>&#x2308;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>train</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mo>&#x2309;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> (with one local epoch), total steps <inline-formula id="inf136">
<mml:math id="m163">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mrow>
<mml:mo>&#x2308;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>train</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mo>&#x2309;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, clipping norm <inline-formula id="inf137">
<mml:math id="m164">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and noise multiplier <inline-formula id="inf138">
<mml:math id="m165">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3-2-5">
<label>3.2.5</label>
<title>Federated training workflow</title>
<p>The complete federated training and optimization workflow is illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Flowchart of the federated training and optimization process.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g004.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a federated learning process: the server initializes model parameters, selects clients, broadcasts parameters, clients perform local training and secure upload, server aggregates updates, and process repeats if rounds remain; otherwise, training is complete.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-2-6">
<label>3.2.6</label>
<title>Hyperparameters</title>
<p>All hyperparameters used for optimization and federated training are listed in <xref ref-type="table" rid="T5">Table 5</xref>.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Hyperparameters for optimization and federated learning.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Parameter</th>
<th align="center">Symbol</th>
<th align="left">Value/Range</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Learning rate</td>
<td align="center">
<inline-formula id="inf58">
<mml:math id="m68">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">
<inline-formula id="inf59">
<mml:math id="m69">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Betas (AdamW)</td>
<td align="center">
<inline-formula id="inf60">
<mml:math id="m70">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">
<inline-formula id="inf61">
<mml:math id="m71">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0.9</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.999</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Weight decay</td>
<td align="center">
<inline-formula id="inf62">
<mml:math id="m72">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">
<inline-formula id="inf63">
<mml:math id="m73">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Batch size</td>
<td align="center">
<inline-formula id="inf64">
<mml:math id="m74">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">32</td>
</tr>
<tr>
<td align="left">Input window length</td>
<td align="center">
<inline-formula id="inf65">
<mml:math id="m75">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">24</td>
</tr>
<tr>
<td align="left">Forecast horizon</td>
<td align="center">
<inline-formula id="inf66">
<mml:math id="m76">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">6</td>
</tr>
<tr>
<td align="left">Local epochs</td>
<td align="center">
<inline-formula id="inf67">
<mml:math id="m77">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">1</td>
</tr>
<tr>
<td align="left">Proximal term</td>
<td align="center">
<inline-formula id="inf68">
<mml:math id="m78">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">
<inline-formula id="inf69">
<mml:math id="m79">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Clipping norm</td>
<td align="center">
<inline-formula id="inf70">
<mml:math id="m80">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">1.0</td>
</tr>
<tr>
<td align="left">Noise multiplier</td>
<td align="center">
<inline-formula id="inf71">
<mml:math id="m81">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">1.1 (base run)</td>
</tr>
<tr>
<td align="left">Privacy budget</td>
<td align="center">
<inline-formula id="inf72">
<mml:math id="m82">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">
<inline-formula id="inf73">
<mml:math id="m83">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> (base run)</td>
</tr>
<tr>
<td align="left">Communication rounds</td>
<td align="center">
<inline-formula id="inf74">
<mml:math id="m84">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">50</td>
</tr>
<tr>
<td align="left">Encoder layers</td>
<td align="center">
<inline-formula id="inf75">
<mml:math id="m85">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">2</td>
</tr>
<tr>
<td align="left">Embedding dimension</td>
<td align="center">
<inline-formula id="inf76">
<mml:math id="m86">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">64</td>
</tr>
<tr>
<td align="left">Attention heads</td>
<td align="center">&#x2014;</td>
<td align="left">4</td>
</tr>
<tr>
<td align="left">MoE experts</td>
<td align="center">
<inline-formula id="inf77">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MoE</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">4</td>
</tr>
<tr>
<td align="left">Top-<inline-formula id="inf78">
<mml:math id="m88">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> experts</td>
<td align="center">
<inline-formula id="inf79">
<mml:math id="m89">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">2</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s3-2-6-1">
<label>3.2.6.1</label>
<title>Baseline implementation protocol</title>
<p>All baselines use the same input window <inline-formula id="inf139">
<mml:math id="m166">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>24</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, horizon <inline-formula id="inf140">
<mml:math id="m167">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, batch size, number of rounds, and optimization settings (AdamW, learning rate, and local epochs). For fairness, we keep the federated protocol, DP setting (when enabled), and tuning budget identical, while varying only the backbone architecture or aggregation rule.</p>
</sec>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s4">
<label>4</label>
<title>Results</title>
<p>The efficacy of the X-FedFormer model in multi-modal passenger-flow forecasting was rigorously evaluated using a synthetic dataset designed to simulate real-world urban mobility patterns. This dataset encompasses 90 days of hourly passenger-flow records for 30 distinct routes within each of ten participating cities, culminating in a comprehensive total of 64,800 observations per city. Model performance was quantitatively assessed using three widely accepted metrics: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and the coefficient of determination <inline-formula id="inf141">
<mml:math id="m168">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. These metrics were computed at the culmination of the federated training process, specifically at the final communication round, to reflect the model&#x2019;s converged performance.</p>
<p>Each city is split temporally into 70% training, 10% validation, and 20% testing with no shuffling. We run 5 trials with seeds {11, 23, 37, 41, 59}. For statistical testing, we average each city&#x2019;s MAE across seeds and apply a Wilcoxon signed-rank test on the 10 paired cities when comparing X-FedFormer against the strongest baseline (FedProx); seed variability is reported descriptively as mean<inline-formula id="inf142">
<mml:math id="m169">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>std <xref ref-type="bibr" rid="B6">Demsar (2006)</xref>.</p>
<sec id="s4-1">
<label>4.1</label>
<title>Per-city forecasting performance</title>
<p>
<xref ref-type="table" rid="T6">Table 6</xref> presents the forecasting accuracy for each client city at the final federated communication round under Differential Privacy (DP, <inline-formula id="inf143">
<mml:math id="m170">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). To ensure robust statistical validation, we report the Mean <inline-formula id="inf144">
<mml:math id="m171">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Standard Deviation across 5 independent experimental runs initiated with distinct random seeds. The results indicate a highly consistent performance profile across the diverse urban environments. Furthermore, the statistical significance of the performance improvements of X-FedFormer over the strongest baseline (FedProx) was rigorously verified using a one-sided Wilcoxon signed-rank test. The main comparison yields <inline-formula id="inf145">
<mml:math id="m172">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.018</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, confirming that the superior predictive accuracy of our proposed model is statistically significant and not attributable to random variance.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Per-city forecasting performance at the final federated round with DP enabled (<inline-formula id="inf146">
<mml:math id="m173">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>; Mean <inline-formula id="inf147">
<mml:math id="m174">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Std Dev over 5 runs).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">City</th>
<th align="right">
<inline-formula id="inf148">
<mml:math id="m175">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf149">
<mml:math id="m176">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf150">
<mml:math id="m177">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Almaty</td>
<td align="right">
<inline-formula id="inf151">
<mml:math id="m178">
<mml:mrow>
<mml:mn>8.91</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.42</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf152">
<mml:math id="m179">
<mml:mrow>
<mml:mn>18.25</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.88</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf153">
<mml:math id="m180">
<mml:mrow>
<mml:mn>0.899</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.005</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Astana</td>
<td align="right">
<inline-formula id="inf154">
<mml:math id="m181">
<mml:mrow>
<mml:mn>9.07</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.38</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf155">
<mml:math id="m182">
<mml:mrow>
<mml:mn>18.12</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf156">
<mml:math id="m183">
<mml:mrow>
<mml:mn>0.921</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.004</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Karaganda</td>
<td align="right">
<inline-formula id="inf157">
<mml:math id="m184">
<mml:mrow>
<mml:mn>7.30</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.25</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf158">
<mml:math id="m185">
<mml:mrow>
<mml:mn>17.51</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.65</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf159">
<mml:math id="m186">
<mml:mrow>
<mml:mn>0.938</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.003</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Shymkent</td>
<td align="right">
<inline-formula id="inf160">
<mml:math id="m187">
<mml:mrow>
<mml:mn>6.46</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.22</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf161">
<mml:math id="m188">
<mml:mrow>
<mml:mn>15.60</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.55</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf162">
<mml:math id="m189">
<mml:mrow>
<mml:mn>0.935</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.004</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Aktobe</td>
<td align="right">
<inline-formula id="inf163">
<mml:math id="m190">
<mml:mrow>
<mml:mn>8.35</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.31</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf164">
<mml:math id="m191">
<mml:mrow>
<mml:mn>16.46</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.72</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf165">
<mml:math id="m192">
<mml:mrow>
<mml:mn>0.914</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.006</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Pavlodar</td>
<td align="right">
<inline-formula id="inf166">
<mml:math id="m193">
<mml:mrow>
<mml:mn>8.40</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.33</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf167">
<mml:math id="m194">
<mml:mrow>
<mml:mn>17.35</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.68</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf168">
<mml:math id="m195">
<mml:mrow>
<mml:mn>0.929</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.005</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Taraz</td>
<td align="right">
<inline-formula id="inf169">
<mml:math id="m196">
<mml:mrow>
<mml:mn>8.78</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.37</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf170">
<mml:math id="m197">
<mml:mrow>
<mml:mn>20.32</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.95</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf171">
<mml:math id="m198">
<mml:mrow>
<mml:mn>0.925</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.005</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Atyrau</td>
<td align="right">
<inline-formula id="inf172">
<mml:math id="m199">
<mml:mrow>
<mml:mn>5.36</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.18</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf173">
<mml:math id="m200">
<mml:mrow>
<mml:mn>11.41</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.42</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf174">
<mml:math id="m201">
<mml:mrow>
<mml:mn>0.919</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.007</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Kostanay</td>
<td align="right">
<inline-formula id="inf175">
<mml:math id="m202">
<mml:mrow>
<mml:mn>8.11</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.29</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf176">
<mml:math id="m203">
<mml:mrow>
<mml:mn>15.79</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.61</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf177">
<mml:math id="m204">
<mml:mrow>
<mml:mn>0.909</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.006</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Aktau</td>
<td align="right">
<inline-formula id="inf178">
<mml:math id="m205">
<mml:mrow>
<mml:mn>8.51</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.30</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf179">
<mml:math id="m206">
<mml:mrow>
<mml:mn>17.03</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="right">
<inline-formula id="inf180">
<mml:math id="m207">
<mml:mrow>
<mml:mn>0.926</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.005</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F5">Figure 5</xref> visually represents the MAE for each city, allowing for a clear categorization of performance. Three distinct tiers of forecasting accuracy emerged:<list list-type="bullet">
<list-item>
<p>High-Accuracy Cluster (MAE <inline-formula id="inf181">
<mml:math id="m208">
<mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mn>7.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>): Atyrau (5.36), Shymkent (6.46), and Karaganda (7.30) exhibited the lowest MAE values, indicating superior forecasting precision. These cities likely possess more predictable passenger flow dynamics or benefit from data characteristics that align well with the model&#x2019;s learning capabilities.</p>
</list-item>
<list-item>
<p>Lower-Accuracy Group (MAE &#x3e; 8.5): Almaty (8.91), Astana (9.07), Taraz (8.78), and Aktau (8.51) recorded comparatively higher MAE values. This suggests that passenger flow in these cities might be subject to greater inherent variability, more complex exogenous factors, or unique patterns that pose a greater challenge for precise prediction.</p>
</list-item>
<list-item>
<p>Intermediate Band: The remaining cities (Aktobe, Pavlodar, Kostanay) demonstrated MAE values falling between these two extremes, representing a moderate level of forecasting accuracy.</p>
</list-item>
</list>
</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>City-wise Mean Absolute Error (MAE) at the final federated round. Error bars represent one standard deviation over 5 runs, illustrating performance variability across random seeds.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g005.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x22;City-wise MAE Performance at Final Federated Round&#x22; depicts mean absolute error (MAE) in passenger prediction for ten cities, with grouped color coding for performance tiers, labeled error bars, and tier boundaries at MAE equals 7.0 and 8.5.</alt-text>
</graphic>
</fig>
<p>The observed variations underscore the inherent heterogeneity in urban mobility patterns and the differential impact of local characteristics on forecasting performance, even within a federated learning framework.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Aggregate error statistics</title>
<p>To provide a holistic understanding of the X-FedFormer model&#x2019;s performance, aggregate error statistics were computed across all ten participating cities. The average MAE <inline-formula id="inf182">
<mml:math id="m209">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> was found to be 7.93 passengers, with a standard deviation <inline-formula id="inf183">
<mml:math id="m210">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of 1.14 passengers. This low standard deviation indicates a relatively consistent level of absolute error across the diverse urban environments. Similarly, the average RMSE <inline-formula id="inf184">
<mml:math id="m211">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> was 16.78 passengers, with a standard deviation of 2.21 passengers. The larger magnitude of RMSE compared to MAE, while expected, still demonstrates a controlled impact of larger errors on the overall performance. Finally, the average <inline-formula id="inf185">
<mml:math id="m212">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf186">
<mml:math id="m213">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> across all cities was 0.9215, with a remarkably small standard deviation of 0.0113.<disp-formula id="equ4">
<mml:math id="m214">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>7.93</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.14</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:mtext>passengers</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ5">
<mml:math id="m215">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16.78</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.21</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:mtext>passengers</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ6">
<mml:math id="m216">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.9215</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0113</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>These aggregate statistics, visually summarized in <xref ref-type="fig" rid="F6">Figure 6</xref>, collectively affirm that the federated X-FedFormer model maintains a consistently high level of accuracy and explanatory power. The low standard deviations across all metrics highlight the model&#x2019;s robustness and its ability to generalize effectively across geographically and operationally diverse urban contexts, a critical advantage of the federated learning paradigm.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Boxplots illustrating the distribution of (left) Mean Absolute Error (MAE) and (right) Coefficient of Determination <inline-formula id="inf187">
<mml:math id="m217">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> across all cities at the final federated round. The boxplots highlight the median, interquartile range, and potential outliers, demonstrating the consistency of model performance.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g006.tif">
<alt-text content-type="machine-generated">Boxplot graphic comparing two statistical metrics across cities; the left plot shows mean absolute error (MAE) in blue, and the right plot shows coefficient of determination (R&#xB2;) in orange, each with median indicated in red.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Error dispersion and RMSE/MAE ratios</title>
<p>The ratio of Root Mean Squared Error (RMSE) to Mean Absolute Error (MAE) serves as an insightful indicator of the distribution of forecast errors, particularly highlighting the presence and magnitude of occasional large deviations. A higher RMSE/MAE ratio suggests a greater influence of larger errors, implying a heavier tail in the error distribution. <xref ref-type="fig" rid="F7">Figure 7</xref> illustrates this relationship, showing that all cities cluster within a relatively narrow RMSE/MAE ratio range of 1.95&#x2013;2.41. This consistent clustering indicates that the X-FedFormer model, through its architecture and training setup, effectively controls extreme errors across all client environments. The moderate ratio suggests that while some larger errors are present, they are not disproportionately dominant, and the model does not suffer from severe outliers.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Scatter plot of Root Mean Squared Error (RMSE) versus Mean Absolute Error (MAE) for each city at the final federated round. The dashed line denotes the theoretical <inline-formula id="inf188">
<mml:math id="m218">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> condition, highlighting the relative influence of larger errors.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g007.tif">
<alt-text content-type="machine-generated">Scatter plot illustrating root mean squared error (RMSE) on the vertical axis versus mean absolute error (MAE) on the horizontal axis for various cities, with each city labeled; a dashed reference line signifies RMSE equals MAE.</alt-text>
</graphic>
</fig>
<p>Specifically, Shymkent exhibits the highest RMSE/MAE ratio (approximately 2.41), indicating a slightly heavier tail in its error distribution compared to other cities. Taraz and Karaganda also rank among the cities with higher ratios (approximately 2.31 and 2.40, respectively). Conversely, Kostanay shows the lowest ratio (approximately 1.95), indicating a more uniform distribution of errors with fewer significant outliers. The dashed line in <xref ref-type="fig" rid="F7">Figure 7</xref> represents the theoretical <inline-formula id="inf189">
<mml:math id="m219">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> scenario, which would only occur if all errors were identical in magnitude, serving as a baseline for comparison. The observed ratios, which are consistently above 1.0, are typical for real-world forecasting tasks, confirming the presence of varying error magnitudes.</p>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>Variance explained <inline-formula id="inf190">
<mml:math id="m220">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</title>
<p>The coefficient of determination <inline-formula id="inf191">
<mml:math id="m221">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> quantifies the proportion of the variance in the dependent variable (passenger flow) that is predictable from the independent variables (model inputs). In this study, the computed <inline-formula id="inf192">
<mml:math id="m222">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> values are approximately 0.90 or higher for all cities, with a minimum of 0.899 (Almaty) and a maximum of 0.938 (Karaganda). This remarkable consistency highlights the X-FedFormer model&#x2019;s exceptional ability to capture about 90% of the temporal variance inherent in passenger flows. Such high <inline-formula id="inf193">
<mml:math id="m223">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> values are indicative of a strong model fit and robust predictive power, suggesting that the model effectively learns the underlying patterns and relationships governing passenger movement.</p>
<p>Karaganda achieved the highest <inline-formula id="inf194">
<mml:math id="m224">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> of 0.938, closely followed by Shymkent with 0.935. This superior performance in these cities, coupled with their lower MAE values, suggests that their passenger flow patterns might be more stable, less influenced by unpredictable exogenous factors, or exhibit clearer, more discernible trends that the Transformer architecture can effectively leverage. The high <inline-formula id="inf195">
<mml:math id="m225">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> values across the board imply that the model provides highly reliable forecasts, which is crucial for practical applications such as urban planning, traffic management, and public transportation optimization.</p>
</sec>
<sec id="s4-5">
<label>4.5</label>
<title>Temporal consistency of forecasts</title>
<p>To rigorously assess the temporal robustness and generalization capabilities of the X-FedFormer model, a rolling Mean Absolute Error (MAE) was computed over hourly intervals with a 24-h window. This analysis provides insights into how the model&#x2019;s performance fluctuates throughout different times of day and across various temporal contexts. <xref ref-type="fig" rid="F8">Figure 8</xref> illustrates the hourly rolling MAE for each city. A key observation is that all cities exhibit relatively flat curves with only minor diurnal fluctuations. This consistency confirms that the model generalizes exceptionally well across different times of day, demonstrating its ability to maintain high accuracy regardless of the hourly variations in passenger flow.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Hourly rolling Mean Absolute Error (MAE) with a 24-h window for each city at the final federated round. The plot illustrates the model&#x2019;s forecasting performance, temporal consistency and diurnal stability.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g008.tif">
<alt-text content-type="machine-generated">Line graph showing hourly rolling mean absolute error (MAE) over 336 hours for ten cities. Each city is represented by a different color, with Atyrau maintaining the lowest MAE and Almaty and Astana the highest. Vertical axis is labeled &#x201C;24-hour Rolling Mean Absolute Error (MAE) [passengers],&#x201D; and the horizontal axis is labeled &#x201C;Time [Hours].&#x201D;</alt-text>
</graphic>
</fig>
<p>The cities of Atyrau and Shymkent, which also demonstrated superior aggregate MAE performance, displayed the flattest rolling MAE profiles. This indicates that their forecasting accuracy is remarkably stable throughout the 24-h cycle, with minimal degradation during peak or off-peak hours. This temporal stability is a critical attribute for real-world deployment, as it ensures reliable predictions for operational decision-making at any given time. The model&#x2019;s capacity to handle the inherent periodicity and non-stationarity of time-series passenger flow data is thus strongly validated.</p>
</sec>
<sec id="s4-6">
<label>4.6</label>
<title>Spatial heterogeneity in forecast skill</title>
<p>An investigation into spatial heterogeneity was conducted by grouping cities based on their geographic regions (northern vs. southern) to ascertain if systematic biases in forecast skill were present. <xref ref-type="fig" rid="F9">Figure 9</xref> presents a comparison of mean MAE values for these regional groupings. The analysis revealed minimal systematic bias attributable to geographic location. Northern cities, exemplified by Kostanay and Pavlodar, achieved mean MAE values within the range of 7.5&#x2013;9.0 passengers. Similarly, southern cities, such as Shymkent and Taraz, exhibited mean MAE values within a comparable range of 6.5&#x2013;9.0 passengers.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Regional Mean Absolute Error (MAE) comparison: northern versus southern cities. Bars denote the mean MAE for each regional group, while whiskers indicate plus/minus one standard deviation, illustrating the consistency of performance across geographic divisions.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g009.tif">
<alt-text content-type="machine-generated">Horizontal bar chart comparing mean absolute error values across five Kazakhstan regions for passenger data, with error bars showing variability. Northern Kazakhstan has the highest error at 8.59, followed by Eastern Kazakhstan at 8.40, Southern Kazakhstan at 8.05, Western Kazakhstan at 7.41, and Central Kazakhstan at 7.30. Each bar is color-coded and labels include the specific cities within each region.</alt-text>
</graphic>
</fig>
<p>This overlap and lack of significant divergence in performance between the northern and southern clusters strongly indicate that the federated X-FedFormer model effectively accommodates the diverse climatic conditions, urban structures, and operational characteristics inherent to different regions. The model&#x2019;s ability to learn from a distributed dataset across varied environments, facilitated by the federated learning framework, mitigates the impact of spatial heterogeneity, ensuring a generalized and robust forecasting capability across the entire network of participating cities. This finding underscores the model&#x2019;s adaptability and its potential for broad applicability in diverse urban settings.</p>
</sec>
<sec id="s4-7">
<label>4.7</label>
<title>Convergence dynamics of federated training</title>
<p>To thoroughly understand the learning dynamics and stability of the federated optimization process, the evolution of aggregated forecast errors and explained variance was meticulously tracked over 50 communication rounds. This analysis provides critical insights into the efficiency and convergence behavior of the X-FedFormer model under a differentially private federated learning paradigm.</p>
<p>
<xref ref-type="fig" rid="F10">Figure 10</xref> illustrates the trajectory of the average Mean Absolute Error (MAE) across all client models at each communication round. A pronounced and rapid decrease in MAE is evident during the initial 10 rounds, signifying efficient knowledge transfer and model refinement in the early stages of federated aggregation. Following this rapid improvement, the rate of MAE reduction gradually tapers off, indicating diminishing returns from subsequent global aggregations. The model effectively reaches a performance plateau by approximately round 40, suggesting that further communication rounds yield only marginal improvements in forecasting accuracy. This convergence behavior is characteristic of well-behaved optimization processes in federated learning, where initial global model updates provide substantial benefits, followed by fine-tuning.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Evolution of the aggregated Mean Absolute Error (MAE) across 50 federated communication rounds. The plot demonstrates rapid initial convergence followed by a plateau, indicating efficient learning dynamics.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g010.tif">
<alt-text content-type="machine-generated">Line chart illustrating average Mean Absolute Error (MAE), Root Mean Square Error (RMSE), and R squared (R&#xB2;) values across communication rounds. MAE and RMSE decrease, while R&#xB2; increases, indicating improving model accuracy.</alt-text>
</graphic>
</fig>
<p>
<xref ref-type="table" rid="T7">Table 7</xref> provides a quantitative summary of the key aggregated forecast metrics&#x2014;<inline-formula id="inf196">
<mml:math id="m226">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf197">
<mml:math id="m227">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf198">
<mml:math id="m228">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>&#x2014;at selected communication rounds (1, 10, 20, 30, 40, 50). This tabular representation corroborates the visual trends observed in <xref ref-type="fig" rid="F10">Figure 10</xref>. For instance, the <inline-formula id="inf199">
<mml:math id="m229">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> dramatically drops from 23.69&#xa0;at Round 1 to 13.24&#xa0;at Round 10, and further to 9.68&#xa0;at Round 20, before stabilizing around 7.93 by Round 50.</p>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>Evolution of average forecast metrics across communication rounds (selected). Metrics include Mean Absolute Error (MAE), Coefficient of Determination <inline-formula id="inf200">
<mml:math id="m230">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and Root Mean Squared Error (RMSE).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Round</th>
<th align="right">
<inline-formula id="inf201">
<mml:math id="m231">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf202">
<mml:math id="m232">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf203">
<mml:math id="m233">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">1</td>
<td align="right">23.69</td>
<td align="right">40.64</td>
<td align="right">0.263</td>
</tr>
<tr>
<td align="center">10</td>
<td align="right">13.24</td>
<td align="right">29.23</td>
<td align="right">0.637</td>
</tr>
<tr>
<td align="center">20</td>
<td align="right">9.68</td>
<td align="right">18.49</td>
<td align="right">0.856</td>
</tr>
<tr>
<td align="center">30</td>
<td align="right">8.65</td>
<td align="right">17.20</td>
<td align="right">0.888</td>
</tr>
<tr>
<td align="center">40</td>
<td align="right">8.10</td>
<td align="right">16.95</td>
<td align="right">0.905</td>
</tr>
<tr>
<td align="center">50</td>
<td align="right">7.93</td>
<td align="right">16.78</td>
<td align="right">0.922</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The convergence behavior indicates that 50 rounds are sufficient for the model to achieve a high level of performance and stability.</p>
</sec>
<sec id="s4-8">
<label>4.8</label>
<title>Ablation study and baseline comparison</title>
<p>To validate the contribution of specific architectural components (MoE and Seasonal Decomposition), we conducted a comprehensive ablation study. To strictly isolate the performance gains attributable to the federated setup and DP, we first provide a <inline-formula id="inf204">
<mml:math id="m234">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> control comparison using the same architecture (<xref ref-type="table" rid="T8">Table 8</xref>). In this table, the dataset, hyperparameters, and evaluation protocol are identical; only the training regime (centralized vs. federated) and DP flag change. We then report architectural ablations and baseline comparisons under a fixed <italic>Federated, No-DP</italic> regime to make attribution unambiguous.</p>
<table-wrap id="T8" position="float">
<label>TABLE 8</label>
<caption>
<p>Controlled comparison isolating training regime and DP (X-FedFormer only, Avg over all cities).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Training regime</th>
<th align="center">DP?</th>
<th align="left">Model</th>
<th align="right">
<inline-formula id="inf205">
<mml:math id="m235">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf206">
<mml:math id="m236">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf207">
<mml:math id="m237">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Centralized</td>
<td align="center">No</td>
<td align="left">X-FedFormer</td>
<td align="right">6.92</td>
<td align="right">15.10</td>
<td align="right">0.941</td>
</tr>
<tr>
<td align="left">Centralized</td>
<td align="center">Yes</td>
<td align="left">X-FedFormer</td>
<td align="right">7.25</td>
<td align="right">15.85</td>
<td align="right">0.934</td>
</tr>
<tr>
<td align="left">Federated</td>
<td align="center">No</td>
<td align="left">X-FedFormer</td>
<td align="right">7.15</td>
<td align="right">15.65</td>
<td align="right">0.935</td>
</tr>
<tr>
<td align="left">Federated</td>
<td align="center">Yes</td>
<td align="left">X-FedFormer</td>
<td align="right">7.93</td>
<td align="right">16.78</td>
<td align="right">0.922</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Additionally, we compared our method against recent state-of-the-art baselines. As shown in <xref ref-type="table" rid="T9">Table 9</xref>, the full X-FedFormer model significantly outperforms the ablated versions and standard baselines under the same Federated, No-DP protocol; DP is disabled for all rows in <xref ref-type="table" rid="T9">Table 9</xref> and the federated protocol is identical.</p>
<table-wrap id="T9" position="float">
<label>TABLE 9</label>
<caption>
<p>Ablation Study and Baseline Comparison (Federated, No-DP; Avg over all cities).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="right">
<inline-formula id="inf208">
<mml:math id="m238">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf209">
<mml:math id="m239">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="right">
<inline-formula id="inf210">
<mml:math id="m240">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">X-FedFormer (Full)</td>
<td align="right">
<bold>7.15</bold>
</td>
<td align="right">
<bold>15.65</bold>
</td>
<td align="right">
<bold>0.935</bold>
</td>
</tr>
<tr>
<td align="left">w/o MoE</td>
<td align="right">7.95</td>
<td align="right">17.10</td>
<td align="right">0.907</td>
</tr>
<tr>
<td align="left">w/o Decomposition</td>
<td align="right">8.28</td>
<td align="right">17.70</td>
<td align="right">0.897</td>
</tr>
<tr>
<td align="left">FedAvg (Baseline)</td>
<td align="right">9.20</td>
<td align="right">19.95</td>
<td align="right">0.865</td>
</tr>
<tr>
<td align="left">FedProx (Baseline)</td>
<td align="right">8.85</td>
<td align="right">19.10</td>
<td align="right">0.878</td>
</tr>
<tr>
<td align="left">Baseline (CNN)</td>
<td align="right">9.75</td>
<td align="right">20.85</td>
<td align="right">0.852</td>
</tr>
<tr>
<td align="left">Baseline (Trans)</td>
<td align="right">9.40</td>
<td align="right">20.10</td>
<td align="right">0.861</td>
</tr>
<tr>
<td align="left">Freight-FL (<xref ref-type="bibr" rid="B22">Shen et al., 2024a</xref>)</td>
<td align="right">9.05</td>
<td align="right">19.40</td>
<td align="right">0.872</td>
</tr>
<tr>
<td align="left">FlightDelay-FL (<xref ref-type="bibr" rid="B23">Shen et al., 2024b</xref>)</td>
<td align="right">9.18</td>
<td align="right">19.70</td>
<td align="right">0.868</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold values indicate the best-performing model for each evaluation metric (MAE, RMSE, R<sup>2</sup>) across all compared methods.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>As illustrated in <xref ref-type="fig" rid="F11">Figure 11</xref>, the removal of the MoE block resulted in an MAE increase of approximately 11%, underscoring the critical role of adaptive expert routing in managing data heterogeneity. Similarly, the exclusion of the Seasonal Decomposition layers degraded performance by roughly 16%, confirming that explicit temporal modeling is indispensable for transit data, which exhibits strong periodicities.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Impact of removing key architectural components (MoE and Seasonal Decomposition) on forecasting error (MAE). The full X-FedFormer model achieves the lowest error, validating the synergy of its components.</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g011.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x201C;Ablation Study: Component Impact&#x201D; compares MAE values for X-FedFormer (Full), w/o MoE, and w/o Decomp; respective scores are 7.15, 7.95, and 8.28, with error bars shown for each bar.</alt-text>
</graphic>
</fig>
<p>Comparative analysis against external benchmarks reveals that X-FedFormer significantly outperforms both the decentralized freight-speed model <xref ref-type="bibr" rid="B22">Shen et al. (2024a)</xref> and the federated flight-delay model <xref ref-type="bibr" rid="B23">Shen et al. (2024b)</xref>. We implement their spatio-temporal blocks as architectural baselines under the same federated protocol to ensure fairness. Unlike these unified models, our MoE-based approach offers superior personalization capabilities, enabling it to better capture the distinct dynamic profiles of individual cities.</p>
<sec id="s4-8-1">
<label>4.8.1</label>
<title>MoE utilization diagnostic</title>
<p>To verify that the MoE does not collapse to a single expert, we report the average routing distribution across experts (<xref ref-type="fig" rid="F12">Figure 12</xref>). Expert utilization remains balanced (0.20&#x2013;0.31 per expert) with high routing entropy (1.36, close to the maximum 1.39 for four experts), indicating meaningful specialization rather than collapse.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Average expert utilization across cities and rounds (MoE with <inline-formula id="inf211">
<mml:math id="m241">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MoE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf212">
<mml:math id="m242">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>).</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g012.tif">
<alt-text content-type="machine-generated">Bar chart titled Average Expert Utilization compares routing share for four experts: Expert 1 at zero point three one, Expert 2 at zero point two seven, Expert 3 at zero point two two, and Expert 4 at zero point two zero.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4-9">
<label>4.9</label>
<title>Privacy-utility tradeoff analysis</title>
<p>We rigorously analyzed the impact of the differential privacy budget <inline-formula id="inf213">
<mml:math id="m243">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> on forecasting performance to characterize the privacy-utility frontier. <xref ref-type="table" rid="T10">Table 10</xref> quantifies the exact performance metrics across a range of <inline-formula id="inf214">
<mml:math id="m244">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> values, where the noise multiplier <inline-formula id="inf215">
<mml:math id="m245">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> was calibrated using the Opacus privacy accountant to achieve the target budget over <inline-formula id="inf216">
<mml:math id="m246">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>50</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> rounds while keeping the sampling rate, clipping norm, and number of steps fixed.</p>
<table-wrap id="T10" position="float">
<label>TABLE 10</label>
<caption>
<p>Privacy-Utility Tradeoff: Impact of varying <inline-formula id="inf217">
<mml:math id="m247">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (via calibrated <inline-formula id="inf218">
<mml:math id="m248">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) on model performance.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Privacy budget <inline-formula id="inf219">
<mml:math id="m249">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Noise multiplier <inline-formula id="inf220">
<mml:math id="m250">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf221">
<mml:math id="m251">
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf222">
<mml:math id="m252">
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf223">
<mml:math id="m253">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">0.5 (High Privacy)</td>
<td align="center">1.85</td>
<td align="center">9.45</td>
<td align="center">19.82</td>
<td align="center">0.864</td>
</tr>
<tr>
<td align="center">1.0</td>
<td align="center">1.35</td>
<td align="center">8.60</td>
<td align="center">18.15</td>
<td align="center">0.890</td>
</tr>
<tr>
<td align="center">2.0 <bold>(Selected)</bold>
</td>
<td align="center">1.10</td>
<td align="center">7.93</td>
<td align="center">16.78</td>
<td align="center">0.922</td>
</tr>
<tr>
<td align="center">5.0</td>
<td align="center">0.80</td>
<td align="center">7.35</td>
<td align="center">16.05</td>
<td align="center">0.930</td>
</tr>
<tr>
<td align="center">10.0 (Low Privacy)</td>
<td align="center">0.55</td>
<td align="center">7.20</td>
<td align="center">15.80</td>
<td align="center">0.933</td>
</tr>
<tr>
<td align="center">
<inline-formula id="inf224">
<mml:math id="m254">
<mml:mrow>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (No DP)</td>
<td align="center">0.00</td>
<td align="center">7.15</td>
<td align="center">15.65</td>
<td align="center">0.935</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F13">Figure 13</xref> delineates the trajectories of MAE, RMSE, and <inline-formula id="inf225">
<mml:math id="m255">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as <inline-formula id="inf226">
<mml:math id="m256">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is varied from 0.5 (high privacy regime) to 10 (low privacy regime).</p>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>Impact of Privacy Budget <inline-formula id="inf227">
<mml:math id="m257">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> on model utility (MAE, RMSE, <inline-formula id="inf228">
<mml:math id="m258">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>).</p>
</caption>
<graphic xlink:href="ffutr-07-1644979-g013.tif">
<alt-text content-type="machine-generated">Line chart illustrating the privacy-utility tradeoff by plotting performance metrics against privacy budget epsilon. Red line with circles shows MAE decreasing, orange line with squares shows RMSE decreasing, and blue dashed line with triangles shows R squared increasing as epsilon rises.</alt-text>
</graphic>
</fig>
<p>As anticipated, stricter privacy constraints (lower <inline-formula id="inf229">
<mml:math id="m259">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) necessitate larger noise injection, resulting in elevated error metrics. However, the performance degradation is observed to stabilize significantly around <inline-formula id="inf230">
<mml:math id="m260">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. This plateau suggests an optimal operating point wherein robust privacy guarantees can be upheld without incurring catastrophic utility losses, making the framework viable for practical deployment.</p>
</sec>
<sec id="s4-10">
<label>4.10</label>
<title>Runtime and complexity analysis</title>
<p>Runtime measurements were collected on a workstation with an Intel i9-12900K CPU, 64GB RAM, and an NVIDIA RTX 3090 GPU using PyTorch 2.1 (float32). A detailed comparison of training time, inference latency, communication overhead, peak memory usage, and parameter count is provided in <xref ref-type="table" rid="T11">Table 11</xref>. Per-round training time is measured with 10 clients per round, batch size 32, <inline-formula id="inf231">
<mml:math id="m261">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>24</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf232">
<mml:math id="m262">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>; inference latency is reported per step on the same device. Communication overhead is reported per client per round as upload &#x2b; download <inline-formula id="inf233">
<mml:math id="m263">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> parameters <inline-formula id="inf234">
<mml:math id="m264">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> bytes (no compression), excluding network latency. While X-FedFormer introduces a modest overhead due to MoE gating, top-<inline-formula id="inf235">
<mml:math id="m265">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> routing <inline-formula id="inf236">
<mml:math id="m266">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> ensures compute scales with selected experts rather than linearly with all <inline-formula id="inf237">
<mml:math id="m267">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MoE</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> experts.</p>
<table-wrap id="T11" position="float">
<label>TABLE 11</label>
<caption>
<p>Runtime and complexity analysis.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th align="center">Train time</th>
<th align="center">Inference</th>
<th align="center">Comm overhead</th>
<th align="center">Peak mem</th>
<th align="center">Params</th>
</tr>
<tr>
<th align="center">(s/round)</th>
<th align="center">(ms/step)</th>
<th align="center">(MB/round)</th>
<th align="center">(MB)</th>
<th align="center">(M)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">X-FedFormer</td>
<td align="center">145.2</td>
<td align="center">12.5</td>
<td align="center">16.8</td>
<td align="center">412</td>
<td align="center">2.1</td>
</tr>
<tr>
<td align="left">FedAvg</td>
<td align="center">85.1</td>
<td align="center">5.2</td>
<td align="center">11.2</td>
<td align="center">280</td>
<td align="center">1.4</td>
</tr>
<tr>
<td align="left">FedProx</td>
<td align="center">92.3</td>
<td align="center">5.5</td>
<td align="center">11.2</td>
<td align="center">285</td>
<td align="center">1.4</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="s5">
<label>5</label>
<title>Discussion</title>
<p>This study introduces a pioneering federated learning framework that, through the integration of differentially private adaptive deep learning, demonstrably enhances the capabilities of cross-city transit flow forecasting. The empirical results, derived from a meticulously validated synthetic dataset, unequivocally underscore the efficacy of our proposed &#x201c;Federated Adaptive Decomposed Learning&#x201d; approach in generating accurate and stable predictions across highly diverse urban environments, all while rigorously upholding data privacy. This section delves into a more analytical interpretation of these findings, exploring their implications, broader applications, and inherent limitations.</p>
<sec id="s5-1">
<label>5.1</label>
<title>Interpretation of performance in the context of urban heterogeneity</title>
<p>The core experimental results, particularly the city-wise Mean Absolute Error (MAE) at the final federated round (<xref ref-type="fig" rid="F5">Figure 5</xref>), provide compelling evidence of the model&#x2019;s overall predictive strength. Cities such as Atyrau (MAE: 5.36 passengers) consistently achieved exceptional accuracy, aligning with a high-performance tier critical for operational dispatch and resource optimization. While other cities, including Almaty (MAE: 8.91) and Astana (MAE: 9.07), presented slightly higher MAE values, these figures remain within an acceptable tolerance for real-world transit planning. This variance in performance across cities is not merely an indicator of differing predictability but a direct reflection of the inherent urban heterogeneity that characterizes distinct metropolitan areas. Factors such as city size, population density, public transit network complexity, usage patterns (e.g., commuter-heavy vs. tourism-driven), and underlying socio-economic dynamics contribute to widely varying data distributions.</p>
<p>The success in navigating this significant Non-IID challenge is largely attributable to the synergistic application of the Mixture-of-Experts (MoE) architecture and Seasonal Decomposition. The MoE mechanism allows the global model to not only accommodate but actively leverage these disparate data patterns. Rather than forcing a single model to generalize across vastly different urban contexts, the MoE facilitates the development of specialized &#x201c;expert&#x201d; sub-networks. We posit that during the federated training, the gating network within the MoE learns to assign input transit data from specific cities (or even specific periods within a city) to the most relevant expert(s). For instance, an expert might specialize in high-volume, highly dynamic cities, while another becomes proficient in smaller, more predictable urban environments. This adaptive expertise enables the model to tailor its predictive logic dynamically, preventing overfitting to dominant data patterns and ensuring robust performance across the entire spectrum of urban diversity.</p>
<p>Furthermore, the explicit incorporation of Seasonal Decomposition into the deep learning architecture proved crucial for managing the complex temporal characteristics of transit flows. By disentangling the time series into trend and seasonal components, the model can separately learn the underlying periodic patterns (e.g., daily commutes, weekly variations, annual cycles) from more irregular dynamics. This decomposition simplifies the learning task for the neural network and, crucially, for the MoE experts. It allows the experts to focus their learning capacity on less predictable trend dynamics while robustly leveraging strong, recurring seasonal signals that are often stable even across heterogeneous cities. This architectural foresight enabled more accurate and interpretable predictions than approaches that would attempt to learn all temporal patterns implicitly.</p>
<p>The consistency observed in the 24-h rolling MAE (<xref ref-type="fig" rid="F8">Figure 8</xref>) across the 336-h evaluation period further validates the robustness of our approach. This metric, capturing short-term predictive stability, indicates that the model is not prone to sudden degradations in accuracy over different operational windows. The smooth curves, even with some fluctuations corresponding to likely daily peak/off-peak cycles within the synthetic data, demonstrate the model&#x2019;s ability to maintain performance consistency despite the inherent dynamism of transit demand.</p>
</sec>
<sec id="s5-2">
<label>5.2</label>
<title>Implications of federated learning and privacy preservation</title>
<p>The successful implementation of our federated learning framework carries profound implications for data governance and collaborative AI initiatives in sensitive domains like urban mobility. By enabling the collaborative training of a global model without requiring raw data centralization, FL directly addresses critical concerns around data sovereignty, regulatory compliance (e.g., GDPR, local data protection laws), and competitive advantages among transit agencies. This opens avenues for unprecedented levels of data-driven intelligence across cities that would otherwise be impossible due to privacy barriers. Our work demonstrates that sophisticated, high-performance models can be built even when data remains decentralized and confidential, fostering a new paradigm for urban data ecosystems.</p>
<p>The integration of Differential Privacy (DP) elevates the privacy guarantees beyond the inherent benefits of FL. While FL prevents direct data sharing, gradient exchanges can still be vulnerable to reconstruction or inference attacks. DP, by introducing mathematically quantifiable noise to the model updates, provides a strong, provable guarantee that no individual city&#x2019;s specific data points can be inferred from the shared gradients. This is critical for establishing trust among participating entities and ensuring the solution&#x2019;s deployability in real-world, privacy-sensitive environments. The achieved MAE performance, even with the application of DP, indicates a successful balancing of the privacy-utility trade-off. This suggests that the chosen DP budget and mechanism are practical for operational use, providing sufficient accuracy while meeting stringent privacy requirements. Such privacy guarantees are not merely a technical add-on but a fundamental enabler for the adoption of AI in public sectors dealing with sensitive citizen data.</p>
<p>Moreover, the federated approach implicitly supports scalability. Adding new cities to the collaboration only requires them to integrate with the federated training protocol, without demanding extensive data migration or re-architecture of a central data lake. This distributed nature reduces the computational and communication burden on any single central entity, making the system inherently more resilient and scalable for a growing network of participating cities.</p>
</sec>
<sec id="s5-3">
<label>5.3</label>
<title>The foundational role of synthetic data for reproducible research</title>
<p>The development and extensive use of a statistically validated synthetic data generation framework represents a methodological contribution, particularly pertinent in domains where real-world data sharing is highly restricted. This framework was not simply a substitute for real data; it served as a crucial &#x201c;enabler&#x201d; for rigorous, reproducible research into complex federated systems involving sensitive information and extreme heterogeneity.</p>
<p>By simulating realistic transit flow patterns, encompassing both general trends and city-specific nuances, the synthetic dataset provided a controlled yet representative environment for experimentation. This allowed for:<list list-type="bullet">
<list-item>
<p>Systematic Evaluation: The ability to precisely control parameters related to heterogeneity (e.g., degree of Non-IID, number of cities, underlying seasonal patterns) enabled systematic testing and deeper understanding of the model&#x2019;s behavior under various conditions.</p>
</list-item>
<list-item>
<p>Reproducibility: The synthetic nature ensures that the exact experimental conditions can be replicated by other researchers, fostering transparency and verification of results.</p>
</list-item>
<list-item>
<p>Ethical Research: It allowed for the development and testing of privacy-preserving techniques without exposing any real-world sensitive mobility data, adhering to ethical research standards from the outset.</p>
</list-item>
<list-item>
<p>Benchmarking: The framework provides a standardized platform for future comparisons of federated learning algorithms designed for heterogeneous time series, advancing research in this critical area.</p>
</list-item>
</list>
</p>
<p>The statistical validation of the synthetic data against known characteristics of real transit systems further strengthens the generalizability of our findings, building confidence that the observed performance and insights are relevant to real-world deployment.</p>
</sec>
<sec id="s5-4">
<label>5.4</label>
<title>Broader implications and adaptability across domains</title>
<p>The methodological innovations presented in this work extend far beyond the specific application of cross-city transit forecasting, offering a versatile paradigm for distributed, privacy-preserving machine learning across heterogeneous data sources.<list list-type="bullet">
<list-item>
<p>Smart City Utilities: The framework can be directly adapted for other urban utility forecasting challenges, such as decentralized energy consumption prediction across smart grids, water usage forecasting in different districts, or waste generation prediction across municipalities, all of which often involve sensitive, heterogeneous, and distributed data.</p>
</list-item>
<list-item>
<p>Healthcare and Biomedicine: In a highly privacy-sensitive field like healthcare, this framework could enable collaborative research across multiple hospitals or clinics for disease prediction, personalized treatment planning, or epidemiological forecasting. The MoE aspect would be crucial for handling patient-specific data heterogeneity (e.g., demographics, comorbidities), while DP ensures patient confidentiality.</p>
</list-item>
<list-item>
<p>IoT and Edge Analytics: For large-scale Internet of Things (IoT) deployments where devices generate vast amounts of heterogeneous time-series data (e.g., industrial sensors, environmental monitoring), our federated architecture can facilitate on-device learning and aggregation, minimizing data transfer and ensuring privacy while adapting to diverse sensor characteristics.</p>
</list-item>
<list-item>
<p>Supply Chain and Logistics: Forecasting demand and managing inventory across geographically dispersed warehouses or retail outlets, each with unique local customer bases and seasonal demand patterns, presents another compelling application. The MoE can adapt to regional demand characteristics, and seasonal decomposition can capture local buying habits.</p>
</list-item>
<list-item>
<p>Financial Time Series: While often complex, forecasting financial metrics across different markets or asset classes, each with distinct underlying dynamics and regulatory restrictions on data sharing, could benefit from a similar adaptive, decomposed, and privacy-preserving federated approach.</p>
</list-item>
</list>
</p>
<p>The core &#x201c;Adaptive Decomposed Learning&#x201d; paradigm, combining adaptive experts with explicit temporal component modeling, is broadly applicable to any complex, heterogeneous time-series forecasting problem, particularly those characterized by strong seasonal components and diverse entities, making it a powerful contribution to the broader field of machine learning.</p>
</sec>
<sec id="s5-5">
<label>5.5</label>
<title>Limitations and future research directions</title>
<p>While this research provides a robust foundation, several limitations offer fertile ground for future work:<list list-type="bullet">
<list-item>
<p>Transition to Real-World Data and External Factors: The current study primarily relies on statistically validated synthetic data. Future efforts will involve deploying and evaluating the framework on real-world multi-city transit datasets, which may present unforeseen complexities such as missing data, outliers, and real-time noise. Additionally, incorporating external factors like weather conditions, public holidays, or planned events into the forecasting model will be crucial for enhancing predictive accuracy in operational settings.</p>
</list-item>
<list-item>
<p>Optimizing Privacy-Utility Trade-offs and Advanced DP: Although Differential Privacy was successfully integrated, the optimal balance between privacy budget <inline-formula id="inf238">
<mml:math id="m268">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and model accuracy can be highly application-dependent. Future research will explore adaptive DP mechanisms that dynamically adjust noise levels based on data sensitivity, model convergence, or specific client contributions, aiming for greater efficiency and tighter privacy guarantees with minimal accuracy sacrifice.</p>
</list-item>
<list-item>
<p>Exploring Advanced Federated Personalization Strategies: While MoE provides adaptation, more explicit personalized federated learning (pFL) techniques, such as FedMeta, FedAvgM, or client clustering, could be investigated. These methods allow for client-specific model components while leveraging global knowledge, potentially yielding even higher accuracy for cities with highly unique transit behaviors.</p>
</list-item>
<list-item>
<p>Communication and Computational Efficiency at Scale: Deploying such complex models (MoE with DP) in large-scale federated networks involving hundreds or thousands of clients will require further optimization of communication overhead (e.g., gradient compression, sparsification, asynchronous updates) and computational costs on client devices.</p>
</list-item>
<list-item>
<p>Model Interpretability and Explainable AI (XAI): As deep learning models become more complex, their black-box nature can hinder adoption in critical applications. Future work could focus on developing XAI techniques specific to federated MoE models to understand how different experts contribute to predictions for specific cities or scenarios, and how DP impacts interpretability.</p>
</list-item>
<list-item>
<p>Robustness to Adversarial Attacks and Data Poisoning: Investigating the framework&#x2019;s resilience against various adversarial attacks, including data poisoning, model poisoning, and inference attacks, within the federated and differentially private setting.</p>
</list-item>
</list>
</p>
<p>In summary, this research marks a substantial advance in privacy-preserving, collaborative, and highly adaptive AI solutions for urban mobility. By tackling the complexities of cross-city data heterogeneity, it paves the way for more intelligent, efficient, and ethical public transportation systems globally.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<label>6</label>
<title>Conclusion</title>
<p>This paper presents a robust federated learning framework, termed &#x201c;Federated Adaptive Decomposed Learning,&#x201d; designed to address the critical challenges of cross-city transit flow forecasting amidst inherent urban data heterogeneity and stringent privacy requirements. By synergistically integrating advanced deep learning architectural components with privacy-preserving mechanisms, our approach offers a scalable and ethically compliant solution for collaborative urban mobility analytics.</p>
<p>Our methodology is underpinned by three core innovations: (1) a federated learning paradigm that enables distributed model training without raw data centralization, (2) the rigorous application of Differential Privacy to guarantee the confidentiality of individual city data during model updates, and (3) a sophisticated deep learning architecture that combines a Mixture-of-Experts (MoE) block with an adaptive Seasonal-Trend Decomposition module. This unique architectural synthesis is specifically engineered to effectively disentangle complex temporal patterns and dynamically adapt to the diverse, non-IID characteristics of transit data across multiple cities. Furthermore, to facilitate comprehensive evaluation in a privacy-sensitive domain, we developed and statistically validated a synthetic data generation framework that faithfully replicates real-world urban transit dynamics.</p>
<p>The empirical results underscore the efficacy and robustness of our proposed framework. The model consistently achieved competitive forecasting accuracy across all participating synthetic cities, with Mean Absolute Error (MAE) values predominantly ranging from approximately 5.36 passengers in highly predictable environments (e.g., Atyrau) to around 9.07 passengers in more complex urban settings (e.g., Astana), as detailed in <xref ref-type="fig" rid="F5">Figure 5</xref>. The round-wise evolution of performance metrics (refer to <xref ref-type="fig" rid="F10">Figure 10</xref>) demonstrated rapid initial convergence, with the Coefficient of Determination (<italic>R</italic>
<sup>2</sup>) approaching 0.90 by Round 40 and reaching 0.922 by Round 50, highlighting efficient learning dynamics and the model&#x2019;s strong predictive power. Moreover, the analysis of the 24-h rolling MAE (as shown in <xref ref-type="fig" rid="F8">Figure 8</xref>) confirmed the temporal stability and consistency of the model&#x2019;s predictions over extended periods, a crucial aspect for operational reliability. The regional MAE comparison (<xref ref-type="fig" rid="F9">Figure 9</xref>) further illustrated the model&#x2019;s adaptive capacity to maintain robust performance across geographically diverse urban profiles.</p>
<p>In summary, this research delivers an advancement in privacy-preserving AI for smart cities. It demonstrates that high-fidelity, adaptive forecasting models can be collaboratively developed and deployed across heterogeneous urban environments without compromising sensitive data. This work not only provides a practical solution for enhancing urban transit efficiency but also establishes a foundational paradigm for distributed, privacy-aware machine learning applicable to a wide array of real-world problems characterized by decentralized and sensitive time-series data. Our findings pave the way for future innovations in urban intelligence, fostering smarter, more efficient, and privacy-respecting public services.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: The primary dataset comprises two parts: (1) a proprietary Astana bus-transportation log provided under a non-disclosure agreement with the municipal transit authority, and (2) a fully synthetic multi-city dataset generated from that log. Due to licensing and privacy considerations, the original Astana hourly inflow/outflow records cannot be redistributed or published. Instead, we release only the synthetic dataset, which faithfully reproduces key temporal and seasonal patterns without revealing any real passenger trajectories or personally identifiable information. The synthetic data are made available under a Creative Commons Attribution (CC-BY 4.0) license via our project repository, along with the complete generation scripts and parameter settings. Researchers wishing to examine aggregate statistics of the original data or to validate the synthetic-data fidelity may contact the corresponding author for limited, read-only access under a separate data-use agreement. Requests to access these datasets should be directed to Diar Begisbayev, <email>begisbayev@gmail.com</email>.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>AS: Validation, Methodology, Investigation, Formal Analysis, Supervision, Writing &#x2013; review and editing, Software, Data curation, Visualization, Project administration, Resources, Conceptualization, Writing &#x2013; original draft, Funding acquisition. ZU: Supervision, Data curation, Formal Analysis, Methodology, Software, Visualization, Resources, Conceptualization, Funding acquisition, Investigation, Validation, Writing &#x2013; review and editing, Project administration, Writing &#x2013; original draft. DB: Writing &#x2013; original draft, Resources, Investigation, Software, Funding acquisition, Visualization, Data curation, Writing &#x2013; review and editing, Formal Analysis, Validation, Conceptualization, Project administration, Methodology, Supervision. AM: Visualization, Conceptualization, Investigation, Resources, Software, Data curation, Formal Analysis, Project administration, Supervision, Writing &#x2013; review and editing, Writing &#x2013; original draft, Methodology, Validation, Funding acquisition. RS: Funding acquisition, Software, Formal Analysis, Writing &#x2013; original draft, Conceptualization, Resources, Visualization, Supervision, Methodology, Project administration, Writing &#x2013; review and editing, Investigation, Validation, Data curation. DY: Software, Writing &#x2013; original draft, Investigation, Writing &#x2013; review and editing, Resources, Funding acquisition, Data curation, Methodology, Validation, Formal Analysis, Visualization, Project administration, Supervision, Conceptualization.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Abadi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goodfellow</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>McMahan</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Mironov</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Talwar</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep learning with differential privacy</article-title>,&#x201d; in <source>Proc. ACM SIGSAC CCS</source>, <fpage>308</fpage>&#x2013;<lpage>318</lpage>. <pub-id pub-id-type="doi">10.1145/2976749.2978318</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akallouch</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Akallouch</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fardousse</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bouhoute</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Berrada</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Prediction and privacy scheme for traffic flow estimation on the highway road network</article-title>. <source>Information</source> <volume>13</volume>, <fpage>381</fpage>. <pub-id pub-id-type="doi">10.3390/info13080381</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al-Huthaifi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Al-Huda</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024a</year>). <article-title>FedAGAT: real-time traffic flow prediction based on federated community and adaptive graph attention network</article-title>. <source>Inf. Sci.</source> <volume>667</volume>, <fpage>120482</fpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2024.120482</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al-Huthaifi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Al-Huda</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2024b</year>). <article-title>FedGODE: secure traffic flow prediction based on federated learning and graph ordinary differential equation networks</article-title>. <source>Knowl. Based Syst.</source> <volume>299</volume>, <fpage>112029</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2024.112029</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cleveland</surname>
<given-names>R. B.</given-names>
</name>
<name>
<surname>Cleveland</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>McRae</surname>
<given-names>J. E.</given-names>
</name>
<name>
<surname>Terpenning</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>STL: a seasonal-trend decomposition procedure based on loess</article-title>. <source>J. Off. Stat.</source> <volume>6</volume>, <fpage>3</fpage>&#x2013;<lpage>73</lpage>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Demsar</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Statistical comparisons of classifiers over multiple data sets</article-title>. <source>J. Mach. Learn. Res.</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>30</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/v7/demsar06a.html">https://www.jmlr.org/papers/v7/demsar06a.html</ext-link>.</comment>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Federated meta-learning on graph for traffic flow prediction</article-title>. <source>IEEE Trans. Veh. Technol.</source> <volume>73</volume>, <fpage>19526</fpage>&#x2013;<lpage>19538</lpage>. <pub-id pub-id-type="doi">10.1109/TVT.2024.3441759</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Torra</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Differentially private traffic flow prediction using transformers: a federated approach</article-title>. <source>Artif. Intell. Res. Dev.</source> <volume>377</volume>, <fpage>260</fpage>&#x2013;<lpage>271</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-54204-6_15</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sahu</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Talwalkar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Federated optimization in heterogeneous networks</article-title>,&#x201d; in <source>Proc. MLSys</source>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A review of traffic flow prediction methods in intelligent transportation system construction</article-title>. <source>Appl. Sci.</source> <volume>12</volume>, <fpage>13010</fpage>. <pub-id pub-id-type="doi">10.3390/app122413010</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). &#x201c;<article-title>ST-TPFL: towards spatio-temporal traffic flow prediction based on topology protected federated learning</article-title>,&#x201d; in <source>Artif. Intell. Appl.</source>, <fpage>437</fpage>&#x2013;<lpage>451</lpage>. <pub-id pub-id-type="doi">10.1007/978-981-97-7235-3_29</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chakraborty</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pei</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhen</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Multilevel federated learning-based intelligent traffic flow forecasting for transportation network management</article-title>. <source>IEEE Trans. Netw. Serv. Manag.</source> <volume>20</volume>, <fpage>1446</fpage>&#x2013;<lpage>1458</lpage>. <pub-id pub-id-type="doi">10.1109/TNSM.2023.3280515</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Syu</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J. C. W.</given-names>
</name>
<name>
<surname>Srivastava</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Yun</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Heterogeneous federated learning systems for time-series prediction with multi-head embedding mechanism</article-title>. <source>IEEE Internet Things J.</source> <volume>11</volume> (<issue>22</issue>), <fpage>36789</fpage>&#x2013;<lpage>36801</lpage>. <pub-id pub-id-type="doi">10.1109/JIOT.2024.3501906</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>McMahan</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ramage</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hampson</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Arcas</surname>
<given-names>B. A. y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Communication-efficient learning of deep networks from decentralized data</article-title>. <source>Proc. AISTATS, PMLR</source> <volume>54</volume>, <fpage>1273</fpage>&#x2013;<lpage>1282</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1602.05629</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Meese</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Masood</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y. J.</given-names>
</name>
<name>
<surname>Rathore</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Paul</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ahmad</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>BFRT: blockchained federated learning for real-time traffic flow prediction</article-title>,&#x201d; in <source>Proc. IEEE CCGrid</source>, <fpage>317</fpage>&#x2013;<lpage>326</lpage>. <pub-id pub-id-type="doi">10.1109/CCGrid54584.2022.00041</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nidhi</surname>
</name>
<name>
<surname>Grover</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Federated learning analysis for vehicular traffic flow prediction: evaluation of learning algorithms and aggregation approaches</article-title>. <source>Clust. Comput.</source> <volume>27</volume>, <fpage>5075</fpage>&#x2013;<lpage>5091</lpage>. <pub-id pub-id-type="doi">10.1007/s10586-023-04235-z</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<collab>Opacus Team</collab> (<year>2021</year>). <article-title>Opacus: user-friendly differential privacy library in PyTorch</article-title>. <source>arXiv Preprint arXiv:2109.12298</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2109.12298</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A decomposition modeling framework for seasonal time-series forecasting</article-title>. <source>arXiv Preprint arXiv:2412.12168</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2412.12168</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hossain</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Nie</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Privacy-preserving blockchain-based federated learning for traffic flow prediction</article-title>. <source>Future Gener. comput. Syst.</source> <volume>117</volume>, <fpage>328</fpage>&#x2013;<lpage>337</lpage>. <pub-id pub-id-type="doi">10.1016/j.future.2020.12.003</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>FedAGCN: a traffic flow prediction framework based on federated learning and asynchronous graph convolutional network</article-title>. <source>Appl. Soft Comput.</source> <volume>138</volume>, <fpage>110175</fpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2023.110175</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shazeer</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mirhoseini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Maziarz</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Outrageously large neural networks: the sparsely-gated mixture-of-experts layer</article-title>. <source>Proc. ICLR</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1701.06538</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2024a</year>). <article-title>A decentralized federated learning-based spatial&#x2013;temporal model for freight traffic speed forecasting</article-title>. <source>Expert Syst. Appl.</source> <volume>238</volume>, <fpage>122302</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2023.122302</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2024b</year>). <article-title>A spatial&#x2013;temporal model for network-wide flight delay prediction based on federated learning</article-title>. <source>Appl. Soft Comput.</source> <volume>154</volume>, <fpage>111380</fpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2024.111380</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Differentially private decentralized traffic flow prediction approach based on federated learning</article-title>,&#x201d; in <source>Proc. Int. Conf. Inf. Technol.: IoT smart City</source>. <pub-id pub-id-type="doi">10.1145/3582197.3582244</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tran</surname>
<given-names>N. P.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>T. D.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>H. T.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Privacy-preserving traffic flow prediction: a split learning approach</article-title>,&#x201d; in <source>Proc. ICOIN</source>, <fpage>248</fpage>&#x2013;<lpage>250</lpage>. <pub-id pub-id-type="doi">10.1109/ICOIN56518.2023.10048996</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Short-term traffic flow prediction based on graph convolutional networks and federated learning</article-title>. <source>IEEE Trans. Intell. Transp. Syst.</source> <volume>24</volume>, <fpage>1191</fpage>&#x2013;<lpage>1203</lpage>. <pub-id pub-id-type="doi">10.1109/TITS.2022.3179391</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201C;<article-title>Fed-TREND: a federated learning framework with robust trend adaptation for traffic forecasting</article-title>,&#x201D; in <conf-name>Proceedings of the 2024 International Conference on Machine Learning and Applications</conf-name> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>IEEE</publisher-name>). <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yaqub</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ahmad</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Abdul Manan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Shabir Chuhan</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Predicting traffic flow with federated learning and graph neural with asynchronous computations network</article-title>. <source>arXiv</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2401.02723</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ye</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pei</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Federated generative artificial intelligence empowered traffic flow prediction under vehicular computing power networks</article-title>. <source>IEEE Internet Things Mag.</source> <volume>7</volume>, <fpage>56</fpage>&#x2013;<lpage>61</lpage>. <pub-id pub-id-type="doi">10.1109/IOTM.001.2300259</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Traffic flow prediction based on transformer and multi-spatial-temporal encoder-decoder</article-title>,&#x201d; in <source>Proc. Int. Conf. Comput. Eng. Appl. (ICCEA)</source>, <fpage>154</fpage>&#x2013;<lpage>157</lpage>. <pub-id pub-id-type="doi">10.1109/ICCEA58433.2023.10135489</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Short-term traffic flow prediction considering weather factors based on optimized deep learning neural networks</article-title>. <source>Appl. Sci.</source> <volume>13</volume>, <fpage>132</fpage>. <pub-id pub-id-type="doi">10.3390/app13030132</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). &#x201C;<article-title>Personalized federated learning for cross-city traffic prediction</article-title>,&#x201D; in <source>Proceedings of the 33rd International Joint Conference on Artificial Intelligence (IJCAI)</source>. <fpage>5514</fpage>&#x2013;<lpage>5522</lpage>.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/970413/overview">Alireza Talebpour</ext-link>, University of Illinois at Urbana-Champaign, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2990303/overview">Kaushalya Thopate</ext-link>, Savitribai Phule Pune University, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3326905/overview">Xiuyu Shen</ext-link>, Southeast University, China</p>
</fn>
</fn-group>
</back>
</article>