<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Environ. Sci.</journal-id>
<journal-title>Frontiers in Environmental Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Environ. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-665X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1623630</article-id>
<article-id pub-id-type="doi">10.3389/fenvs.2025.1623630</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Environmental Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Multi-pollutant air quality forecasting using bidirectional attention and multi-scale temporal networks</article-title>
<alt-title alt-title-type="left-running-head">Xie et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fenvs.2025.1623630">10.3389/fenvs.2025.1623630</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Xie</surname>
<given-names>Zi-Ang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3180139/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chow</surname>
<given-names>Chee-Onn</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2186547/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chuah</surname>
<given-names>Joon Huang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1889834/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Raymond</surname>
<given-names>Wong Jee Keen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2636530/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Electrical Engineering, Universiti Malaya</institution>, <addr-line>Kuala Lumpur</addr-line>, <country>Malaysia</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Faculty of Engineering and Information Technology, Southern University College</institution>, <addr-line>Skudai</addr-line>, <country>Malaysia</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/365752/overview">Manousos-Ioannis Manousakas</ext-link>, National Centre of Scientific Research Demokritos, Greece</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3002906/overview">Prakash Rao Ragiri</ext-link>, Netaji Subhas University of Technology, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3094131/overview">Mengfan Teng</ext-link>, Jiangxi University of Science and Technology, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Chee-Onn Chow, <email>cochow@um.edu.my</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1623630</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Xie, Chow, Chuah and Raymond.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Xie, Chow, Chuah and Raymond</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Accurate multi-pollutant forecasting is vital for urban governance and public health. Existing deep models struggle to capture multi-scale temporal dynamics and synergistic cross-pollutant relations.</p>
</sec>
<sec>
<title>Methods</title>
<p>We propose an Enhanced Bidirectional Attention Multi-scale Temporal Network (EBAMTN) that combines a multi-scale TCN with linear attention, a two-layer BiLSTM augmented by multi-head self-attention, and a gated fusion layer. Under a multi-task paradigm, the backbone jointly learns shared temporal representations and outputs PM<sub>2.5</sub> and PM<sub>10</sub> via task-specific heads.</p>
</sec>
<sec>
<title>Results</title>
<p>Using hourly data from Guangzhou, Beijing, and Chengdu, EBAMTN achieved R<sup>2</sup> &#x3e; 0.94 for both pollutants while maintaining low errors (e.g., PM<sub>2.5</sub> MAE&#x2248;2.03, RMSE&#x2248;2.94; PM<sub>10</sub> MAE&#x2248;3.44, RMSE&#x2248;4.99). Confidence-interval analyses and scatter plots indicate strong trend tracking and robustness, with remaining challenges mainly at sharp peaks.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The integration of multi-scale convolutions, bidirectional memory, attention, and gated fusion improves accuracy, interpretability, and generalization. The lightweight design (&#x2248;2.1M parameters; &#x223c;13.2&#xa0;ms/sample) supports real-time and edge deployment. Overall, EBAMTN offers a scalable, interpretable solution for multi-pollutant forecasting in complex urban settings.</p>
</sec>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>multi-task learning</kwd>
<kwd>air quality forecasting</kwd>
<kwd>temporal convolutional network</kwd>
<kwd>long short-term memory</kwd>
<kwd>linear attention</kwd>
<kwd>multi-head attention</kwd>
</kwd-group>
<counts>
<page-count count="13"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Big Data, AI, and the Environment</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>In recent years, with the rapid advancement of urbanization and industrialization, air pollution has emerged as an increasingly severe public health concern on a global scale. Fine particulate matter <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, specifically <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (particles with a diameter less than 2.5&#xa0;<inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mtext>m</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>) and <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (particles with a diameter less than 10&#xa0;<inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mtext>m</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>), has been identified by the World Health Organization (WHO) as among the most hazardous air pollutants due to its small size and ability to penetrate deep into the human respiratory system (<xref ref-type="bibr" rid="B27">World Health Organization, 2021</xref>). Numerous studies have demonstrated that prolonged exposure to high concentrations of <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> significantly increases the risk of asthma, chronic obstructive pulmonary disease (COPD), cardiovascular and cerebrovascular diseases, as well as the incidence and mortality rates of lung cancer (<xref ref-type="bibr" rid="B1">Ansari and Ehrampoush, 2019</xref>; <xref ref-type="bibr" rid="B20">Lelieveld et al., 2019</xref>). Consequently, developing efficient and accurate air quality forecasting models is of substantial importance for safeguarding public health and informing environmental policymaking.</p>
<p>Early air quality forecasting approaches primarily include numerical models, statistical techniques, and traditional machine learning methods. Numerical models, akin to weather forecasting systems, divide temporal and spatial domains into grids based on atmospheric physical and chemical principles, using computer simulations to predict meteorological and pollutant data. Common models include CMAQ, CAMx, and NAQPMS (<xref ref-type="bibr" rid="B2">Appel et al., 2021</xref>; <xref ref-type="bibr" rid="B28">Pouyaei et al., 2021</xref>; <xref ref-type="bibr" rid="B23">Liu H. et al., 2021</xref>; <xref ref-type="bibr" rid="B29">Qi et al., 2022</xref>; <xref ref-type="bibr" rid="B6">Cheng et al., 2022</xref>). Statistical models generally assume linearity and stationarity, using curve fitting and parameter estimation to model air quality. Typical examples include ARMA, ARIMA, MLR, and time series regression (<xref ref-type="bibr" rid="B41">Zhou et al., 2020</xref>; <xref ref-type="bibr" rid="B22">Liu B. et al., 2021</xref>; <xref ref-type="bibr" rid="B19">Lai and Dzombak, 2020</xref>; <xref ref-type="bibr" rid="B18">Kumari and Singh, 2023</xref>; <xref ref-type="bibr" rid="B9">Gong et al., 2022</xref>). For instance, ARIMA achieves good performance in low volatility <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scenarios in Beijing (<xref ref-type="bibr" rid="B40">Zhao et al., 2022</xref>), but its linear structure limits its capacity to capture nonlinear patterns, seasonality, and external influences (<xref ref-type="bibr" rid="B4">Box et al., 2015</xref>). To address these issues, machine learning models such as support vector machines (SVM) and random forests (RF) have been applied to improve nonlinear feature learning (<xref ref-type="bibr" rid="B14">Karimian et al., 2019</xref>). However, they often require extensive feature engineering and struggle with generalization.</p>
<p>With the rapid advancement of deep learning, an increasing number of studies have applied these techniques to air quality time series forecasting. Depending on architecture, models are commonly classified into CNN-based, RNN-based, and attention-based approaches. Convolutional neural networks (CNNs) are widely used due to their strength in extracting local spatial features (<xref ref-type="bibr" rid="B35">Wang et al., 2024</xref>). As standard CNNs operate on regular grids, hybrid models are often adopted. For instance, <xref ref-type="bibr" rid="B37">Zhang and Li (2022)</xref> implemented a CNN-LSTM model for air quality prediction in Beijing. To enhance accuracy, <xref ref-type="bibr" rid="B8">Duan et al. (2023)</xref> proposed an ARIMA-BiLSTM model, which improved performance by approximately 10%. Among RNN variants, long short-term memory (LSTM) networks are the most prominent. Compared with CNNs, LSTMs are better at modeling long-term temporal dependencies and integrating with other modules. <xref ref-type="bibr" rid="B31">Seng et al. (2021)</xref> predicted <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> concentrations in Beijing 1&#x2013;3&#xa0;h ahead using an LSTM-based approach, while <xref ref-type="bibr" rid="B34">Tran et al. (2023)</xref> developed an optimized LSTM for hourly <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> forecasting in highly polluted regions of Taiwan, outperforming traditional statistical methods. <xref ref-type="bibr" rid="B12">Jin et al. (2021)</xref> proposed MTMC-NLSTM, a nested LSTM-based framework that achieved superior multivariate air quality forecasting with low training time, enabling near real-time AQI monitoring. <xref ref-type="bibr" rid="B24">Luo and Gong (2023)</xref> introduced an ARIMA-WOA-LSTM hybrid model for pollutant prediction. Additionally, GRU-based models have also been explored; for example, <xref ref-type="bibr" rid="B33">Tao et al. (2019)</xref> developed a CBGRU model combining 1D CNN with bidirectional GRU for <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> forecasting.</p>
<p>In recent years, recurrent models such as RNNs and LSTMs have achieved strong performance across a wide range of tasks. However, due to their inherently sequential structure, they encounter difficulties in parallelizing the training process. Consequently, batch processing of long sequences often leads to memory limitations. Inspired by the human visual attention mechanism, attention-based models have been proposed to address these issues (<xref ref-type="bibr" rid="B26">Niu et al., 2021</xref>). Compared with recurrent models, attention mechanisms offer greater flexibility in handling inputs of varying shapes and help mitigate the problem of unbalanced computational resource allocation. As a result, attention-based architectures have gained widespread adoption and become one of the most prominent deep learning paradigms. Zhang et al. proposed a lightweight deep learning approach based on sparse attention mechanisms within Transformer Networks (<xref ref-type="bibr" rid="B39">Zhang et al., 2023</xref>), aimed at capturing long-term dependencies and complex feature relationships from input data. Iskandaryan et al. employed graph neural networks (GNNs) to predict air quality in Madrid (<xref ref-type="bibr" rid="B11">Iskandaryan et al., 2023</xref>). Their model integrates attention mechanisms, gated recurrent units (GRUs), and graph convolutional networks (GCNs). Experimental results show that the proposed method outperforms other approaches, including Temporal Graph Convolutional Networks (TGCNs), LSTM, and GRU models. Based on these research advances, incorporating attention mechanisms into other air quality forecasting models emerges as a promising direction for improving prediction accuracy and enhancing model interpretability (<xref ref-type="bibr" rid="B25">Ma et al., 2024</xref>).</p>
<p>Although deep learning-based models have achieved considerable progress in air quality forecasting, several key challenges remain. First, many existing models focus solely on single-scale temporal features, overlooking the multi-scale nature of pollutant concentration variations. This limitation hinders the model&#x2019;s ability to jointly capture short-term fluctuations and long-term trends. Second, most models adopt a single task learning architecture, which fails to exploit the inherent correlations and synergistic relationships between multiple pollutants (e.g., <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>), thereby limiting predictive performance. Furthermore, some models suffer from overly complex structures, high computational costs, and poor interpretability, which restrict their scalability and real-world applicability. Despite significant progress in deep learning-based air quality forecasting, a critical gap remains in integrating both multi-scale temporal dynamics and multi-task pollutant prediction. Most existing approaches either focus on fine-grained temporal modeling without considering inter-pollutant relationships, or treat each pollutant as an independent task, failing to leverage the inherent synergy between them. Additionally, there is limited exploration of architectures that combine multi-resolution convolutional modules with bidirectional sequence modeling and task-shared attention mechanisms. This lack of unified multi-scale, multi-task frameworks limits the adaptability and accuracy of models in complex, real-world urban environments. To address these issues, this paper proposes a novel multi-task air quality forecasting model with the following key contributions:</p>
<list list-type="simple">
<list-item>
<p>1. A novel deep learning model named Enhanced Bidirectional Attention Multi-scale Temporal Network (EBAMTN) is introduced to capture dynamic patterns across multiple temporal scales; it integrates a multi-scale attention Temporal Convolutional Network with an enhanced bidirectional attention LSTM. By employing parallel multi-scale convolutional branches, the model effectively captures temporal features across different receptive fields, thereby improving its capability to model multi-scale dynamic patterns in air quality data.</p>
</list-item>
<list-item>
<p>2. A cross-branch attention mechanism and a temporal attention mechanism are introduced to dynamically fuse multi-scale features and enhance feature responses at critical time steps, respectively. These mechanisms improve both the expressive capacity and interpretability of the model.</p>
</list-item>
<list-item>
<p>3. A multi-task prediction framework is designed to enable the joint modeling of <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, effectively leveraging the synergistic relationship between pollutants and significantly enhancing overall prediction performance.</p>
</list-item>
</list>
<p>The remainder of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> (Materials and Methods) provides a comprehensive review of related work and introduces the structure of the proposed EBAMTN model, including detailed algorithmic components. <xref ref-type="sec" rid="s3">Section 3</xref> (Results and Analysis) presents experimental settings, performance comparisons, and visualized results across three cities. <xref ref-type="sec" rid="s4">Section 4</xref> (Conclusion) summarizes key contributions and outlines potential directions for future enhancement.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Related work</title>
<sec id="s2-1-1">
<title>2.1.1 Temporal Convolutional Network</title>
<p>The Temporal Convolutional Network (TCN) is a convolutional neural network architecture specifically designed for sequence modeling tasks (<xref ref-type="bibr" rid="B3">Bednarski et al., 2022</xref>). Unlike traditional recurrent neural networks (RNNs) and their variants such as LSTM and GRU, TCNs utilize causal and dilated convolutions to capture temporal dependencies while enabling high degrees of parallelism and ensuring stable gradient propagation. TCNs have demonstrated strong performance across various sequential tasks, including time series forecasting, speech synthesis, and natural language understanding (<xref ref-type="bibr" rid="B5">Chen et al., 2020</xref>). A complete TCN architecture consists of three main components: causal convolution, dilated convolution, and residual connections between inputs and outputs (denoted as X and Y). These components are described in detail below:<disp-formula id="e1">
<mml:math id="m17">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m18">
<mml:mrow>
<mml:mi mathvariant="normal">Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>Key formulations for the TCN components are summarized in <xref ref-type="disp-formula" rid="e1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="e7">7</xref>.</p>
<sec id="s2-1-1-1">
<title>2.1.1.1 Causal convolution</title>
<p>To ensure temporal consistency and prevent information leakage from future time steps, the TCN employs causal convolution. In this design, the output at time step <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, denoted as <inline-formula id="inf18">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, depends strictly on the inputs up to time <inline-formula id="inf19">
<mml:math id="m21">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, i.e., <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, without accessing any future values. This property is essential for predictive modeling in real-world time series scenarios.</p>
<p>In a causal one-dimensional convolution, the output <inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> at time step <inline-formula id="inf22">
<mml:math id="m24">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is computed as:<disp-formula id="e3">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the kernel size, <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> convolutional weight, and <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the corresponding input at an earlier time step. This formulation ensures that the model adheres to the causal constraint, making it suitable for time-dependent forecasting tasks.</p>
</sec>
<sec id="s2-1-1-2">
<title>2.1.1.2 Dilated convolution</title>
<p>The second component is dilated convolution, which is employed in TCNs to expand the receptive field without significantly increasing model depth or computational cost. Dilated convolution introduces a fixed interval, known as the dilation factor, between input elements, allowing the model to efficiently capture long-range temporal dependencies. When used across multiple layers with exponentially increasing dilation factors, the model can simultaneously learn both short-term fluctuations and long-term trends. To expand the receptive field in causal convolution, the dilation factor <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is introduced, and the dilated convolution is defined as:<disp-formula id="e4">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf28">
<mml:math id="m32">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the convolution kernel size, <inline-formula id="inf29">
<mml:math id="m33">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the dilation factor, <inline-formula id="inf30">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the <inline-formula id="inf31">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> convolution weight, and <inline-formula id="inf32">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the input at a dilated position. This formulation allows TCNs to model temporal dependencies over a broader range with fewer layers. When <inline-formula id="inf33">
<mml:math id="m37">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the dilated convolution becomes equivalent to a standard causal convolution. An exponentially expanding receptive field can be achieved by increasing the dilation factor exponentially across layers, for example: 1, 2, 4, 8. Under this configuration, the total receptive field of a multi-layer TCN can be calculated as:<disp-formula id="e5">
<mml:math id="m38">
<mml:mrow>
<mml:mtext>Receptive&#x2009;Field</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>where <inline-formula id="inf34">
<mml:math id="m39">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the kernel size, <inline-formula id="inf35">
<mml:math id="m40">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of layers, and <inline-formula id="inf36">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the dilation factor at the <inline-formula id="inf37">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> layer. This formulation enables efficient modeling of both local and long-range temporal dependencies while maintaining computational efficiency.</p>
</sec>
<sec id="s2-1-1-3">
<title>2.1.1.3 Residual connections</title>
<p>TCN incorporates residual connections, where each residual block consists of two dilated convolutional layers, each followed by weight normalization, ReLU activation, and dropout for regularization. These residual links are crucial for facilitating gradient flow and mitigating degradation in deep networks. When the input and output dimensions differ, a 1<inline-formula id="inf38">
<mml:math id="m43">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>1 convolution is applied to align them. Each residual block, denoted as <inline-formula id="inf39">
<mml:math id="m44">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, is defined as:<disp-formula id="e6">
<mml:math id="m45">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>Dropout</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>D</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>Dropout</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>D</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf40">
<mml:math id="m46">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the ReLU activation function. The final output of the residual block is obtained by adding the input <inline-formula id="inf41">
<mml:math id="m47">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to the block output:<disp-formula id="e7">
<mml:math id="m48">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>This residual structure helps stabilize training and enables the construction of deeper TCN models.</p>
</sec>
</sec>
<sec id="s2-1-2">
<title>2.1.2 Long short-term memory</title>
<p>Long Short-Term Memory (LSTM) networks have been widely used for sequence modeling due to their ability to capture long-range temporal dependencies. In this study, we adopt LSTM as one of the baseline models. Its structure and mathematical formulation can be found in prior works (<xref ref-type="bibr" rid="B10">Hochreiter and Schmidhuber, 1997</xref>). The detailed description is omitted here for brevity, as our focus lies in the proposed architectures.</p>
</sec>
</sec>
<sec id="s2-2">
<title>2.2 Method</title>
<sec id="s2-2-1">
<title>2.2.1 Problem formulation</title>
<p>The air quality forecasting task is formalized as a multi-task series prediction problem. Let the historical input sequence be <inline-formula id="inf42">
<mml:math id="m49">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where T denotes the number of time steps and d represents the feature dimension. The objective of modeling is to simultaneously predict the concentration levels of <inline-formula id="inf43">
<mml:math id="m50">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf44">
<mml:math id="m51">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> at each time step t, represented as <inline-formula id="inf45">
<mml:math id="m52">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>PM2.5</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>PM10</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The proposed multi-task learning framework not only captures the potential interdependence between different pollutants but also improves the generalization ability of the model by leveraging shared representations. Prior studies have demonstrated a strong physicochemical correlation between <inline-formula id="inf46">
<mml:math id="m53">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf47">
<mml:math id="m54">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and this correlation can be effectively exploited through the feature-sharing mechanism to enhance prediction accuracy. To formally represent the multi-task prediction process, we denote the predictive function as follows:<disp-formula id="e8">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf48">
<mml:math id="m56">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is the forecasting model with learnable parameters <inline-formula id="inf49">
<mml:math id="m57">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf50">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a window of past <inline-formula id="inf51">
<mml:math id="m59">
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> time steps. The model outputs the predicted values for <inline-formula id="inf52">
<mml:math id="m60">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf53">
<mml:math id="m61">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> simultaneously at each time step <inline-formula id="inf54">
<mml:math id="m62">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Enhanced bidirectional attention multi-scale temporal network (EBAMTN)</title>
<p>To effectively model the complex temporal evolution of air pollutant concentrations, this paper proposes a multi-module synergistic deep hybrid architecture. The overall architecture is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref> and comprises four key sub-modules: 1) a Multi-Scale Temporal Convolution Module with Linear Attention, 2) an Enhanced Bidirectional LSTM with Multi-Head Attention, 3) a Feature Fusion Module with Gating, 4) Multi-Task Output Heads. This integrated design enables the model to capture both short-term fluctuations and long-term trends in air quality data. Moreover, it demonstrates strong generalization capability and supports simultaneous multi-target forecasting.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Architecture of the EBAMTN model.</p>
</caption>
<graphic xlink:href="fenvs-13-1623630-g001.tif">
<alt-text content-type="machine-generated">Flowchart of a forecasting model incorporating PM2.5 and PM10 inputs. It includes modules: Multi-scale Temporal Convolutional Network with Attention, Linear Attention, Residual Block, Bi-directional LSTM with Attention, Multi-Head Attention, and Adaptive Gating Layer, leading to feature fusion. The output provides PM2.5 and PM10 forecasting results.</alt-text>
</graphic>
</fig>
<sec id="s2-2-2-1">
<title>2.2.2.1 Multi-scale TCN with attention</title>
<p>Air quality data are inherently nonlinear and non-stationary, often exhibiting multi-frequency and multi-periodic temporal patterns. These patterns arise from a variety of real-world factors, such as morning and evening traffic congestion, diurnal temperature fluctuations, seasonal monsoon cycles, and changes in human mobility during holidays (<xref ref-type="bibr" rid="B38">Zhang and Zhang, 2023</xref>). Such multi-scale temporal variations are reflected not only in short-term abrupt changes but also in long-term evolving trends. Therefore, developing a temporal modeling structure that can simultaneously perceive short-term fluctuations and long-term dependencies is essential for achieving high-accuracy air quality forecasting. To this end, we propose a Multi-Scale Temporal Convolutional Network (Multi-Scale TCN) module that integrates three key components: (1) parallel dilated convolution branches, (2) a lightweight channel-wise attention-based fusion mechanism, and (3) a stacked dilated convolutional structure with skip connections. This design enables the model to effectively capture air quality dynamics at multiple temporal resolutions.</p>
<p>First, the preprocessed input features are fed into three parallel Dilated Causal Convolutional branches, each using a different kernel size (3, 5, and 7) with fixed dilation. These branches are designed to capture temporal dependencies at local, intermediate, and broader scales, respectively. Through parallel multi-scale modeling, the network can simultaneously detect fine-grained variations and overarching temporal trends. Next, to enhance the flexibility and adaptiveness of multi-scale feature integration, a channel-wise attention fusion module is introduced. This mechanism applies global average pooling to the output of each convolutional branch to generate scale-specific descriptor vectors, followed by a linear attention mechanism to compute the importance weights for each scale. This dynamic weighting allows the model to emphasize informative branches and achieve adaptive scale-aware feature fusion. The resulting fused representation exhibits both strong temporal perception and scale discrimination capabilities. Finally, to extract deeper hierarchical temporal features, the fused output is passed through a stack of causal convolution layers with exponentially increasing dilation factors (e.g., d &#x3d; 1, 2, 4, 8, &#x2026;). Each layer incorporates skip connections to enhance feature propagation and stabilize gradient flow. The outputs from all skip connections are aggregated to produce the final representation of the multi-scale convolution module.</p>
<p>In summary, the proposed module demonstrates strong capabilities in temporal feature extraction and dynamic fusion. By leveraging the dilation mechanism to effectively expand the receptive field, the model significantly improves its performance and generalization in multi-scale air quality modeling tasks. Let the input tensor be <inline-formula id="inf55">
<mml:math id="m63">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
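<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>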
</mml:mrow>
</mml:math>
</inline-formula>, where B is the batch size, <inline-formula id="inf56">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of input features, and T is the temporal length. The input is processed by three parallel 1D dilated causal convolutions with different kernel sizes (3, 5, 7), producing outputs:<disp-formula id="e9">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>D</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>To fuse multi-scale features, we first apply global average pooling over the temporal dimension to obtain descriptor vectors:<disp-formula id="e10">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>:</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>:</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>An attention mechanism then computes scale-aware weights:<disp-formula id="e11">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where <italic>w</italic> is a learnable weight vector. The final fused output is the weighted sum of branch outputs:<disp-formula id="e12">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>multi</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>To capture deeper temporal dependencies, the fused representation is passed through a stack of dilated convolution layers with exponentially increasing dilation factors <inline-formula id="inf57">
<mml:math id="m69">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Each layer performs:<disp-formula id="e13">
<mml:math id="m70">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>ReLU</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>BN</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>D</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>followed by a skip connection:<disp-formula id="e14">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>skip</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>D</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>The final output of the module aggregates all skip outputs:<disp-formula id="e15">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>TCN</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mtext>skip</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>The multi-scale attention TCN is formally defined in <xref ref-type="disp-formula" rid="e9">Equations 9</xref>&#x2013;<xref ref-type="disp-formula" rid="e15">15</xref>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>Multi-scale TCN with Attention.<list list-type="simple">
<list-item>
<p>1:&#x2003;<bold>for</bold> each kernel size k in [3,5,7]: <bold>do</bold></p>
</list-item>
<list-item>
<p>2:&#x2003;branch_output[k] &#x3d; Conv1d(input, kernel_size &#x3d; k)</p>
</list-item>
<list-item>
<p>3:&#x2003;attention_weight &#x3d; Softmax(Linear(GlobalAvg(branch_output)))</p>
</list-item>
<list-item>
<p>4:&#x2003;multi_scale_output &#x3d; <inline-formula id="inf58">
<mml:math id="m73">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> attention_weight[i] &#x2a; branch_output[i]</p>
</list-item>
<list-item>
<p>5:&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>6:&#x2003;<bold>for</bold> each layer i in TCN_layers: <bold>do</bold>
</p>
</list-item>
<list-item>
<p>7:&#x2003;output_i &#x3d; Conv1d &#x2b; BN &#x2b; ReLU &#x2b; Dropout</p>
</list-item>
<list-item>
<p>8:&#x2003;skip_i &#x3d; Conv1d(output_i, kernel &#x3d; 1)</p>
</list-item>
<list-item>
<p>9:&#x2003;skip_list.append(skip_i)</p>
</list-item>
<list-item>
<p>10:&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>11:&#x2003;TCN_output &#x3d; <inline-formula id="inf59">
<mml:math id="m74">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> skip_list</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>The overall procedure is summarized in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>.</p>
</sec>
<sec id="s2-2-2-2">
<title>2.2.2.2 Bi-LSTM with attention</title>
<p>The concentration sequences of air pollutants exhibit pronounced temporal dependencies, particularly under complex meteorological conditions such as cross-day lag and persistent high-pressure accumulation (<xref ref-type="bibr" rid="B42">Ziernicka-Wojtaszek et al., 2024</xref>). Traditional unidirectional recurrent models often fail to comprehensively capture the bidirectional flow of information in time series. To address this limitation, we incorporate a two-layer bidirectional Long Short-Term Memory (BiLSTM) network into the model, with 64 hidden units per direction. This structure is capable of modeling both forward and backward temporal dependencies, thereby facilitating the learning of pollutant accumulation, propagation, and feedback mechanisms over time. As a result, it significantly enhances the model&#x2019;s ability to capture the evolving trends in air pollution dynamics. To further strengthen the model&#x2019;s capacity to identify critical temporal segments, especially in cases of sudden pollution bursts, non-stationary fluctuations, or structural regime shifts (<xref ref-type="bibr" rid="B7">Dong et al., 2024</xref>), we introduce a multi-head self-attention mechanism following the BiLSTM outputs. This mechanism computes relevance scores between time steps using a Query&#x2013;Key&#x2013;Value structure and learns multiple types of dependencies in parallel subspaces. Conceptually, it constructs a soft &#x201c;global memory&#x201d; over the sequence, allowing the model to dynamically focus on salient moments and better capture non-local interactions within the temporal context.</p>
<p>However, LSTM and attention modules produce feature representations of different nature (<xref ref-type="bibr" rid="B15">Khan and Hossni, 2025</xref>). Simply concatenating or summing their outputs may result in redundancy, representational conflict, or even degradation in generalization. To alleviate such issues, we further introduce a gating mechanism to adaptively fuse the outputs from the LSTM and attention layers. This mechanism employs a learnable gate to generate dynamic weights based on the joint input, thereby regulating the flow and contribution of each representation and ensuring a more coherent integration. Formally, let the input sequence to the module be:<disp-formula id="e16">
<mml:math id="m75">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>where B is the batch size, T is the number of time steps, and D is the input feature dimension. The sequence is first passed through a two-layer BiLSTM, producing forward and backward hidden states concatenated as:<disp-formula id="e17">
<mml:math id="m76">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>This output is then used as Query, Key, and Value in the multi-head self-attention mechanism, defined as:<disp-formula id="e18">
<mml:math id="m77">
<mml:mrow>
<mml:mtext>Attention</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>softmax</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
</p>
<p>The resulting attention-enhanced representation is <inline-formula id="inf60">
<mml:math id="m78">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. To integrate both representations, a gating mechanism is applied:<disp-formula id="e19">
<mml:math id="m79">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
<disp-formula id="e20">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>gated</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>where <inline-formula id="inf61">
<mml:math id="m81">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the sigmoid activation function, <inline-formula id="inf62">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf63">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are learnable parameters, and <inline-formula id="inf64">
<mml:math id="m84">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents element-wise multiplication. This gating strategy enables the model to dynamically select the most reliable information source at each time step, thereby improving the stability and discriminative power of the learned temporal features.</p>
<p>The BiLSTM-attention module and gating are given in <xref ref-type="disp-formula" rid="e16">Equations 16</xref>&#x2013;<xref ref-type="disp-formula" rid="e20">20</xref>.</p>
<p>Finally, the fused representation <inline-formula id="inf65">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>gated</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is passed through a fully connected projection layer to produce a unified hidden representation, which is subsequently fed into the downstream fusion and multi-task prediction modules. The specific pseudo-code is as follows:</p>
<p>
<statement content-type="algorithm" id="Algorithm_2">
<label>Algorithm 2</label>
<p>Bi-LSTM with Attention.<list list-type="simple">
<list-item>
<p>1:&#x2003;&#x23; <bold>Input:</bold> X_lstm <inline-formula id="inf66">
<mml:math id="m86">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula></p>
</list-item>
<list-item>
<p>2:&#x2003;&#x23; B: Batch size, T: Time steps, D: Feature dimension</p>
</list-item>
<list-item>
<p>3:&#x2003;&#x23; <bold>Step 1: Bidirectional LSTM</bold>
</p>
</list-item>
<list-item>
<p>4:&#x2003;H_fwd, H_bwd &#x3d; LSTM_forward(X_lstm)</p>
</list-item>
<list-item>
<p>5:&#x2003;H &#x3d; concat(H_fwd, H_bwd) &#x23; H <inline-formula id="inf69">
<mml:math id="m89">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula></p>
</list-item>
<list-item>
<p>6:&#x2003;&#x23; <bold>Step 2: Multi-head self-attention</bold>
</p>
</list-item>
<list-item>
<p>7:&#x2003;Q &#x3d; K &#x3d; V &#x3d; H</p>
</list-item>
<list-item>
<p>8:&#x2003;A &#x3d; MultiHeadAttention(Q, K, V) &#x23; A <inline-formula id="inf72">
<mml:math id="m92">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula></p>
</list-item>
<list-item>
<p>9:&#x2003;&#x23; <bold>Step 3: Gating mechanism</bold>
</p>
</list-item>
<list-item>
<p>10:&#x2003;G &#x3d; sigmoid(Linear(concat(H, A))) &#x23; G <inline-formula id="inf75">
<mml:math id="m95">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula></p>
</list-item>
<list-item>
<p>11:&#x2003;H_gated &#x3d; G <inline-formula id="inf78">
<mml:math id="m98">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> H &#x2b; (1 - G) <inline-formula id="inf79">
<mml:math id="m99">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> A &#x23; Element-wise fusion</p>
</list-item>
<list-item>
<p>12:&#x2003;&#x23; <bold>Step 4: Output projection</bold>
</p>
</list-item>
<list-item>
<p>13:&#x2003;Output &#x3d; Linear(H_gated) &#x23; Project to desired hidden dimension</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>The steps of the BiLSTM-attention module are provided in <xref ref-type="statement" rid="Algorithm_2">Algorithm 2</xref>.</p>
</sec>
<sec id="s2-2-2-3">
<title>2.2.2.3 Fusion and prediction</title>
<p>Following the TCN and BiLSTM modules, the model concatenates the two output representations along the last dimension and applies a gated fusion network to dynamically integrate temporal and contextual information. This fusion module adopts a fully connected layer followed by ReLU activation and dropout, enabling nonlinear feature transformation while suppressing redundant information. The fused representation from the TCN and BiLSTM modules is computed as:<disp-formula id="e21">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>fusion</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>ReLU</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>tcn</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>where <inline-formula id="inf80">
<mml:math id="m101">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>tcn</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the output from the TCN module, and <inline-formula id="inf81">
<mml:math id="m102">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the gated BiLSTM-attention output. The attention weights over the temporal dimension are computed as:<disp-formula id="e22">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>fusion</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
</p>
<p>The time-aware representation is obtained by element-wise multiplication:<disp-formula id="e23">
<mml:math id="m104">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>weighted</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>fusion</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>
</p>
<p>At the final stage, two parallel output heads are employed to predict <inline-formula id="inf82">
<mml:math id="m105">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf83">
<mml:math id="m106">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> concentrations, respectively. Each head is implemented as a two-layer MLP, where the hidden dimension is reduced before generating one-step predictions (which can be extended to multi-step forecasting). This dual-head structure facilitates shared temporal representation learning while maintaining task-specific output variability. For each task <inline-formula id="inf84">
<mml:math id="m107">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
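<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>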
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, the prediction is computed as:<disp-formula id="e24">
<mml:math id="m108">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>ReLU</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>weighted</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>where <inline-formula id="inf85">
<mml:math id="m109">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>weighted</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the fused feature at the final time step. The overall training objective is defined as a weighted sum of mean squared errors for both prediction tasks:<disp-formula id="e25">
<mml:math id="m110">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>total</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>PM</mml:mtext>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>PM</mml:mtext>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(25)</label>
</disp-formula>The fusion, temporal weighting, task heads and loss follow <xref ref-type="disp-formula" rid="e21">Equations 21</xref>&#x2013;<xref ref-type="disp-formula" rid="e25">25</xref>, where <inline-formula id="inf86">
<mml:math id="m111">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is a hyperparameter that balances the learning priorities of the two tasks. The specific pseudo-code is as follows:</p>
<p>
<statement content-type="algorithm" id="Algorithm_3">
<label>Algorithm 3</label>
<p>Fusion and Multi-Task Output Module.<list list-type="simple">
<list-item>
<p>1:&#x2003;&#x23; <bold>Inputs:</bold>
</p>
</list-item>
<list-item>
<p>2:&#x2003;&#x23; F_tcn <inline-formula id="inf87">
<mml:math id="m112">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>C1</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf90">
<mml:math id="m115">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> from TCN module</p>
</list-item>
<list-item>
<p>3:&#x2003;&#x23; H_lstm <inline-formula id="inf91">
<mml:math id="m116">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>C2</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf94">
<mml:math id="m119">
<mml:mrow>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> from BiLSTM &#x2b; Attention module</p>
</list-item>
<list-item>
<p>4:&#x2003;&#x23; B: Batch size, T: time steps, C1/C2: channel dimensions</p>
</list-item>
<list-item>
<p>5:&#x2003;&#x23; <bold>Step 1: Feature concatenation and nonlinear gated fusion</bold>
</p>
</list-item>
<list-item>
<p>6:&#x2003;H_concat &#x3d; concat(F_tcn, H_lstm, dim &#x3d; &#x2212;1) &#x23; [B, T, C1 &#x2b; C2]</p>
</list-item>
<list-item>
<p>7:&#x2003;H_fusion &#x3d; ReLU(Linear(H_concat)) &#x23; [B, T, C_fused]</p>
</list-item>
<list-item>
<p>8:&#x2003;H_fusion &#x3d; Dropout(H_fusion)</p>
</list-item>
<list-item>
<p>9:&#x2003;&#x23; <bold>Step 2: Temporal Attention Mechanism (TAM)</bold>
</p>
</list-item>
<list-item>
<p>10:&#x2003;w_t &#x3d; Sigmoid(Linear(H_fusion)) &#x23; [B, T, 1]</p>
</list-item>
<list-item>
<p>11:&#x2003;H_weighted &#x3d; H_fusion &#x2a; w_t &#x23; Element-wise weight across time</p>
</list-item>
<list-item>
<p>12:&#x2003;&#x23; <bold>Step 3: Extract final time step representation</bold>
</p>
</list-item>
<list-item>
<p>13:&#x2003;H_final &#x3d; H_weighted[:, &#x2212;1, :] &#x23; [B, C_fused]</p>
</list-item>
<list-item>
<p>14:&#x2003;&#x23; <bold>Step 4: Task-specific MLP heads for PM2.5 and PM10</bold>
</p>
</list-item>
<list-item>
<p>15:&#x2003;y_pm25 &#x3d; Linear2(ReLU(Linear1(H_final))) &#x23; [B, 1]</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>The rationale behind the architectural design of EBAMTN is further summarized below, emphasizing its effectiveness and explainability: The design of the EBAMTN architecture is motivated by the need to effectively model both fine-grained temporal dynamics and inter-pollutant interactions in real-world air quality forecasting scenarios. The use of parallel multi-scale convolutional branches enables the model to simultaneously capture short-term fluctuations and long-term periodic trends. The bidirectional LSTM component models sequential dependencies from both past and future directions, while the multi-head attention mechanism selectively focuses on informative time steps, improving interpretability. Moreover, the gated fusion mechanism adaptively balances contextual information from different modules, preventing feature redundancy and enhancing robustness. By jointly modeling <inline-formula id="inf95">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>PM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf96">
<mml:math id="m121">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>PM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in a multi-task setting, the framework leverages inherent pollutant correlations, leading to improved generalization. These design choices collectively contribute to the model&#x2019;s superior predictive performance, while maintaining interpretability and scalability for deployment.</p>
<p>Fusion and multi-task output are detailed in <xref ref-type="statement" rid="Algorithm_3">Algorithm 3</xref>.</p>
</sec>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Experiments</title>
<sec id="s2-3-1">
<title>2.3.1 Dataset and preprocessing</title>
<p>In this study, air quality monitoring data from three cities in China (Guangzhou, Chengdu, and Beijing) are used to validate the effectiveness of the proposed model. For each city, data from a single central monitoring site were used to ensure consistency and avoid spatial heterogeneity. The dataset contains hourly observations of two key pollutants, <inline-formula id="inf97">
<mml:math id="m122">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf98">
<mml:math id="m123">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The preprocessing procedure includes three main steps: temporal alignment, feature normalization, and supervised sequence construction. First, the raw data were sorted by timestamp (year-month-day-hour), and records with missing values were removed to ensure temporal continuity and data integrity. Second, the concentration values of <inline-formula id="inf99">
<mml:math id="m124">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf100">
<mml:math id="m125">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> were independently normalized to the [0, 1] range using the MinMaxScaler method, which improves gradient stability and convergence efficiency during training. Finally, supervised learning samples were generated using a sliding window strategy, where the past 168 consecutive hours (i.e., 1&#xa0;week) of pollutant concentrations are used to predict the concentration in the next hour. The dataset used in this study is divided into training, validation, and test sets in a ratio of 70:15:15, resulting in approximately 25,000 sample sequences for training and 5,400 sequences each for validation and testing. To enhance the robustness and generalization ability of the model, Gaussian noise with a noise factor of 0.05 is added to the input data during training. Data loading and mini-batch processing are implemented using PyTorch&#x2019;s DataLoader, with the batch size set to 128 to strike a balance between computational efficiency and training stability. Experimental results demonstrate that this preprocessing strategy significantly improves the model&#x2019;s predictive performance, reducing the average prediction error on the validation set by approximately 10%. To provide a clear view of the input features, <xref ref-type="table" rid="T1">Table 1</xref> lists all predictor variables used in this study. Each variable consists of the past 168 hourly observations (i.e., 1&#xa0;week of data).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>List of predictor variables used in the model.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Variable</th>
<th align="left">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<inline-formula id="inf101">
<mml:math id="m126">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>168</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Historical values of fine particulate matter (last 168&#xa0;h)</td>
</tr>
<tr>
<td align="left">
<inline-formula id="inf102">
<mml:math id="m127">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>168</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="left">Historical values of coarse particulate matter (last 168&#xa0;h)</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Implementation details</title>
<p>To ensure the reproducibility of all experiments, a fixed random seed (seed &#x3d; 42) was used for data partitioning and model initialization. All experiments were conducted on a single workstation equipped with an NVIDIA GeForce RTX 3090 Laptop GPU, an 11th Gen Intel(R) Core (TM) i7-11800H CPU, 8&#xa0;GB of dedicated GPU memory, and 16&#xa0;GB of system RAM. The model was implemented using the PyTorch 1.9.0 deep learning framework. Model performance was comprehensively evaluated using three standard metrics: Mean Squared Error (MSE), Mean Absolute Error (MAE), and the Coefficient of Determination (R<sup>2</sup>). The model was trained using a mini-batch size of 128, and parameters were updated using the Adam optimizer, with an initial learning rate of 0.001 and a weight decay coefficient of 0.0001. A cosine annealing learning rate scheduler was adopted with a cycle length of 100 epochs to improve convergence. Additionally, gradient clipping with a threshold of 1.0 was applied to prevent gradient explosion. An early stopping strategy was used to prevent overfitting, whereby training was terminated if the validation loss did not improve for 10 consecutive epochs. To enhance model robustness, Gaussian noise with a noise factor of 0.05 was added to the input data during training.</p>
<p>The proposed model adopts a novel hybrid architecture that integrates multi-scale Temporal Convolutional Networks (TCN) and an enhanced Bidirectional LSTM. The multi-scale TCN module contains three parallel convolutional branches with kernel sizes of 3, 5, and 7, and corresponding output channel sizes of 32, 64, and 128, respectively. Each branch is followed by batch normalization, a ReLU activation function, and a dropout layer with a dropout rate of 0.1. The outputs of these branches are dynamically weighted and fused using a lightweight channel-wise attention mechanism, implemented via a linear transformation followed by a softmax function. The enhanced LSTM module employs a two-layer bidirectional LSTM with a hidden size of 64 and integrates a multi-head self-attention mechanism to strengthen the model&#x2019;s capacity for capturing long-range temporal dependencies. To effectively merge the outputs of the TCN and LSTM modules, a gated fusion mechanism is adopted. This mechanism uses a sigmoid-activated gating network to compute the importance of each representation and leverages residual connections to stabilize gradient propagation and mitigate vanishing gradients. At the output stage, the model adopts a multi-task prediction structure, where <inline-formula id="inf103">
<mml:math id="m128">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf104">
<mml:math id="m129">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> concentrations are predicted through two separate MLP heads. These heads share the same feature extraction backbone but operate independently in prediction, and their learning objectives are balanced using a dynamic task weighting strategy <inline-formula id="inf105">
<mml:math id="m130">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The training of the EBAMTN model was conducted on a single NVIDIA RTX 3090 GPU. As detailed in <xref ref-type="table" rid="T2">Table 2</xref>, the total training time for 100 epochs was approximately 2.4&#xa0;h, with an average of 1.45&#xa0;min per epoch on the combined multi-city dataset. The final model contains approximately 2.1 million trainable parameters. During inference, the model achieves an average forward pass time of 13.2 milliseconds per instance (batch size &#x3d; 1), making it suitable for real-time deployment. Due to its modular and lightweight design, the model can be efficiently quantized and deployed on edge devices such as NVIDIA Jetson or high-performance ARM-based systems with limited computational resources. In scenarios where on-device training is not feasible, the model can be pre-trained centrally and optimized for inference using techniques such as model pruning, weight quantization, or TensorRT acceleration. These approaches can significantly reduce memory and computational requirements, making real-time edge deployment viable.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Computational efficiency and deployment feasibility of EBAMTN.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Metric</th>
<th align="left">Value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Total training time (100 epochs)</td>
<td align="left">2.4&#xa0;h</td>
</tr>
<tr>
<td align="left">Avg. time per epoch</td>
<td align="left">1.45&#xa0;min</td>
</tr>
<tr>
<td align="left">Number of trainable parameters</td>
<td align="left">2.1 million</td>
</tr>
<tr>
<td align="left">Inference time per sample</td>
<td align="left">13.2&#xa0;ms</td>
</tr>
<tr>
<td align="left">Edge deployability</td>
<td align="left">Supported (e.g., Jetson Nano, ARM SoCs)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Through a combination of multi-scale feature extraction, attention-enhanced sequence modeling, and adaptive feature fusion, the proposed model achieves significantly improved prediction accuracy while maintaining computational efficiency. Detailed quantitative results and comparisons with baseline methods are presented in the following section.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<title>3 Results and Analysis</title>
<sec id="s3-1">
<title>3.1 Results performance</title>
<p>From <xref ref-type="table" rid="T3">Table 3</xref>, it can be concluded that for <inline-formula id="inf106">
<mml:math id="m131">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the EBAMTN model achieves an MAE of 2.0303, an RMSE of 2.9470, and an R<sup>2</sup> of 0.9461 on the Guangzhou dataset, indicating high prediction accuracy and effective control over prediction errors. The R<sup>2</sup> value approaching 0.95 suggests that the model can explain more than 94% of the variance in <inline-formula id="inf107">
<mml:math id="m132">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> concentrations, reflecting its strong fitting capability and stable prediction performance for fine particulate matter. In the case of <inline-formula id="inf108">
<mml:math id="m133">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> prediction, although the error metrics are slightly higher (MAE &#x3d; 3.4484, RMSE &#x3d; 4.9916), the R<sup>2</sup> remains high at 0.9440, demonstrating that the EBAMTN model maintains robust temporal modeling capabilities, even in scenarios characterized by greater volatility and fluctuation in coarse particulate matter concentrations. The similarity of R<sup>2</sup> values for <inline-formula id="inf109">
<mml:math id="m134">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf110">
<mml:math id="m135">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> further highlights the model&#x2019;s cross-pollutant generalization ability, confirming its suitability for multi-pollutant synergistic forecasting tasks.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Prediction performance of the EBAMTN model for <inline-formula id="inf111">
<mml:math id="m136">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf112">
<mml:math id="m137">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="3" align="center">
<inline-formula id="inf113">
<mml:math id="m138">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="3" align="center">
<inline-formula id="inf114">
<mml:math id="m139">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">
<inline-formula id="inf115">
<mml:math id="m140">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">
<inline-formula id="inf116">
<mml:math id="m141">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">EBAMTN</td>
<td align="center">2.03</td>
<td align="center">2.94</td>
<td align="center">0.94</td>
<td align="center">3.44</td>
<td align="center">4.99</td>
<td align="center">0.94</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F2">Figure 2</xref> illustrates the effectiveness of the proposed model in long-term time-series forecasting of <inline-formula id="inf117">
<mml:math id="m142">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>PM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf118">
<mml:math id="m143">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>PM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> concentrations, while also providing a quantitative assessment of prediction uncertainty through the incorporation of confidence intervals. The upper panel presents the prediction results for <inline-formula id="inf119">
<mml:math id="m144">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the lower panel corresponds to <inline-formula id="inf120">
<mml:math id="m145">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. In the <inline-formula id="inf121">
<mml:math id="m146">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> prediction, the red dashed line (representing predicted values) closely follows the blue solid line (true values), demonstrating the model&#x2019;s strong capacity to capture both long-term trends and short-term fluctuations. The shaded regions representing confidence intervals remain relatively narrow across most of the time horizon and only expand slightly during periods of abrupt pollution changes. This indicates that the model not only delivers accurate point forecasts but also maintains high confidence and robustness in its probabilistic predictions. Similarly, for <inline-formula id="inf122">
<mml:math id="m147">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the predicted trend aligns well with the observed values. Although the confidence intervals become wider during moments of sudden pollution variation, the predicted values consistently fall within reasonable bounds. This highlights the model&#x2019;s strong generalization capability and temporal stability in modeling pollutants with different variability scales. The stable and consistent performance across both <inline-formula id="inf123">
<mml:math id="m148">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf124">
<mml:math id="m149">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> predictions further confirms the effectiveness of the proposed multi-task model architecture, demonstrating its ability to jointly learn and generalize across multiple pollutant forecasting tasks.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Prediction results for <inline-formula id="inf125">
<mml:math id="m150">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf126">
<mml:math id="m151">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> using the EBAMTN model.</p>
</caption>
<graphic xlink:href="fenvs-13-1623630-g002.tif">
<alt-text content-type="machine-generated">Two graphs showing PM2.5 and PM10 prediction results with confidence intervals. The top graph depicts PM2.5 concentrations with true values in blue, predicted values in red, and a 95% confidence interval shaded in pink. The bottom graph represents PM10 concentrations with true values in green, predicted values in red, and a 95% confidence interval shaded in pink. Both graphs display fluctuations over time steps along the x-axis.</alt-text>
</graphic>
</fig>
<p>
<xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the predicted concentrations of <inline-formula id="inf127">
<mml:math id="m152">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf128">
<mml:math id="m153">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> over the final 100&#xa0;h of the test set, compared against the true observed values. The upper subplot presents the <inline-formula id="inf129">
<mml:math id="m154">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> results, showing that the model effectively captures the overall temporal trend and maintains a high degree of consistency with actual fluctuations. Nevertheless, during periods of abrupt changes in concentration, the predicted values exhibit slight overestimation or temporal lag, suggesting that the model&#x2019;s responsiveness to short-term rapid variations still has room for improvement. In the lower subplot for <inline-formula id="inf130">
<mml:math id="m155">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the model similarly captures the general trend; however, a noticeable and systematic overestimation occurs during pollution peaks. This bias may arise from the model&#x2019;s limited capacity to model dispersion dynamics or sensitivity to input features under high-pollution regimes. Despite this, during more stable periods with moderate pollution levels, the predictions align well with the true observations, demonstrating strong performance under relatively steady conditions.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Comparison of predicted and actual values for <inline-formula id="inf131">
<mml:math id="m156">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf132">
<mml:math id="m157">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> over the last 100&#xa0;h.</p>
</caption>
<graphic xlink:href="fenvs-13-1623630-g003.tif">
<alt-text content-type="machine-generated">Two line graphs compare PM2.5 and PM10 predictions with ground truth over the last 100 hours. The first graph shows true PM2.5 in blue and predicted PM2.5 in red, indicating maximum error at 54 hours and minimum error at 77 hours. The second graph displays true PM10 in green and predicted PM10 in red, also noting the largest error at 54 hours and the smallest at 77 hours.</alt-text>
</graphic>
</fig>
<p>Overall, the model shows promising results in modeling the temporal dynamics and variability of both <inline-formula id="inf133">
<mml:math id="m158">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf134">
<mml:math id="m159">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. However, enhancing accuracy at extreme fluctuation points remains an important area for further improvement.</p>
<p>The scatter plots provided (as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>) illustrate the regression analysis comparing the observed and predicted values of <inline-formula id="inf135">
<mml:math id="m160">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf136">
<mml:math id="m161">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, effectively visualizing the predictive performance of the proposed model. In the <inline-formula id="inf137">
<mml:math id="m162">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> plot (left panel), most data points are closely clustered around the regression line, indicating a strong linear correlation and demonstrating that the model effectively captures the overall trend of pollutant concentrations. However, a noticeable dispersion is observed in the high concentration region, suggesting that the model&#x2019;s prediction accuracy declines under extreme pollution conditions. In the <inline-formula id="inf138">
<mml:math id="m163">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> plot (right panel), a similar strong linear trend is observed, with most data points distributed tightly along the regression line, confirming the robustness and reliability of the model under typical conditions. Nonetheless, the spread of data points also increases at higher concentration values, reflecting a potential limitation of the model in predicting outliers or peak pollution levels.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Regression plots of true vs. predicted concentrations for <inline-formula id="inf139">
<mml:math id="m164">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf140">
<mml:math id="m165">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic xlink:href="fenvs-13-1623630-g004.tif">
<alt-text content-type="machine-generated">Scatter plots compare true versus predicted values for PM2.5 and PM10 concentrations, with data points in blue and a red regression line. The PM2.5 plot is on the left, showing values up to 80 micrograms per cubic meter. The PM10 plot, on the right, shows values up to 160 micrograms per cubic meter, both indicating a positive correlation.</alt-text>
</graphic>
</fig>
<p>Overall, the regression analysis confirms the model&#x2019;s strong predictive capability under normal pollution levels, while also highlighting areas for potential improvement under high-pollution scenarios. These limitations could be addressed through targeted model enhancements such as rebalancing the training data, introducing adaptive loss functions, or applying data augmentation strategies specifically designed to emphasize extreme value learning.</p>
</sec>
<sec id="s3-2">
<title>3.2 Comparison study</title>
<p>Based on the comparison table provided, the prediction performance of various models for air quality forecasting is comprehensively analyzed as shown in <xref ref-type="table" rid="T4">Table 4</xref>. Traditional machine learning models such as Random Forest (RF) and Support Vector Regression (SVR) exhibit relatively poor performance, with R<sup>2</sup> values for both <inline-formula id="inf141">
<mml:math id="m166">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf142">
<mml:math id="m167">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> falling below 0.7. This indicates their limited capacity to capture complex temporal dependencies, which are essential for accurate air quality prediction. In contrast, deep learning models such as LSTM and TCN show significant improvements. Their R<sup>2</sup> scores increase to the range of 0.69&#x2013;0.77, highlighting the advantages of neural networks in modeling sequential patterns. However, both models still exhibit limitations in prediction accuracy and stability when used individually. The TCN-LSTM hybrid model, which integrates convolutional and recurrent architectures, achieves better performance for <inline-formula id="inf143">
<mml:math id="m168">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> prediction (R<sup>2</sup> &#x3d; 0.88). Nevertheless, its performance on <inline-formula id="inf144">
<mml:math id="m169">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> deteriorates significantly (MAE &#x3d; 15.75, RMSE &#x3d; 26.59), suggesting that the model lacks robustness and generalization across pollutant types. The Informer model further enhances prediction performance, achieving R<sup>2</sup> values exceeding 0.9 for both pollutants, along with improved stability. This confirms the effectiveness of Transformer-based architectures in long-sequence forecasting tasks.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Performance comparison of different models for <inline-formula id="inf145">
<mml:math id="m170">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf146">
<mml:math id="m171">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> prediction tasks.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="3" align="center">
<inline-formula id="inf147">
<mml:math id="m172">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="3" align="center">
<inline-formula id="inf148">
<mml:math id="m173">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">
<inline-formula id="inf149">
<mml:math id="m174">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">
<inline-formula id="inf150">
<mml:math id="m175">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">RF (<xref ref-type="bibr" rid="B16">Kim et al., 2023</xref>; <xref ref-type="bibr" rid="B13">Kalantari et al., 2025</xref>)</td>
<td align="center">13.13</td>
<td align="center">17.70</td>
<td align="center">0.65</td>
<td align="center">15.21</td>
<td align="center">17.81</td>
<td align="center">0.61</td>
</tr>
<tr>
<td align="center">SVR (<xref ref-type="bibr" rid="B13">Kalantari et al., 2025</xref>)</td>
<td align="center">18.19</td>
<td align="center">23.50</td>
<td align="center">0.58</td>
<td align="center">21.18</td>
<td align="center">24.61</td>
<td align="center">0.54</td>
</tr>
<tr>
<td align="center">LSTM (<xref ref-type="bibr" rid="B17">Kristiani et al., 2022</xref>; <xref ref-type="bibr" rid="B36">Xayasouk et al., 2020</xref>)</td>
<td align="center">11.28</td>
<td align="center">15.71</td>
<td align="center">0.72</td>
<td align="center">11.77</td>
<td align="center">16.02</td>
<td align="center">0.69</td>
</tr>
<tr>
<td align="center">TCN (<xref ref-type="bibr" rid="B32">Tang et al., 2021</xref>)</td>
<td align="center">10.49</td>
<td align="center">13.41</td>
<td align="center">0.77</td>
<td align="center">11.23</td>
<td align="center">13.21</td>
<td align="center">0.75</td>
</tr>
<tr>
<td align="center">TCN-LSTM (<xref ref-type="bibr" rid="B30">Ren et al., 2023</xref>)</td>
<td align="center">9.83</td>
<td align="center">15.43</td>
<td align="center">0.88</td>
<td align="center">15.75</td>
<td align="center">26.59</td>
<td align="center">0.87</td>
</tr>
<tr>
<td align="center">Informer (<xref ref-type="bibr" rid="B21">Lin et al., 2024</xref>)</td>
<td align="center">7.70</td>
<td align="center">9.46</td>
<td align="center">0.92</td>
<td align="center">10.32</td>
<td align="center">12.99</td>
<td align="center">0.91</td>
</tr>
<tr>
<td align="center">EBAMTN (ours)</td>
<td align="center">
<bold>2.03</bold>
</td>
<td align="center">
<bold>2.94</bold>
</td>
<td align="center">
<bold>0.94</bold>
</td>
<td align="center">
<bold>3.44</bold>
</td>
<td align="center">
<bold>4.99</bold>
</td>
<td align="center">
<bold>0.94</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the best result in each column (lowest MAE/RMSE or highest R<sup>2</sup>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Finally, the proposed EBAMTN model achieves the best overall performance across all metrics. It reduces the MAE and RMSE for <inline-formula id="inf151">
<mml:math id="m176">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to 2.03 and 2.94, and for <inline-formula id="inf152">
<mml:math id="m177">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to 3.44 and 4.99, respectively. The R<sup>2</sup> values for both pollutants exceed 0.94, fully demonstrating the strength of multi-task learning and the attention mechanism in capturing shared and task-specific temporal dynamics. While the quantitative comparisons in <xref ref-type="table" rid="T4">Table 4</xref> demonstrate the superior performance of EBAMTN over classical and recent models, it is important to further contextualize these results with respect to other multi-scale or attention-based frameworks. For instance, the TCN-LSTM hybrid model (<xref ref-type="bibr" rid="B30">Ren et al., 2023</xref>) partially captures hierarchical temporal patterns through convolutional and recurrent layers but lacks explicit attention mechanisms or task-specific optimization. Similarly, the Informer model (<xref ref-type="bibr" rid="B21">Lin et al., 2024</xref>) incorporates a sparse self-attention mechanism suitable for long-sequence forecasting but operates under a single-task setting, thus ignoring pollutant interdependencies. Compared with these models, EBAMTN not only leverages multi-scale convolutions and bidirectional memory but also integrates attention-guided feature fusion under a unified multi-task framework. This combination of architectural enhancements accounts for the model&#x2019;s improved generalization and robustness across cities and pollutants. These results confirm that the proposed model is highly suitable for high-precision air quality time series prediction tasks.</p>
<p>The subsequent analysis focuses on the performance of the proposed EBAMTN model across different urban environments. <xref ref-type="table" rid="T5">Table 5</xref> presents the prediction outcomes for <inline-formula id="inf153">
<mml:math id="m178">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf154">
<mml:math id="m179">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> concentrations in three cities: Guangzhou, Beijing, and Chengdu. Overall, the model demonstrates strong generalization capability and robustness across varied geographic and climatic contexts. In Guangzhou, the model achieves the highest goodness of fit, with an R<sup>2</sup> of 0.94 for both <inline-formula id="inf155">
<mml:math id="m180">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf156">
<mml:math id="m181">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The prediction errors are also notably low, with MAE &#x3d; 2.03 and RMSE &#x3d; 2.94 for <inline-formula id="inf157">
<mml:math id="m182">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and MAE &#x3d; 3.44, RMSE &#x3d; 4.99 for <inline-formula id="inf158">
<mml:math id="m183">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. These results confirm the model&#x2019;s high accuracy and stability in the southern urban setting, where pollution patterns are relatively smooth and seasonal transitions less drastic. In Chengdu, the model maintains similarly excellent performance, with R<sup>2</sup> values of 0.93 and 0.92 for <inline-formula id="inf159">
<mml:math id="m184">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf160">
<mml:math id="m185">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. Interestingly, most error metrics in Chengdu are lower than those in Guangzhou, suggesting the model&#x2019;s strong adaptability to the southwestern climate conditions, which are often characterized by humid weather and stable pollution dynamics. In contrast, the model&#x2019;s performance in Beijing, though still strong, shows a relative decline. The R<sup>2</sup> values remain high at 0.91 <inline-formula id="inf161">
<mml:math id="m186">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and 0.90 <inline-formula id="inf162">
<mml:math id="m187">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, but the error metrics increase significantly (MAE &#x3d; 4.15, RMSE &#x3d; 4.68 for <inline-formula id="inf163">
<mml:math id="m188">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>; MAE &#x3d; 4.81, RMSE &#x3d; 5.01 for <inline-formula id="inf164">
<mml:math id="m189">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). This performance drop indicates that the model is more challenged by the complex and highly volatile pollution patterns in northern cities, where seasonal transitions and extreme pollution events are more frequent.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Prediction performance of EBAMTN across three cities for <inline-formula id="inf165">
<mml:math id="m190">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf166">
<mml:math id="m191">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">City</th>
<th colspan="3" align="center">
<inline-formula id="inf167">
<mml:math id="m192">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="3" align="center">
<inline-formula id="inf168">
<mml:math id="m193">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">
<inline-formula id="inf169">
<mml:math id="m194">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">
<inline-formula id="inf170">
<mml:math id="m195">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Guangzhou</td>
<td align="center">
<bold>2.03</bold>
</td>
<td align="center">
<bold>2.94</bold>
</td>
<td align="center">
<bold>0.94</bold>
</td>
<td align="center">
<bold>3.44</bold>
</td>
<td align="center">
<bold>4.99</bold>
</td>
<td align="center">
<bold>0.94</bold>
</td>
</tr>
<tr>
<td align="center">Beijing</td>
<td align="center">4.15</td>
<td align="center">4.68</td>
<td align="center">0.91</td>
<td align="center">4.81</td>
<td align="center">5.01</td>
<td align="center">0.90</td>
</tr>
<tr>
<td align="center">Chengdu</td>
<td align="center">2.17</td>
<td align="center">2.87</td>
<td align="center">0.93</td>
<td align="center">2.85</td>
<td align="center">3.01</td>
<td align="center">0.92</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold indicates the best result in each column (lowest MAE/RMSE or highest R<sup>2</sup>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In summary, the proposed EBAMTN model exhibits good cross-regional generalization and maintains stable performance across diverse urban environments. However, further refinement may be needed to enhance its responsiveness under northern seasonal extremes and pollution surge scenarios. To further support the superiority of the proposed model, we highlight that EBAMTN achieves better temporal alignment with the actual pollutant concentration trends across different urban environments. As illustrated in <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref>, the predicted values not only capture the overall fluctuations but also track the turning points more effectively than baseline methods. This indicates stronger trend generalization and dynamic adaptation capabilities.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s4">
<title>4 Conclusion</title>
<p>This paper presents a multi-task air quality forecasting framework named Enhanced Bidirectional Attention Multi-Scale Temporal Network (EBAMTN), which integrates multi-scale Temporal Convolutional Networks (TCNs), enhanced BiLSTM, and linear/multi-head attention mechanisms to jointly improve forecasting accuracy and temporal representation learning. The proposed model demonstrates significant improvements in capturing both short-term fluctuations and long-term trends across multiple urban environments. By combining parallel multi-scale TCNs with linear attention, the model effectively captures temporal dependencies at various resolutions while maintaining computational efficiency. The incorporation of multi-head attention in the BiLSTM module enhances the model&#x2019;s ability to detect salient time intervals and bidirectional dependencies, improving interpretability and sequence modeling depth. The multi-task learning architecture further leverages inter-pollutant correlations to achieve superior accuracy compared to single-task models, with experiments showing R<sup>2</sup> values of 0.90 or higher for both <inline-formula id="inf171">
<mml:math id="m196">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf172">
<mml:math id="m197">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> across all test cities. Despite these advantages, the model has certain limitations. Specifically, during extreme pollution events or periods of rapid concentration changes, the prediction results exhibit minor lag or deviation, particularly for <inline-formula id="inf173">
<mml:math id="m198">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. This may be attributed to insufficient emphasis on rare events during training and the challenge of modeling nonlinear dispersion dynamics with limited features.</p>
<p>EBAMTN is well-suited for practical applications in real-time air quality monitoring and early warning systems. Its lightweight and modular design allows deployment on resource-constrained devices, while its strong generalization ability ensures robust performance across diverse urban regions. The dual benefits of accuracy and efficiency offer valuable decision support for environmental authorities.</p>
<p>Future work may focus on refining the attention mechanism to enhance responsiveness to sudden pollution spikes, introducing adaptive loss functions or importance-weighted sampling to improve performance on rare events, and extending the model to include more pollutants such as <inline-formula id="inf174">
<mml:math id="m199">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf175">
<mml:math id="m200">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Furthermore, integrating probabilistic forecasting techniques and online learning strategies could enhance the model&#x2019;s capacity to operate under uncertainty and evolving environmental conditions, ensuring its long-term robustness and adaptability in real-world deployments.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://archive.ics.uci.edu/dataset/501/beijing+multi+site+air+quality+data">https://archive.ics.uci.edu/dataset/501/beijing&#x2b;multi&#x2b;site&#x2b;air&#x2b;quality&#x2b;data</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>Z-AX: Conceptualization, Investigation, Data curation, Writing &#x2013; original draft, Methodology, Formal Analysis. C-OC: Validation, Methodology, Writing &#x2013; review and editing, Supervision, Conceptualization. JC: Writing &#x2013; review and editing, Supervision. WR: Writing &#x2013; review and editing, Validation, Supervision.</p>
</sec>
<sec sec-type="funding-information" id="s7">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declare that Generative AI was used in the creation of this manuscript. During the preparation of this work the authors used ChatGPT to improve the language and readability. After using this tool, the authors reviewed and edited the content as needed and take full responsibility for the content of the publication.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ansari</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ehrampoush</surname>
<given-names>M. H.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Meteorological correlates and AirQ&#x2b; health risk assessment of ambient fine particulate matter in Tehran, Iran</article-title>. <source>Environ. Res.</source> <volume>170</volume>, <fpage>141</fpage>&#x2013;<lpage>150</lpage>. <pub-id pub-id-type="doi">10.1016/j.envres.2018.11.046</pub-id>
<pub-id pub-id-type="pmid">30579988</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Appel</surname>
<given-names>K. W.</given-names>
</name>
<name>
<surname>Bash</surname>
<given-names>J. O.</given-names>
</name>
<name>
<surname>Fahey</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Foley</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Gilliam</surname>
<given-names>R. C.</given-names>
</name>
<name>
<surname>Hogrefe</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>The community multiscale air quality (CMAQ) model versions 5.3 and 5.3.1: system updates and evaluation</article-title>. <source>Geosci. Model Dev. Discuss.</source> <volume>2020</volume>, <fpage>1</fpage>&#x2013;<lpage>41</lpage>. <pub-id pub-id-type="doi">10.5194/gmd-14-2867-2021</pub-id>
<pub-id pub-id-type="pmid">34676058</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bednarski</surname>
<given-names>B. P.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Naeim</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ramezani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Temporal convolutional networks and data rebalancing for clinical length of stay and mortality prediction</article-title>. <source>Sci. Rep.</source> <volume>12</volume>, <fpage>21247</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-25472-z</pub-id>
<pub-id pub-id-type="pmid">36481828</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Box</surname>
<given-names>G. E.</given-names>
</name>
<name>
<surname>Jenkins</surname>
<given-names>G. M.</given-names>
</name>
<name>
<surname>Reinsel</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Ljung</surname>
<given-names>G. M.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Time series analysis: forecasting and control</source>. <publisher-name>John Wiley &#x26; Sons</publisher-name>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Probabilistic forecasting with temporal convolutional neural network</article-title>. <source>Neurocomputing</source> <volume>399</volume>, <fpage>491</fpage>&#x2013;<lpage>501</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2020.03.011</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Navon</surname>
<given-names>I. M.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Spatio-temporal hourly and daily ozone forecasting in China using a hybrid machine learning model: autoencoder and generative adversarial networks</article-title>. <source>J. Adv. Model. Earth Syst.</source> <volume>14</volume>, <fpage>e2021MS002806</fpage>. <pub-id pub-id-type="doi">10.1029/2021ms002806</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Short-term air quality prediction based on EMD-transformer-BiLSTM</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>20513</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-67626-1</pub-id>
<pub-id pub-id-type="pmid">39227685</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Air-quality prediction based on the ARIMA-CNN-LSTM combination model optimized by dung beetle optimizer</article-title>. <source>Sci. Rep.</source> <volume>13</volume>, <fpage>12127</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-023-36620-4</pub-id>
<pub-id pub-id-type="pmid">37495616</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Multi-scale analysis of the impacts of meteorology and emissions on PM2.5 and O3 trends at various regions in China from 2013 to 2020 2. Key weather elements and emissions</article-title>. <source>Sci. Total Environ.</source> <volume>824</volume>, <fpage>153847</fpage>. <pub-id pub-id-type="doi">10.1016/j.scitotenv.2022.153847</pub-id>
<pub-id pub-id-type="pmid">35189213</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hochreiter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Schmidhuber</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Long short-term memory</article-title>. <source>Neural Comput.</source> <volume>9</volume>, <fpage>1735</fpage>&#x2013;<lpage>1780</lpage>. <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
<pub-id pub-id-type="pmid">9377276</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iskandaryan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Trilles</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Graph neural network for air quality prediction: a case study in Madrid</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>2729</fpage>&#x2013;<lpage>2742</lpage>. <pub-id pub-id-type="doi">10.1109/access.2023.3234214</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jin</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Multivariate air quality forecasting with nested long short term memory neural network</article-title>. <source>IEEE Trans. Industrial Inf.</source> <volume>17</volume>, <fpage>8514</fpage>&#x2013;<lpage>8522</lpage>. <pub-id pub-id-type="doi">10.1109/tii.2021.3065425</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kalantari</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Gholami</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Malakooti</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kaskaoutis</surname>
<given-names>D. G.</given-names>
</name>
<name>
<surname>Saneei</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>An integrated feature selection and machine learning framework for PM10 concentration prediction</article-title>. <source>Atmos. Pollut. Res.</source> <volume>16</volume>, <fpage>102456</fpage>. <pub-id pub-id-type="doi">10.1016/j.apr.2025.102456</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karimian</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Evaluation of different machine learning approaches to forecasting PM2.5 mass concentrations</article-title>. <source>Aerosol Air Qual. Res.</source> <volume>19</volume>, <fpage>1400</fpage>&#x2013;<lpage>1410</lpage>. <pub-id pub-id-type="doi">10.4209/aaqr.2018.12.0450</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hossni</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A comparative analysis of LSTM models aided with attention and squeeze and excitation blocks for activity recognition</article-title>. <source>Sci. Rep.</source> <volume>15</volume>, <fpage>3858</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-025-88378-6</pub-id>
<pub-id pub-id-type="pmid">39890983</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>PM2.5 concentration forecasting using weighted Bi-LSTM and random forest feature importance-based feature selection</article-title>. <source>Atmosphere</source> <volume>14</volume>, <fpage>968</fpage>. <pub-id pub-id-type="doi">10.3390/atmos14060968</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kristiani</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.-R.</given-names>
</name>
<name>
<surname>Chuang</surname>
<given-names>Y.-H.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>C.-T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Short-term prediction of PM2.5 using LSTM deep learning methods</article-title>. <source>Sustainability</source> <volume>14</volume>, <fpage>2068</fpage>. <pub-id pub-id-type="doi">10.3390/su14042068</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumari</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S. K.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Machine learning-based time series models for effective CO2 emission prediction in India</article-title>. <source>Environ. Sci. Pollut. Res.</source> <volume>30</volume>, <fpage>116601</fpage>&#x2013;<lpage>116616</lpage>. <pub-id pub-id-type="doi">10.1007/s11356-022-21723-8</pub-id>
<pub-id pub-id-type="pmid">35780266</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Dzombak</surname>
<given-names>D. A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Use of the autoregressive integrated moving average (ARIMA) model to forecast near-term regional temperature and precipitation</article-title>. <source>Weather Forecast.</source> <volume>35</volume>, <fpage>959</fpage>&#x2013;<lpage>976</lpage>. <pub-id pub-id-type="doi">10.1175/waf-d-19-0158.1</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lelieveld</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Klingm&#xfc;ller</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Pozzer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Burnett</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Haines</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ramanathan</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Effects of fossil fuel and total anthropogenic emission removal on public health and climate</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>116</volume>, <fpage>7192</fpage>&#x2013;<lpage>7197</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1819989116</pub-id>
<pub-id pub-id-type="pmid">30910976</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhi</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Fei</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Incorporating the third law of geography with spatial attention module&#x2013;convolutional neural network&#x2013;transformer for fine-grained non-stationary air quality predictive learning</article-title>. <source>Mathematics</source> <volume>12</volume>, <fpage>1457</fpage>. <pub-id pub-id-type="doi">10.3390/math12101457</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>Analysis and prediction of air quality in Nanjing from autumn 2018 to summer 2019 using PCR&#x2013;SVR&#x2013;ARMA combined model</article-title>. <source>Sci. Rep.</source> <volume>11</volume>, <fpage>348</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-79462-0</pub-id>
<pub-id pub-id-type="pmid">33431941</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Intelligent modeling strategies for forecasting air quality time series: a review</article-title>. <source>Appl. Soft Comput.</source> <volume>102</volume>, <fpage>106957</fpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2020.106957</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Air pollutant prediction based on ARIMA-WOA-LSTM model</article-title>. <source>Atmos. Pollut. Res.</source> <volume>14</volume>, <fpage>101761</fpage>. <pub-id pub-id-type="doi">10.1016/j.apr.2023.101761</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Air pollutant prediction model based on transfer learning two-stage attention mechanism</article-title>. <source>Sci. Rep.</source> <volume>14</volume>, <fpage>7385</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-024-57784-7</pub-id>
<pub-id pub-id-type="pmid">38548823</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A review on the attention mechanism of deep learning</article-title>. <source>Neurocomputing</source> <volume>452</volume>, <fpage>48</fpage>&#x2013;<lpage>62</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.03.091</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<collab>World Health Organization</collab>
</person-group> (<year>2021</year>). <source>WHO global air quality guidelines</source>. <publisher-loc>Geneva, Switzerland</publisher-loc>: <publisher-name>WHO</publisher-name>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pouyaei</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sadeghi</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Souri</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Development and implementation of a physics-based convective mixing scheme in the community multiscale air quality modeling framework</article-title>. <source>J. Adv. Model. Earth Syst.</source> <volume>13</volume>, <fpage>e2021MS002475</fpage>. <pub-id pub-id-type="doi">10.1029/2021ms002475</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Multi-model evaluation and Bayesian model averaging in quantitative air quality forecasting in central China</article-title>. <source>Aerosol Air Qual. Res.</source> <volume>22</volume>, <fpage>210247</fpage>. <pub-id pub-id-type="doi">10.4209/aaqr.210247</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Deep learning coupled model based on TCN-LSTM for particulate matter concentration prediction</article-title>. <source>Atmos. Pollut. Res.</source> <volume>14</volume>, <fpage>101703</fpage>. <pub-id pub-id-type="doi">10.1016/j.apr.2023.101703</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Seng</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Spatiotemporal prediction of air quality based on LSTM neural network</article-title>. <source>Alexandria Eng. J.</source> <volume>60</volume>, <fpage>2021</fpage>&#x2013;<lpage>2032</lpage>. <pub-id pub-id-type="doi">10.1016/j.aej.2020.12.009</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Forecasting hourly PM<sub>2.5</sub> based on deep temporal convolutional network</article-title>. <source>Appl. Soft Comput.</source> <volume>112</volume>, <fpage>107751</fpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2021.107751</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tao</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sidorov</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Air pollution forecasting using a deep learning model based on 1D convnets and bidirectional GRU</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>76690</fpage>&#x2013;<lpage>76698</lpage>. <pub-id pub-id-type="doi">10.1109/access.2019.2921578</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tran</surname>
<given-names>H. D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J.-Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.-H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Forecasting hourly PM<sub>2.5</sub> concentration with an optimized LSTM model</article-title>. <source>Atmos. Environ.</source> <volume>315</volume>, <fpage>120161</fpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2023.120161</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>SSFAN: a compact and efficient spectral-spatial feature extraction and attention-based neural network for hyperspectral image classification</article-title>. <source>Remote Sens.</source> <volume>16</volume>, <fpage>4202</fpage>. <pub-id pub-id-type="doi">10.3390/rs16224202</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xayasouk</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Air pollution prediction using long short-term memory (LSTM) and deep autoencoder (DAE) models</article-title>. <source>Sustainability</source> <volume>12</volume>, <fpage>2570</fpage>. <pub-id pub-id-type="doi">10.3390/su12062570</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Air quality index forecast in Beijing based on CNN-LSTM multi-model</article-title>. <source>Chemosphere</source> <volume>308</volume>, <fpage>136180</fpage>. <pub-id pub-id-type="doi">10.1016/j.chemosphere.2022.136180</pub-id>
<pub-id pub-id-type="pmid">36058367</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Modeling air quality PM<sub>2.5</sub> forecasting using deep sparse attention-based transformer networks</article-title>. <source>Int. J. Environ. Sci. Technol.</source> <volume>20</volume>, <fpage>13535</fpage>&#x2013;<lpage>13550</lpage>. <pub-id pub-id-type="doi">10.1007/s13762-023-04900-1</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
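</person-group> (<year>2023</year>). <article-title>Sparse attention mechanism in transformer networks for time series forecasting</article-title>. <source>IEEE Access</source> <volume>11</volume>, <fpage>45678</fpage>&#x2013;<lpage>45689</lpage>. <pub-id pub-id-type="doi">10.1007/s13762-023-04900-1</pub-id><!-- NOTE(review): this DOI is identical to the one in ref B38 and has a Springer prefix, not an IEEE Access one; verify and correct against the publisher record -->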
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Forecasting of Beijing PM<sub>2.5</sub> with a hybrid ARIMA model based on integrated AIC and improved GS fixed-order methods and seasonal decomposition</article-title>. <source>Heliyon</source> <volume>8</volume>, <fpage>e12239</fpage>. <pub-id pub-id-type="doi">10.1016/j.heliyon.2022.e12239</pub-id>
<pub-id pub-id-type="pmid">36590504</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Using long short-term memory networks to predict energy consumption of air-conditioning systems</article-title>. <source>Sustain. Cities Soc.</source> <volume>55</volume>, <fpage>102000</fpage>. <pub-id pub-id-type="doi">10.1016/j.scs.2019.102000</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ziernicka-Wojtaszek</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zu&#x15b;ka</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Kopci&#x144;ska</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Assessment of the effect of meteorological conditions on the concentration of suspended PM<sub>2.5</sub> particulate matter in central Europe</article-title>. <source>Sustainability</source> <volume>16</volume>, <fpage>4797</fpage>. <pub-id pub-id-type="doi">10.3390/su16114797</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>
