<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Environ. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Environmental Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Environ. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-665X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1775982</article-id>
<article-id pub-id-type="doi">10.3389/fenvs.2026.1775982</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Development of a novel imputation framework for PM2.5 particle data in Pakistani cities using machine learning and statistical techniques</article-title>
<alt-title alt-title-type="left-running-head">Khan et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fenvs.2026.1775982">10.3389/fenvs.2026.1775982</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Khan</surname>
<given-names>Muhammad Asad</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2757953"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pan</surname>
<given-names>Jiazhu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3233825"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Alshatti</surname>
<given-names>Amani</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Alsaber</surname>
<given-names>Ahmad</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2649016"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gray</surname>
<given-names>Alison</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2016460"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Department of Mathematics and Statistics, University of Strathclyde</institution>, <city>Glasgow</city>, <country country="GB">United Kingdom</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Department of Health Sciences, Public Authority of Applied Education and Training (PAAET) College of Health Sciences</institution>, <city>Safat</city>, <country country="KW">Kuwait</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>Management Department, College of Business and Economics, American University of Kuwait (AUK)</institution>, <city>Salmiya</city>, <country country="KW">Kuwait</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Muhammad Asad Khan, <email xlink:href="mailto:muhammad.a.khan@strath.ac.uk">muhammad.a.khan@strath.ac.uk</email>; Ahmad Alsaber, <email xlink:href="mailto:aalsaber@auk.edu.kw">aalsaber@auk.edu.kw</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-19">
<day>19</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>14</volume>
<elocation-id>1775982</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>29</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Khan, Pan, Alshatti, Alsaber and Gray.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Khan, Pan, Alshatti, Alsaber and Gray</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-19">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Missing PM2.5 observations in environmental monitoring systems, caused by sensor malfunctions, communication failures, maintenance issues, and coverage gaps, compromise public health assessments and evidence-based air quality policymaking. Reliable imputation strategies are therefore essential to preserve data integrity and analytical validity.</p>
</sec>
<sec>
<title>Methods</title>
<p>This study evaluated five imputation techniques: Bayesian Regression (BR), K-Nearest Neighbors (KNN), missForest, Predictive Mean Matching (PMM), and Random Forest (RF), using daily PM2.5 measurements collected between May 2019 and December 2024 from monitoring stations in Islamabad, Karachi, Lahore, and Peshawar, Pakistan. Three missing data mechanisms, MCAR, MAR, and MNAR, were simulated at missing rates ranging from 5% to 25%. Model performance was assessed using Root Mean Square Error (RMSE) and Mean Absolute Error (MAE).</p>
</sec>
<sec>
<title>Results</title>
<p>Imputation under the MAR mechanism consistently yielded lower error values as missingness increased. Across all mechanisms and missing rates, missForest and KNN demonstrated superior performance. Notably, missForest achieved the lowest RMSE and MAE values overall and effectively preserved the temporal structure, range, and variability of the PM2.5 series.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The findings suggest that machine-learning-based approaches, particularly missForest, provide robust and reliable imputation for PM2.5 datasets with varying missingness patterns. These results support the use of missForest as a preferred method for handling incomplete air quality data in similar monitoring contexts, thereby strengthening the reliability of environmental health analyses and air quality policy development.</p>
</sec>
</abstract>
<kwd-group>
<kwd>air quality monitoring</kwd>
<kwd>machine learning</kwd>
<kwd>missForest</kwd>
<kwd>Pakistan</kwd>
<kwd>PM2.5 missing data imputation</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="9"/>
<table-count count="6"/>
<equation-count count="13"/>
<ref-count count="48"/>
<page-count count="15"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Environmental Informatics and Remote Sensing</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>In data-driven environmental health, incomplete monitoring records bias inference, reduce model accuracy, and impede policy decisions; addressing missing data through imputation and machine learning (ML) is essential for reliable PM2.5 particulate matter assessment and intervention planning (<xref ref-type="bibr" rid="B15">J&#xe4;ger et al., 2021</xref>). Combining machine-learning forecasts and statistical imputation utilizes multi-source inputs (meteorology, traffic, remote sensing) to reconstruct high-resolution PM2.5 time series, improving predictive performance while remaining computationally efficient for local and regional applications (<xref ref-type="bibr" rid="B12">Fan et al., 2023</xref>; <xref ref-type="bibr" rid="B35">Saheer et al., 2022</xref>). Benchmarking imputation approaches and adapting methods to the temporal and spatial structure of urban PM2.5 maximizes reconstruction fidelity and supports robust exposure&#x2013;health analyses in Pakistani cities, thereby strengthening evidence for air quality and public health policy (<xref ref-type="bibr" rid="B11">Darji et al., 2024</xref>; <xref ref-type="bibr" rid="B26">Mendes et al., 2022</xref>).</p>
<p>Ambient air pollution represents one of the most significant environmental and public health challenges of the 21st century. Among the various pollutants, particulate matter with an aerodynamic diameter of less than 2.5&#xa0;&#x3bc;m (PM2.5) (<xref ref-type="bibr" rid="B22">Liang et al., 2016</xref>) poses significant health risks and environmental challenges globally. Accurate PM2.5 data is essential for effective monitoring, policy-making, and public health interventions. In developing countries like Pakistan, rapid industrialization, urbanization, and increased vehicular emissions have led to a significant deterioration of air quality, especially in major urban centers like Lahore, Karachi, Peshawar and Islamabad. The predominant sources of fine particles in the studied region are vehicular activity, biomass fires and industry (<xref ref-type="bibr" rid="B29">Ngangmo et al., 2023</xref>; <xref ref-type="bibr" rid="B27">Mezoue et al., 2023</xref>). The country frequently ranks among those with the highest levels of PM2.5 pollution globally, posing a severe threat to the economy and to the health of millions of its citizens, with an estimated annual cost of 6.5% of GDP due to health costs, reduced productivity and agricultural degradation (<xref ref-type="bibr" rid="B38">Suleman, 2022</xref>). Recognizing this crisis, the government has installed air quality monitoring stations across the country to generate time-series data that is fundamental for various research and regulatory purposes.</p>
<p>However, the data collected from these monitoring stations are notoriously prone to incompleteness. Missing values in time-series data present an inherent challenge, often arising from a confluence of factors: instrument malfunctions (e.g., sensor drift, calibration periods), harsh environmental conditions, sensor failures, power outages, communication failures, and routine maintenance schedules (<xref ref-type="bibr" rid="B14">Hua et al., 2024</xref>; <xref ref-type="bibr" rid="B39">Sun et al., 2023</xref>; <xref ref-type="bibr" rid="B5">Arnaut et al., 2024</xref>). The presence of missing data creates a significant impediment for data scientists and environmental researchers, as most statistical models and machine learning algorithms require complete datasets for training and inference. The critical question, therefore, shifts from whether data is missing to how to handle its absence effectively.</p>
<p><xref ref-type="bibr" rid="B44">Wijesekara and Liyanage (2023)</xref> describe three different approaches to dealing with missing data: data deletion, imputation, and predictive estimation. In univariate time-series data, where a single variable is studied across time, the handling of missing data is especially problematic because temporal dependency structures must be maintained during imputation. Missing values in time series, if not handled properly, may often lead to biased results, reduce statistical power, and distort findings (<xref ref-type="bibr" rid="B43">Wijesekara and Liyanage, 2021</xref>). Ignoring these gaps or using simplistic approaches such as mean imputation generates datasets that do not adequately model reality, resulting in erroneous insights. Consequently, many researchers have developed sophisticated imputation techniques, each with their unique advantages and disadvantages.</p>
<p>Missing data in PM10 time series have been shown to lead to underestimating pollution level exceedances, making the assessment and management of air quality more problematic (<xref ref-type="bibr" rid="B1">Albano et al., 2018</xref>). This poses major difficulties in fulfilling public health protection obligations outlined in European law. Failing to acknowledge missing data can result in inadequate sampling, faulty measurements and data collection problems (<xref ref-type="bibr" rid="B16">Junninen et al., 2004</xref>). Some of the literature reviews the principal techniques developed to accommodate missing values within the context of univariate time series data, assessing their effectiveness, theory, and application. Two common approaches to cope with missing data are the single imputation (SI) approach and the multiple imputation (MI) approach (<xref ref-type="bibr" rid="B31">Noor et al., 2015</xref>). It has been shown that the choice of imputation technique is determined by the attributes of the time series, the proportion of missing data, and the acceptable margin of error (<xref ref-type="bibr" rid="B33">Ribeiro and Castro, 2022</xref>). Single imputation methods are more straightforward and quicker to execute, yet multiple imputation and model-driven methods are more advanced, trustworthy and precise. As noted by <xref ref-type="bibr" rid="B9">Chhabra (2023)</xref>, approaches for dealing with missing values in a univariate time series are either simple imputation-based or model-based strategies. Imputation-based techniques involve direct estimation of missing values with the mean, median or mode, while methods that solve equations based on likelihoods are model-based methods (<xref ref-type="bibr" rid="B4">Armina et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Aljuaid and Sasi, 2016</xref>; <xref ref-type="bibr" rid="B32">Pereira et al., 2024</xref>).</p>
<p>To understand missing data in detail, it is necessary to acknowledge the various missing data mechanisms. These include missing completely at random (MCAR), missing at random (MAR), and missing not at random (MNAR), all of which need custom approaches to formulate effective imputation strategies (<xref ref-type="bibr" rid="B17">Kleinke et al., 2020</xref>).</p>
<p>The primary objective of this study is to develop and evaluate a robust imputation framework for addressing missing values in PM2.5 air quality datasets using advanced statistical and machine learning methods. Thus, this paper provides the first systematic evaluation of missing data methods such as Bayesian Regression (BR), K-Nearest Neighbors (KNN), missForest, Predictive Mean Matching (PMM), and Random Forest (RF), with a focus on the multi-year PM2.5 data from four major Pakistani cities. By simulating realistic missing data mechanisms and measuring the accuracy of imputations through metrics like RMSE and MAE, this study provides a solid framework to combat the gaps in sparse air quality monitoring networks. These findings will be useful to policymakers and researchers who rely on proper estimates of PM2.5 in health risk assessment and environmental planning.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related works</title>
<p>Here we discuss how scholars try to mitigate the still overwhelming issue of high rates of missing data. Several methodological frameworks are discussed, highlighting both the advances and ongoing challenges of the problem. Some basic imputation methods, such as simple mean imputation and spline interpolation, often perform poorly in univariate settings due to their reliance on inter-attribute dependencies (<xref ref-type="bibr" rid="B28">Moritz et al., 2015</xref>). Likewise, according to <xref ref-type="bibr" rid="B24">Lien et al. (2023)</xref>, mean imputation, in which missing data points are replaced with the mean of the available data, has been shown to yield inadequate results in datasets with more than 10% missing data because of temporal variability in PM2.5 data. <xref ref-type="bibr" rid="B30">Niako et al. (2024)</xref> applied and compared multiple imputation methods, including Kalman filtering, linear interpolation, and moving averages, quantifying their effects on forecasting accuracy of the ARIMA and LSTM models. Advanced methods such as ARIMA state-space models with Kalman smoothing have been proven to be robust against shifts of means and heavy-tailed distributions (<xref ref-type="bibr" rid="B45">Zainuddin et al., 2022</xref>; <xref ref-type="bibr" rid="B13">Haile et al., 2024</xref>). <xref ref-type="bibr" rid="B36">Sharma et al. (2025)</xref> used an ARIMA model to predict PM2.5 concentration in Indian satellite cities and the model exhibited a high level of accuracy. <xref ref-type="bibr" rid="B18">Kumar et al. (2024)</xref> implemented machine learning models by building intricate relationships with various meteorological variables and considered the linear regression model to be the most favorable method for predicting PM2.5 concentration. Some studies employed various predictive modelling techniques to predict PM2.5, PM10 and other leading pollutants. 
For instance, researchers used machine learning and data mining algorithms including Independent component regression (ICR), ElasticNet (ENET), boosted tree (BT), Random Forest (RF), Support Vector Machine (SVM), Bagged Multivariate Adaptive Regression Splines (MARS), and Bayesian Regularized Neural Networks (BRNN) to predict the distribution of pollutants like <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mn>2.5</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mn>10</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>S</mml:mi>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B21">Kumar P. et al., 2025</xref>; <xref ref-type="bibr" rid="B10">Choudhary et al., 2023</xref>).</p>
<p>Further to this, <xref ref-type="bibr" rid="B42">Wijesekara and Liyanage (2020)</xref> found that Kalman smoothing on Structural Time Series showed strong performance compared to six other methods of imputation on air quality data. A study by <xref ref-type="bibr" rid="B5">Arnaut et al. (2024)</xref> investigated the application of the Random Forest (RF) algorithm for bi-directional imputation of missing values for air quality data, with emphasis on PM2.5 concentrations, and analyzed its performance compared to simpler imputation approaches such as mean and median imputation.</p>
<p>A study by <xref ref-type="bibr" rid="B40">Tyagi et al. (2021)</xref> analyzed air quality data and investigated five imputation techniques: K-Nearest Neighbors (KNN), Linear Interpolation (LI), Expectation-Maximization (EM), Multiple Imputation by Chained Equations (MICE), and Random Forest (RF). The outcomes suggest that Random Forest imputation is the optimal method for filling in absent data. <xref ref-type="bibr" rid="B3">Alsaber, Pan, and Al-Hurban (2021)</xref> used imputation methods for missing values in environmental data sets collected in Kuwait and evaluated the performance of missForest, K-Nearest Neighbors (KNN), and Bayesian principal component analysis (Bayesian PCA) in a systematic way. Results showed that missForest has the lowest imputation error for varying degrees of missing data. The R statistical software has several packages designed for missing data, including imputation. A prominent R package, mice (Multivariate Imputation by Chained Equations), while primarily designed for multivariate data, can also be applied to univariate time series by treating lagged values of the series as predictors (<xref ref-type="bibr" rid="B8">Buuren and Groothuis-Oudshoorn, 2011</xref>). Bayesian regression also offers a robust statistical approach for dealing with missing data. According to <xref ref-type="bibr" rid="B6">A&#xdf;mann et al. (2023)</xref>, a Bayesian regression approach, which incorporates prior knowledge and manages uncertainty during the imputation process, is very useful for dealing with missing data in air quality datasets.</p>
<p>Several studies highlight the role of evaluation metrics in comparing imputation methods. <xref ref-type="bibr" rid="B48">Rantou et al. (2017)</xref> advocate for using Mean Root Squared Error (MRSE) and Mean Absolute Percentage Error (MAPE) to assess imputation accuracy, particularly in predictive modelling contexts. The approach with the lowest expected mean square error (EMSE) was identified for each type of missing data among six imputation techniques across various scenarios, with the intention of improving logistic regression models. <xref ref-type="bibr" rid="B16">Junninen et al. (2004)</xref> assessed different techniques for imputing missing values in air quality data sets, classifying them into univariate, multivariate, and hybrid methods, as well as multiple imputation approaches. The main objective was to evaluate their efficacy in addressing data shortages. The effectiveness of each imputation method was assessed using the Root Mean Square Error (RMSE) and the Normalized Root Mean Square Error (NRMSE), which compare predicted values to actual observed values (<xref ref-type="bibr" rid="B40">Tyagi et al., 2021</xref>). <xref ref-type="bibr" rid="B24">Lien et al. (2023)</xref> applied classical imputation methods and advanced approaches on a crowd-sourced Fitbit data set, revealing that KNN performs best under low to moderate missingness (&#x3c;30%).</p>
<p>This paper evaluates multiple imputation methodologies for PM2.5 air quality data, emphasizing their effectiveness in addressing different missing data scenarios. The analysis systematically compares established and emerging techniques. These include multiple imputation methods, as well as spectral and machine learning-based approaches. The study considers each method under a range of missing data mechanisms to inform the selection of optimal imputation strategies (<xref ref-type="bibr" rid="B23">Libasin et al., 2020</xref>; <xref ref-type="bibr" rid="B46">Zhang and Thorburn, 2022</xref>). The data set, from four monitoring stations or US consulates in Pakistan, has missing data, for various possible reasons. One reason is that there were many changes to the routine maintenance at the monitoring sites. Human error is a second reason. Thirdly, the PM2.5 data-sharing network needed to publish the information has been put on hold because of a lack of funds, resulting in missing data in the studied data set.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Objectives of the study</title>
<p>
<list list-type="bullet">
<list-item>
<p>To investigate the trend and percentage of missing values in PM2.5 data in urban cities of Pakistan.</p>
</list-item>
<list-item>
<p>To analyze the pattern of missingness through statistical and machine learning approaches.</p>
</list-item>
<list-item>
<p>To compare and measure the precision of each imputation approach.</p>
</list-item>
<list-item>
<p>To suggest the most appropriate imputation method for PM2.5 in the context of urban Pakistani data.</p>
</list-item>
</list>
</p>
</sec>
<sec sec-type="methods" id="s4">
<label>4</label>
<title>Methods</title>
<p>This section consists of data collection, analysis of missing data and implementation of various imputation methods.</p>
<sec id="s4-1">
<label>4.1</label>
<title>Data description and missingness</title>
<p>In Pakistan, air pollution data is collected through government monitoring stations, low-cost sensor networks, satellite remote sensing and research-based intermittent sampling. Because the Pakistan Environmental Protection Agency (Pak-EPA) and provincial Environmental Protection Departments (EPDs) operate relatively few monitoring stations, the density of such stations is low, hence requiring a significant reliance on other modalities to provide complete, real-time information, particularly with reference to PM2.5. In this study, the daily average PM2.5 concentration data were sourced from four urban air quality monitoring stations situated in four major Pakistani cities: Lahore, Karachi, Islamabad, and Peshawar, collectively operated by the federal government department (EPA) together with provincial EPAs and US consulate monitoring networks. The data were acquired from the publicly accessible Air Quality Index (AQI) platforms (<ext-link ext-link-type="uri" xlink:href="https://www.iqair.com/pakistan">https://www.iqair.com/pakistan</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://aqicn.org/map/pakistan/">https://aqicn.org/map/pakistan/</ext-link>), which compile real-time sensor data. The data set encompasses almost a 6-year period from 27 May 2019 to 31 December 2024, comprising over 2,046 daily average observations per station, assuming complete temporal coverage. Each record includes a timestamp (local time), PM2.5 concentration values (in &#xb5;g/m<sup>3</sup>), and metadata such as station ID and location coordinates. Supplementary meteorological parameters (temperature, humidity, and wind speed) were also obtained from <ext-link ext-link-type="uri" xlink:href="https://www.visualcrossing.com/weather-data/">https://www.visualcrossing.com/weather-data/</ext-link> for multivariate analysis; these do not have any missing values. 
Before analysis, the data set for each city was cleaned and quality checked. Duplicate records were removed and incorrect timestamps were corrected to maintain the chronological consistency of the data set. Preliminary summary analysis and time series plots were used to verify the seasonal variation and data integrity. These processing steps were necessary for accurate imputation analysis.</p>
<p>
<xref ref-type="table" rid="T1">Table 1</xref> shows the descriptive statistics of the PM2.5 concentrations across the four major Pakistani cities, which exhibited substantial spatial and statistical variability, reflecting differing pollution dynamics and urban environmental pressures. Lahore recorded the highest mean concentration (186&#xa0;&#xb5;g/m<sup>3</sup>) and the widest range (575&#xa0;&#xb5;g/m<sup>3</sup>), with a pronounced right-skewed distribution (skewness &#x3d; 1.147), indicating frequent extreme pollution episodes. Peshawar followed with a mean of 149.9&#xa0;&#xb5;g/m<sup>3</sup> and the highest skewness (1.704), suggesting even more asymmetrical pollution patterns. In contrast, Islamabad and Karachi showed relatively lower mean values (114.4&#xa0;&#xb5;g/m<sup>3</sup> and 108.6&#xa0;&#xb5;g/m<sup>3</sup>, respectively) and moderate skewness, pointing to more stable but still elevated pollution levels. Overall, the mean concentration was higher than the median. Standard deviations ranged from 42.1&#xa0;&#xb5;g/m<sup>3</sup> in Karachi to 87.79&#xa0;&#xb5;g/m<sup>3</sup> in Lahore, underscoring the heterogeneity in pollutant dispersion.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Descriptive statistics of pollutant PM2.5 concentrations (May 2019-December 2024) for the four studied Pakistani cities.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="5" align="center">City</th>
</tr>
<tr>
<th align="left">Pollutant</th>
<th align="left">Islamabad</th>
<th align="left">Karachi</th>
<th align="left">Lahore</th>
<th align="left">Peshawar</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="5" align="left">PM2.5</td>
</tr>
<tr>
<td align="left">Valid observations</td>
<td align="center">2,046</td>
<td align="center">2,046</td>
<td align="center">2,046</td>
<td align="center">2,046</td>
</tr>
<tr>
<td align="left">Mean</td>
<td align="center">114.436</td>
<td align="center">108.576</td>
<td align="center">185.981</td>
<td align="center">149.887</td>
</tr>
<tr>
<td align="left">Median</td>
<td align="center">107.000</td>
<td align="center">95.000</td>
<td align="center">162.000</td>
<td align="center">142.000</td>
</tr>
<tr>
<td align="left">Std. Deviation</td>
<td align="center">43.486</td>
<td align="center">42.103</td>
<td align="center">87.787</td>
<td align="center">58.558</td>
</tr>
<tr>
<td align="left">Skewness</td>
<td align="center">0.679</td>
<td align="center">0.741</td>
<td align="center">1.147</td>
<td align="center">1.704</td>
</tr>
<tr>
<td align="left">Range</td>
<td align="center">288.000</td>
<td align="center">268.000</td>
<td align="center">575.000</td>
<td align="center">551.000</td>
</tr>
<tr>
<td align="left">Minimum</td>
<td align="center">10.000</td>
<td align="center">9.000</td>
<td align="center">5.000</td>
<td align="center">6.000</td>
</tr>
<tr>
<td align="left">Maximum</td>
<td align="center">298.000</td>
<td align="center">277.000</td>
<td align="center">580.000</td>
<td align="center">557.000</td>
</tr>
<tr>
<td align="left">No. of missing observations</td>
<td align="center">152</td>
<td align="center">245</td>
<td align="center">272</td>
<td align="center">137</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Out of a total of 2,046 daily observations per station (8,184 station-days across the four cities), a considerable share, approximately 9.8% (806 records), of the data were missing, with the highest incidence of missing entries reported in Lahore (272 records) and Karachi (245 records). Such data gaps have implications for the accuracy and reliability of longitudinal trend analyses and necessitate robust imputation strategies, as explored in subsequent sections.</p>
<p>
<xref ref-type="fig" rid="F1">Figure 1</xref> illustrates the time series of the PM2.5 concentrations from 2019 to 2024, including seasonal variations and the completeness of the data from the four urban monitoring stations. The most notable spikes above the trend line occur in the winter months, which may be the result of temperature inversions and associated emissions. The seasonal variability is high; during the post-monsoon season and winter, high concentrations and a high incidence of poor air quality can be observed, which can be explained by the burning of stubble, the increase in anthropogenic emissions, and specific meteorological circumstances (<xref ref-type="bibr" rid="B19">Kumar et al., 2025a</xref>). It is important to note that seasonal and spatial heterogeneity is observed in the distribution of air pollution, with PM2.5 recorded at its highest levels in winter in some Indian cities (<xref ref-type="bibr" rid="B20">Kumar et al., 2025b</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Time series plot of PM2.5 concentrations from May 2019 to December 2024, having missing values from four different monitoring stations in Pakistan.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g001.tif">
<alt-text content-type="machine-generated">Four line graphs display PM2.5 concentration trends from 2019 to 2024 for Islamabad, Karachi, Lahore, and Peshawar, revealing recurring seasonal spikes with Lahore and Peshawar peaking higher than Islamabad and Karachi.</alt-text>
</graphic>
</fig>
<p>The discontinuities in the series indicate periods of missing data, which often occur in clusters. <xref ref-type="fig" rid="F2">Figure 2</xref> shows the proportion of missing PM2.5 data by year; a greater percentage of data is missing from 2021 onward, and the greatest percentage of missing data is for the cities of Lahore and Karachi. <xref ref-type="fig" rid="F3">Figure 3</xref> breaks the information down further by month and indicates that the greatest loss of data occurs during the summer months, from June to August, and in the first months of the winter season, November and December. <xref ref-type="fig" rid="F4">Figure 4</xref> presents a simplified view of data availability in which gray denotes available data and black denotes missing data. Overall, the missing data constitute only 9.8% of the entire data set. The missing data are not uniformly distributed; they are more concentrated in the data from the cities of Lahore and Karachi. In contrast, the data from the cities of Islamabad and Peshawar have a more even distribution.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Missing values of PM2.5 per year (May 2019 to December 2024).</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g002.tif">
<alt-text content-type="machine-generated">Dot plot showing percentage of missing values from 2019 to 2024 for Lahore, Karachi, Islamabad, and Peshawar, with missing values increasing over time, peaking in 2022 for Lahore and Karachi.</alt-text>
</graphic>
</fig>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Missing values of PM2.5 per month (May 2019 to December 2024).</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g003.tif">
<alt-text content-type="machine-generated">Line chart grid displays the percentage of missing values across four sites&#x2014;Lahore, Karachi, Islamabad, and Peshawar&#x2014;for each month. Each subplot represents a different month, with horizontal lines for each site indicating varying percentages, ranging from near zero to just over thirty percent.</alt-text>
</graphic>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Distribution of missing (shown in black) and non-missing (shown in gray) values of PM2.5 across the monitoring sites in the four major cities.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g004.tif">
<alt-text content-type="machine-generated">Matrix plot visualizing missing and present data across observations for four cities&#x2014;Islamabad, Karachi, Lahore, and Peshawar&#x2014;where black lines indicate missing values, comprising nine point eight percent of the data.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Imputation methods</title>
<sec id="s4-2-1">
<label>4.2.1</label>
<title>Bayesian regression</title>
<p>Bayesian regression imputation places the missing-data imputation problem within a Bayesian framework. A regression model is assumed for the variable with missing values, conditional on the other variables, and prior distributions are assigned to the model parameters. Next, using the observed data, the posterior distribution of the parameters and missing values is obtained. Consider the following linear regression model with missing data:<disp-formula id="equ1">
<mml:math id="m2">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x223c;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="normal">Y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> vector of the observed and missing values of the target variable, <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the matrix of predictors (observed or imputed before), <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the regression coefficients, and <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the residual variance. The response <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the predictor matrix <inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are partitioned into observed and missing components as<disp-formula id="equ2">
<mml:math id="m10">
<mml:mrow>
<mml:mi mathvariant="normal">Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>For the observed data, the likelihood function can be expressed as<disp-formula id="equ3">
<mml:math id="m11">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="italic">exp</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Prior distributions are then assigned to the model parameters, usually conjugate priors:<disp-formula id="equ4">
<mml:math id="m12">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x223c;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x223c;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The product of the likelihood and the priors yields the joint posterior distribution:<disp-formula id="equ5">
<mml:math id="m13">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x221d;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Based on this, the posterior distributions of the parameters can be obtained as follows:<disp-formula id="equ6">
<mml:math id="m14">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="" close="|" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>where<disp-formula id="equ7">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mn>0</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x3b2;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mn>0</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The posterior distribution for the residual variance <inline-formula id="inf9">
<mml:math id="m16">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is given by<disp-formula id="equ8">
<mml:math id="m17">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>with<disp-formula id="equ9">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:msubsup>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mn>0</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Once the posterior distribution is obtained, the missing values are imputed by drawing from their conditional distribution given the sampled parameters:<disp-formula id="equ10">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>This process is repeated M times, and each time new values of the parameters <inline-formula id="inf10">
<mml:math id="m20">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf11">
<mml:math id="m21">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are sampled from their respective posterior distributions, thereby generating multiple imputations.</p>
<p>To generate multiple imputations, this process is repeated <inline-formula id="inf12">
<mml:math id="m22">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> times, each time drawing new parameters from their respective posterior distributions; the resulting multiple imputations are as follows:<disp-formula id="equ11">
<mml:math id="m23">
<mml:mrow>
<mml:msubsup>
<mml:mi>Y</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mi>m</mml:mi>
</mml:msubsup>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>m</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
</sec>
<sec id="s4-2-2">
<label>4.2.2</label>
<title>KNN</title>
<p>The KNN imputation approach relies on the k-nearest neighbours algorithm, which calculates the pairwise distances between observations in order to select the k most similar records. The main idea of this approach is that a missing datum can be estimated from the neighbours that are closest to the target observation. The similarity, or proximity, between observations can be measured using a number of distance measures; the most common distance measure for continuous variables is the Euclidean distance, which is defined as follows.<disp-formula id="equ12">
<mml:math id="m24">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>p</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>where,</p>
<p>
<inline-formula id="inf13">
<mml:math id="m25">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> the observation on which the distance is to be calculated, which is usually the observation with missing data.</p>
<p>
<inline-formula id="inf14">
<mml:math id="m26">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> the other observation used in the comparison, typically one with fully observed data.</p>
<p>
<inline-formula id="inf15">
<mml:math id="m27">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>.</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where, p is the total number of features/attributes.</p>
</sec>
<sec id="s4-2-3">
<label>4.2.3</label>
<title>Random forest</title>
<p>Random Forest, first introduced by <xref ref-type="bibr" rid="B7">Breiman (2001)</xref>, is an ensemble learning algorithm that builds a series of decision trees, each of which is trained on a bootstrap sample of the observed data. The approach considers each variable with missing values as a dependent variable and uses the rest of the variables as predictors to estimate the missing values. The final imputed value is determined by aggregating the predictions across all trees, using the mean for continuous variables or the majority vote for categorical variables. In regression tasks, the final prediction is derived by<disp-formula id="equ13">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>B</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>where,</p>
<p>
<inline-formula id="inf16">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> &#x3d; imputed value for observation <inline-formula id="inf17">
<mml:math id="m30">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
<p>
<inline-formula id="inf18">
<mml:math id="m31">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> &#x3d; number of trees in the forest</p>
<p>
<inline-formula id="inf19">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> &#x3d; prediction from tree <inline-formula id="inf20">
<mml:math id="m33">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> based on the observed predictor values <inline-formula id="inf21">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for observation <inline-formula id="inf22">
<mml:math id="m35">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s4-2-4">
<label>4.2.4</label>
<title>MissForest</title>
<p>
<xref ref-type="bibr" rid="B47">Stekhoven and B&#xfc;hlmann (2012)</xref> introduced an iterative imputation method known as MissForest, based on the Random Forest algorithm, that is specifically designed to handle incomplete data in both continuous and categorical variables. In each iteration and for every variable <inline-formula id="inf23">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with missing values (<inline-formula id="inf24">
<mml:math id="m37">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>), the data is partitioned into four subsets: 1. the observed values of <inline-formula id="inf25">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, denoted <inline-formula id="inf26">
<mml:math id="m39">
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>; 2. the missing values of <inline-formula id="inf27">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, denoted <inline-formula id="inf28">
<mml:math id="m41">
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>; 3. the corresponding observed values of the remaining variables, <inline-formula id="inf29">
<mml:math id="m42">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>; and 4. the corresponding missing values of the remaining variables, <inline-formula id="inf30">
<mml:math id="m43">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. A Random Forest model <inline-formula id="inf31">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#xb7;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is then trained on the observed data pairs <inline-formula id="inf32">
<mml:math id="m45">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, obtaining an estimator <inline-formula id="inf33">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>f</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. This trained model is used subsequently to make predictions of the missing values as <inline-formula id="inf34">
<mml:math id="m47">
<mml:mrow>
<mml:msubsup>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>f</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the imputed entries are updated accordingly in matrix <inline-formula id="inf35">
<mml:math id="m48">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The process is repeated until the convergence criterion is met for all variables.</p>
</sec>
<sec id="s4-2-5">
<label>4.2.5</label>
<title>Predictive mean matching (PMM)</title>
<p>Predictive Mean Matching (PMM) is a commonly used statistical method for imputing missing data, particularly in the context of multiple imputation procedures. The approach was first proposed by Donald B. Rubin and R. J. A. Little in the late 1980s (<xref ref-type="bibr" rid="B37">Sugden and Rubin, 1988</xref>; <xref ref-type="bibr" rid="B25">Little, 1988</xref>). PMM is valued for its ability to produce plausible imputed values through the use of available data, thereby maintaining the integrity of the original data. PMM works by fitting regression models <inline-formula id="inf36">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mover accent="true">
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, the predicted value for those with <inline-formula id="inf37">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> missing and <inline-formula id="inf38">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mover accent="true">
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, the predicted value for those with <inline-formula id="inf39">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> observed. For each missing case <inline-formula id="inf40">
<mml:math id="m53">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, an absolute distance metric <inline-formula id="inf41">
<mml:math id="m54">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is computed. One donor <inline-formula id="inf42">
<mml:math id="m55">
<mml:mrow>
<mml:msubsup>
<mml:mi>Y</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>&#x2a;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is randomly selected from a set of <inline-formula id="inf43">
<mml:math id="m56">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="italic">argmin</mml:mi>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>Y</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and is used to fill in the missing value. The process is repeated until the predetermined number of iterations is reached. PMM is implemented in the mice package in R, where it serves as the default imputation method for continuous variables (<xref ref-type="bibr" rid="B8">Buuren and Groothuis-Oudshoorn, 2011</xref>).</p>
</sec>
<sec id="s4-2-6">
<label>4.2.6</label>
<title>Results and discussion</title>
<p>This section discusses the empirical results across cities, interprets the implications of RMSE and MAE variations, and highlights the conditions under which specific algorithms, particularly missForest and KNN, exhibit superior robustness and predictive fidelity.</p>
<p>
<xref ref-type="table" rid="T2">Tables 2</xref>&#x2013;<xref ref-type="table" rid="T5">5</xref> report the performance of the five imputation methods, namely, Bayesian Regression (BR), K-Nearest Neighbors (KNN), missForest, Predictive Mean Matching (PMM), and Random Forest (RF), applied to the PM2.5 data of the four major cities studied under varying missingness mechanisms (MCAR, MAR, MNAR) and missingness rates of 5%&#x2013;25%. Performance was evaluated using Root Mean Square Error (RMSE) and Mean Absolute Error (MAE) as accuracy metrics, for which lower values indicate superior imputation accuracy.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Imputation performance of BR, KNN, missForest, PMM, and RF under MCAR, MAR, and MNAR mechanisms for PM2.5 data from Islamabad (May 2019 &#x2013; December 2024).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="7" align="center">Islamabad</th>
</tr>
<tr>
<th rowspan="2" align="center">Method</th>
<th colspan="3" align="center">RMSE</th>
<th colspan="3" align="center">MAE</th>
</tr>
<tr>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="7" align="center">5% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">6.9459</td>
<td align="center">4.4185</td>
<td align="center">2.9759</td>
<td align="center">0.7660</td>
<td align="center">0.3688</td>
<td align="center">0.2541</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">4.7392</td>
<td align="center">4.2329</td>
<td align="center">2.8011</td>
<td align="center">0.7349</td>
<td align="center">0.3810</td>
<td align="center">0.2505</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">4.2914</td>
<td align="center">3.8774</td>
<td align="center">2.6892</td>
<td align="center">1.0311</td>
<td align="center">0.3355</td>
<td align="center">0.2461</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">6.0434</td>
<td align="center">5.6554</td>
<td align="center">4.2209</td>
<td align="center">0.9765</td>
<td align="center">0.5217</td>
<td align="center">0.3941</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">6.2198</td>
<td align="center">3.8022</td>
<td align="center">3.5838</td>
<td align="center">1.1698</td>
<td align="center">0.3547</td>
<td align="center">0.2940</td>
</tr>
<tr>
<td colspan="7" align="center">10% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">10.5227</td>
<td align="center">5.4818</td>
<td align="center">5.9691</td>
<td align="center">2.6693</td>
<td align="center">0.6393</td>
<td align="center">0.8236</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">7.8141</td>
<td align="center">4.7298</td>
<td align="center">4.7332</td>
<td align="center">1.9206</td>
<td align="center">0.5487</td>
<td align="center">0.5507</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">7.3325</td>
<td align="center">4.2569</td>
<td align="center">3.5428</td>
<td align="center">1.8384</td>
<td align="center">0.4721</td>
<td align="center">0.4793</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">10.9324</td>
<td align="center">5.7756</td>
<td align="center">5.7082</td>
<td align="center">2.6225</td>
<td align="center">0.6756</td>
<td align="center">0.7391</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">10.1362</td>
<td align="center">7.1249</td>
<td align="center">7.6545</td>
<td align="center">2.4417</td>
<td align="center">0.8247</td>
<td align="center">0.9434</td>
</tr>
<tr>
<td colspan="7" align="center">15% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">13.1217</td>
<td align="center">8.8642</td>
<td align="center">7.3151</td>
<td align="center">4.0718</td>
<td align="center">1.2294</td>
<td align="center">1.0944</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">10.8521</td>
<td align="center">7.5404</td>
<td align="center">6.2323</td>
<td align="center">3.0283</td>
<td align="center">1.0380</td>
<td align="center">0.9641</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">10.1198</td>
<td align="center">7.0496</td>
<td align="center">5.3759</td>
<td align="center">2.7971</td>
<td align="center">0.9837</td>
<td align="center">0.8639</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">14.1481</td>
<td align="center">9.6377</td>
<td align="center">8.3045</td>
<td align="center">4.0973</td>
<td align="center">1.4086</td>
<td align="center">1.2298</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">13.3478</td>
<td align="center">8.0975</td>
<td align="center">8.6110</td>
<td align="center">3.7191</td>
<td align="center">1.1836</td>
<td align="center">1.2077</td>
</tr>
<tr>
<td colspan="7" align="center">20% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">16.2800</td>
<td align="center">7.8382</td>
<td align="center">10.2836</td>
<td align="center">5.4275</td>
<td align="center">1.4104</td>
<td align="center">1.8043</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">13.0344</td>
<td align="center">6.5182</td>
<td align="center">7.7775</td>
<td align="center">4.3299</td>
<td align="center">1.1028</td>
<td align="center">1.3368</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">12.1552</td>
<td align="center">6.0432</td>
<td align="center">7.6423</td>
<td align="center">3.9848</td>
<td align="center">1.0232</td>
<td align="center">1.2814</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">17.0911</td>
<td align="center">7.5663</td>
<td align="center">10.7591</td>
<td align="center">5.7978</td>
<td align="center">1.3554</td>
<td align="center">1.8668</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">16.2007</td>
<td align="center">7.7744</td>
<td align="center">9.0320</td>
<td align="center">5.2650</td>
<td align="center">1.3368</td>
<td align="center">1.5431</td>
</tr>
<tr>
<td colspan="7" align="center">25% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">18.3932</td>
<td align="center">9.7033</td>
<td align="center">10.4893</td>
<td align="center">7.0511</td>
<td align="center">1.9052</td>
<td align="center">2.0177</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">15.5873</td>
<td align="center">8.5390</td>
<td align="center">7.6341</td>
<td align="center">5.7405</td>
<td align="center">1.6798</td>
<td align="center">1.4141</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">14.5335</td>
<td align="center">7.1138</td>
<td align="center">8.7437</td>
<td align="center">5.2957</td>
<td align="center">1.4006</td>
<td align="center">1.6884</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">18.5346</td>
<td align="center">9.4831</td>
<td align="center">12.5926</td>
<td align="center">6.9013</td>
<td align="center">1.9476</td>
<td align="center">2.3768</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">18.5208</td>
<td align="center">10.2792</td>
<td align="center">11.4883</td>
<td align="center">6.8847</td>
<td align="center">2.0048</td>
<td align="center">2.1470</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Imputation performance of BR, KNN, missForest, PMM, and RF under MCAR, MAR, and MNAR mechanisms for PM2.5 data from Karachi (May 2019 &#x2013; December 2024).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="7" align="center">Karachi</th>
</tr>
<tr>
<th rowspan="2" align="center">Method</th>
<th colspan="3" align="center">RMSE</th>
<th colspan="3" align="center">MAE</th>
</tr>
<tr>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="7" align="center">5% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">11.3127</td>
<td align="center">7.0097</td>
<td align="center">5.3521</td>
<td align="center">1.9791</td>
<td align="center">0.6712</td>
<td align="center">0.4623</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">7.7584</td>
<td align="center">4.4153</td>
<td align="center">5.8571</td>
<td align="center">1.2802</td>
<td align="center">0.3699</td>
<td align="center">0.5059</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">6.8086</td>
<td align="center">4.8447</td>
<td align="center">4.8891</td>
<td align="center">1.1308</td>
<td align="center">0.4232</td>
<td align="center">0.4030</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">10.1285</td>
<td align="center">7.7706</td>
<td align="center">5.7794</td>
<td align="center">1.7081</td>
<td align="center">0.7067</td>
<td align="center">0.4790</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">7.7644</td>
<td align="center">6.0794</td>
<td align="center">6.9224</td>
<td align="center">1.3154</td>
<td align="center">0.5128</td>
<td align="center">0.5638</td>
</tr>
<tr>
<td colspan="7" align="center">10% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">16.0114</td>
<td align="center">7.2181</td>
<td align="center">8.1367</td>
<td align="center">4.1237</td>
<td align="center">0.8513</td>
<td align="center">1.0355</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">12.3929</td>
<td align="center">6.9778</td>
<td align="center">5.8515</td>
<td align="center">2.9586</td>
<td align="center">0.8647</td>
<td align="center">0.7329</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">10.3800</td>
<td align="center">6.4588</td>
<td align="center">6.1914</td>
<td align="center">2.4900</td>
<td align="center">0.7603</td>
<td align="center">0.7656</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">13.9272</td>
<td align="center">9.9383</td>
<td align="center">7.4271</td>
<td align="center">3.1946</td>
<td align="center">1.2422</td>
<td align="center">0.9096</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">14.6627</td>
<td align="center">7.4785</td>
<td align="center">8.9750</td>
<td align="center">3.6184</td>
<td align="center">0.8834</td>
<td align="center">1.0614</td>
</tr>
<tr>
<td colspan="7" align="center">15% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">17.9959</td>
<td align="center">9.5874</td>
<td align="center">11.3960</td>
<td align="center">5.3865</td>
<td align="center">1.4883</td>
<td align="center">1.8134</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">14.1040</td>
<td align="center">9.6130</td>
<td align="center">9.7382</td>
<td align="center">4.0559</td>
<td align="center">1.4983</td>
<td align="center">1.5480</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">13.6361</td>
<td align="center">8.7902</td>
<td align="center">8.8851</td>
<td align="center">4.0320</td>
<td align="center">1.3439</td>
<td align="center">1.3593</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">16.6428</td>
<td align="center">11.1686</td>
<td align="center">11.4660</td>
<td align="center">4.8820</td>
<td align="center">1.7239</td>
<td align="center">1.6501</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">18.2083</td>
<td align="center">10.1163</td>
<td align="center">12.5136</td>
<td align="center">5.3975</td>
<td align="center">1.5383</td>
<td align="center">1.8910</td>
</tr>
<tr>
<td colspan="7" align="center">20% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">20.9656</td>
<td align="center">12.5430</td>
<td align="center">14.9225</td>
<td align="center">7.5508</td>
<td align="center">2.2805</td>
<td align="center">2.7069</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">16.2445</td>
<td align="center">10.5427</td>
<td align="center">11.9095</td>
<td align="center">5.1774</td>
<td align="center">1.9703</td>
<td align="center">2.1132</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">15.0468</td>
<td align="center">9.3889</td>
<td align="center">11.1508</td>
<td align="center">5.0122</td>
<td align="center">1.7235</td>
<td align="center">1.9517</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">20.2340</td>
<td align="center">12.2224</td>
<td align="center">13.3238</td>
<td align="center">6.8903</td>
<td align="center">2.1415</td>
<td align="center">2.4389</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">19.6015</td>
<td align="center">10.5827</td>
<td align="center">13.0786</td>
<td align="center">6.5335</td>
<td align="center">1.8516</td>
<td align="center">2.3658</td>
</tr>
<tr>
<td colspan="7" align="center">25% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">23.5024</td>
<td align="center">14.2354</td>
<td align="center">13.9488</td>
<td align="center">8.9869</td>
<td align="center">3.0380</td>
<td align="center">2.8647</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">18.9021</td>
<td align="center">13.2710</td>
<td align="center">13.0453</td>
<td align="center">7.1463</td>
<td align="center">2.7564</td>
<td align="center">2.6522</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">17.6292</td>
<td align="center">12.2496</td>
<td align="center">11.9908</td>
<td align="center">6.6553</td>
<td align="center">2.4354</td>
<td align="center">2.4850</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">24.5560</td>
<td align="center">14.6149</td>
<td align="center">14.9728</td>
<td align="center">9.3561</td>
<td align="center">2.9731</td>
<td align="center">3.1201</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">22.1069</td>
<td align="center">15.1012</td>
<td align="center">14.1526</td>
<td align="center">8.5100</td>
<td align="center">2.9331</td>
<td align="center">2.8309</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Imputation performance of BR, KNN, missForest, PMM, and RF under MCAR, MAR, and MNAR mechanisms for PM2.5 data from Lahore (May 2019 &#x2013; December 2024).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="7" align="center">Lahore</th>
</tr>
<tr>
<th rowspan="2" align="center">Method</th>
<th colspan="3" align="center">RMSE</th>
<th colspan="3" align="center">MAE</th>
</tr>
<tr>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="7" align="center">5% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">18.5552</td>
<td align="center">9.0194</td>
<td align="center">10.0925</td>
<td align="center">3.1451</td>
<td align="center">0.7154</td>
<td align="center">0.9324</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">11.2296</td>
<td align="center">4.6633</td>
<td align="center">8.6900</td>
<td align="center">1.7757</td>
<td align="center">0.4369</td>
<td align="center">0.8199</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">11.3943</td>
<td align="center">4.1519</td>
<td align="center">8.6085</td>
<td align="center">1.7225</td>
<td align="center">0.3594</td>
<td align="center">0.7884</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">18.9745</td>
<td align="center">10.6164</td>
<td align="center">11.9413</td>
<td align="center">2.8399</td>
<td align="center">0.7909</td>
<td align="center">1.0738</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">14.0462</td>
<td align="center">7.4573</td>
<td align="center">11.6312</td>
<td align="center">2.3533</td>
<td align="center">0.6963</td>
<td align="center">1.0538</td>
</tr>
<tr>
<td colspan="7" align="center">10% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">28.1091</td>
<td align="center">17.7309</td>
<td align="center">12.6096</td>
<td align="center">6.8083</td>
<td align="center">1.9197</td>
<td align="center">1.5397</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">20.3444</td>
<td align="center">12.4769</td>
<td align="center">10.3735</td>
<td align="center">4.5162</td>
<td align="center">1.2698</td>
<td align="center">1.2105</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">18.9415</td>
<td align="center">12.8041</td>
<td align="center">10.2677</td>
<td align="center">4.0208</td>
<td align="center">1.2937</td>
<td align="center">1.2859</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">27.1513</td>
<td align="center">15.0401</td>
<td align="center">15.3854</td>
<td align="center">5.8999</td>
<td align="center">1.6763</td>
<td align="center">1.7088</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">27.1839</td>
<td align="center">18.7879</td>
<td align="center">16.2533</td>
<td align="center">5.9986</td>
<td align="center">1.9655</td>
<td align="center">2.0407</td>
</tr>
<tr>
<td colspan="7" align="center">15% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">32.1213</td>
<td align="center">18.3188</td>
<td align="center">24.4118</td>
<td align="center">9.6584</td>
<td align="center">2.5032</td>
<td align="center">3.5942</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">27.0447</td>
<td align="center">17.9144</td>
<td align="center">18.0595</td>
<td align="center">7.2643</td>
<td align="center">2.0849</td>
<td align="center">2.6253</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">25.6894</td>
<td align="center">15.8219</td>
<td align="center">18.0701</td>
<td align="center">7.0947</td>
<td align="center">1.8859</td>
<td align="center">2.5703</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">31.1574</td>
<td align="center">16.1361</td>
<td align="center">23.9724</td>
<td align="center">8.4679</td>
<td align="center">2.0428</td>
<td align="center">3.4741</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">32.3940</td>
<td align="center">22.8152</td>
<td align="center">22.9764</td>
<td align="center">8.9641</td>
<td align="center">3.0545</td>
<td align="center">3.4824</td>
</tr>
<tr>
<td colspan="7" align="center">20% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">36.0795</td>
<td align="center">18.4557</td>
<td align="center">24.2248</td>
<td align="center">12.5334</td>
<td align="center">2.9019</td>
<td align="center">4.2682</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">26.6868</td>
<td align="center">13.8071</td>
<td align="center">18.1267</td>
<td align="center">8.8261</td>
<td align="center">2.0683</td>
<td align="center">3.2905</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">24.8515</td>
<td align="center">16.3134</td>
<td align="center">18.7313</td>
<td align="center">8.2427</td>
<td align="center">2.3614</td>
<td align="center">3.3073</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">32.3884</td>
<td align="center">18.0584</td>
<td align="center">26.7171</td>
<td align="center">10.9255</td>
<td align="center">2.7888</td>
<td align="center">4.8247</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">34.7324</td>
<td align="center">22.2056</td>
<td align="center">26.2488</td>
<td align="center">11.4934</td>
<td align="center">3.1077</td>
<td align="center">4.5114</td>
</tr>
<tr>
<td colspan="7" align="center">25% missingness rate</td>
</tr>
<tr>
<td align="center">BR</td>
<td align="center">42.1727</td>
<td align="center">18.1403</td>
<td align="center">27.4497</td>
<td align="center">16.3832</td>
<td align="center">2.5914</td>
<td align="center">5.5036</td>
</tr>
<tr>
<td align="center">KNN</td>
<td align="center">32.9418</td>
<td align="center">15.2492</td>
<td align="center">20.8084</td>
<td align="center">11.5562</td>
<td align="center">2.4167</td>
<td align="center">3.9593</td>
</tr>
<tr>
<td align="center">missForest</td>
<td align="center">33.6900</td>
<td align="center">13.7558</td>
<td align="center">22.0600</td>
<td align="center">11.7553</td>
<td align="center">3.6135</td>
<td align="center">4.2723</td>
</tr>
<tr>
<td align="center">PMM</td>
<td align="center">43.1453</td>
<td align="center">21.8732</td>
<td align="center">30.3646</td>
<td align="center">15.3009</td>
<td align="center">3.5583</td>
<td align="center">5.9593</td>
</tr>
<tr>
<td align="center">RF</td>
<td align="center">44.0573</td>
<td align="center">23.1512</td>
<td align="center">30.6094</td>
<td align="center">15.6708</td>
<td align="center">3.0048</td>
<td align="center">5.6211</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Imputation performance of BR, KNN, missForest, PMM, and RF under MCAR, MAR, and MNAR mechanisms for PM2.5 data from Peshawar (May 2019 &#x2013; December 2024).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="7" align="center">Peshawar</th>
</tr>
<tr>
<th rowspan="2" align="center">Method</th>
<th colspan="3" align="center">RMSE</th>
<th colspan="3" align="center">MAE</th>
</tr>
<tr>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
<th align="center">MCAR</th>
<th align="center">MAR</th>
<th align="center">MNAR</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="7" align="center">5% missingness rate</td>
</tr>
<tr>
<td align="left">BR</td>
<td align="center">11.2081</td>
<td align="center">7.6705</td>
<td align="center">8.0021</td>
<td align="center">1.9966</td>
<td align="center">0.5647</td>
<td align="center">0.7226</td>
</tr>
<tr>
<td align="left">KNN</td>
<td align="center">7.9243</td>
<td align="center">5.9647</td>
<td align="center">5.9912</td>
<td align="center">1.1739</td>
<td align="center">0.4334</td>
<td align="center">0.5528</td>
</tr>
<tr>
<td align="left">missForest</td>
<td align="center">6.9446</td>
<td align="center">5.4055</td>
<td align="center">5.3786</td>
<td align="center">1.1072</td>
<td align="center">0.4185</td>
<td align="center">0.5155</td>
</tr>
<tr>
<td align="left">PMM</td>
<td align="center">12.1558</td>
<td align="center">9.0391</td>
<td align="center">6.7441</td>
<td align="center">2.0656</td>
<td align="center">0.7315</td>
<td align="center">0.5321</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="center">11.4600</td>
<td align="center">7.3479</td>
<td align="center">8.2715</td>
<td align="center">1.7088</td>
<td align="center">0.5576</td>
<td align="center">0.6915</td>
</tr>
<tr>
<td colspan="7" align="center">10% missingness rate</td>
</tr>
<tr>
<td align="left">BR</td>
<td align="center">16.4352</td>
<td align="center">11.3951</td>
<td align="center">9.4832</td>
<td align="center">3.9666</td>
<td align="center">1.1088</td>
<td align="center">1.0723</td>
</tr>
<tr>
<td align="left">KNN</td>
<td align="center">13.2767</td>
<td align="center">9.4284</td>
<td align="center">9.5037</td>
<td align="center">2.8502</td>
<td align="center">0.8854</td>
<td align="center">1.0373</td>
</tr>
<tr>
<td align="left">missForest</td>
<td align="center">13.1888</td>
<td align="center">8.4275</td>
<td align="center">9.9840</td>
<td align="center">2.7981</td>
<td align="center">0.7615</td>
<td align="center">1.2020</td>
</tr>
<tr>
<td align="left">PMM</td>
<td align="center">16.4851</td>
<td align="center">13.6884</td>
<td align="center">14.0755</td>
<td align="center">3.6508</td>
<td align="center">1.2733</td>
<td align="center">1.5652</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="center">19.9262</td>
<td align="center">10.7153</td>
<td align="center">10.2943</td>
<td align="center">3.7840</td>
<td align="center">1.0455</td>
<td align="center">1.2402</td>
</tr>
<tr>
<td colspan="7" align="center">15% missingness rate</td>
</tr>
<tr>
<td align="left">BR</td>
<td align="center">22.9630</td>
<td align="center">10.6287</td>
<td align="center">11.9480</td>
<td align="center">6.8402</td>
<td align="center">1.3216</td>
<td align="center">1.8298</td>
</tr>
<tr>
<td align="left">KNN</td>
<td align="center">15.6158</td>
<td align="center">8.6463</td>
<td align="center">10.2389</td>
<td align="center">4.4507</td>
<td align="center">1.1215</td>
<td align="center">1.5141</td>
</tr>
<tr>
<td align="left">missForest</td>
<td align="center">15.1271</td>
<td align="center">7.6521</td>
<td align="center">9.4985</td>
<td align="center">4.1924</td>
<td align="center">0.9912</td>
<td align="center">1.4417</td>
</tr>
<tr>
<td align="left">PMM</td>
<td align="center">19.6347</td>
<td align="center">13.6666</td>
<td align="center">15.2880</td>
<td align="center">5.6253</td>
<td align="center">1.7129</td>
<td align="center">2.0952</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="center">21.6763</td>
<td align="center">13.5531</td>
<td align="center">14.0440</td>
<td align="center">5.3575</td>
<td align="center">1.5342</td>
<td align="center">1.8958</td>
</tr>
<tr>
<td colspan="7" align="center">20% missingness rate</td>
</tr>
<tr>
<td align="left">BR</td>
<td align="center">22.9622</td>
<td align="center">13.5042</td>
<td align="center">17.1455</td>
<td align="center">7.8244</td>
<td align="center">2.0311</td>
<td align="center">2.8959</td>
</tr>
<tr>
<td align="left">KNN</td>
<td align="center">17.0087</td>
<td align="center">10.3588</td>
<td align="center">16.6286</td>
<td align="center">5.5155</td>
<td align="center">1.4831</td>
<td align="center">2.6625</td>
</tr>
<tr>
<td align="left">missForest</td>
<td align="center">18.2668</td>
<td align="center">9.8270</td>
<td align="center">15.3086</td>
<td align="center">5.5439</td>
<td align="center">1.3722</td>
<td align="center">2.4490</td>
</tr>
<tr>
<td align="left">PMM</td>
<td align="center">20.7053</td>
<td align="center">12.4517</td>
<td align="center">18.1341</td>
<td align="center">6.8792</td>
<td align="center">1.7219</td>
<td align="center">3.0952</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="center">21.2274</td>
<td align="center">15.5859</td>
<td align="center">17.4297</td>
<td align="center">6.6266</td>
<td align="center">2.2533</td>
<td align="center">2.8820</td>
</tr>
<tr>
<td colspan="7" align="center">25% missingness rate</td>
</tr>
<tr>
<td align="left">BR</td>
<td align="center">26.9482</td>
<td align="center">15.0989</td>
<td align="center">22.1357</td>
<td align="center">10.2556</td>
<td align="center">2.3193</td>
<td align="center">4.0627</td>
</tr>
<tr>
<td align="left">KNN</td>
<td align="center">21.7375</td>
<td align="center">15.9218</td>
<td align="center">19.2263</td>
<td align="center">7.5155</td>
<td align="center">2.2070</td>
<td align="center">3.2616</td>
</tr>
<tr>
<td align="left">missForest</td>
<td align="center">20.3368</td>
<td align="center">13.2474</td>
<td align="center">19.7537</td>
<td align="center">7.0825</td>
<td align="center">1.8272</td>
<td align="center">3.3543</td>
</tr>
<tr>
<td align="left">PMM</td>
<td align="center">29.8099</td>
<td align="center">15.6488</td>
<td align="center">22.3431</td>
<td align="center">10.5790</td>
<td align="center">2.3623</td>
<td align="center">3.9924</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="center">29.0618</td>
<td align="center">15.5781</td>
<td align="center">24.7987</td>
<td align="center">9.8489</td>
<td align="center">2.4658</td>
<td align="center">4.1201</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Consistently, both RMSE and MAE increased monotonically with the missingness rate across all cities and methods, reflecting the deteriorating reliability of imputations. Errors were generally lowest under MAR, higher under MNAR, and highest under MCAR for each method. Of all methods employed, missForest was the most consistent and reliable across the cities, followed by KNN. In contrast, BR and PMM had low robustness, and RF had intermediate performance and instability at higher missingness levels.</p>
<p>The results for Islamabad demonstrated the lowest overall error magnitudes across all methods and mechanisms. At low missingness (5%&#x2013;10%), missForest achieved the best performance (RMSE &#x2248; 2.7&#x2013;7.3, MAE &#x2248; 0.24&#x2013;1.83). KNN also performed well, particularly under MAR and MNAR, though not quite as accurately as missForest.</p>
<p>As missingness increased to 20%&#x2013;25%, errors rose sharply for all methods, but missForest and KNN remained comparatively stable while BR, PMM, and RF deteriorated substantially, suggesting that the Islamabad dataset is less prone to extreme variation, making it relatively easier to impute.</p>
<p>Karachi displayed higher error levels than Islamabad across all rates, particularly under BR and PMM. At low missingness (5%), missForest again produced the most accurate imputations (RMSE &#x3d; 4.89&#x2013;6.81; MAE &#x3d; 0.40&#x2013;1.13), closely followed by KNN (RMSE &#x3d; 4.42&#x2013;7.76; MAE &#x3d; 0.37&#x2013;1.28), while RF performed moderately well under MCAR, and BR and PMM lagged with larger errors. At 10%&#x2013;15% missingness, overall errors increased, although missForest still outperformed its counterparts and KNN did reasonably well. In contrast, RF, BR, and PMM deteriorated much more notably, particularly under MCAR. At higher missingness rates (20%&#x2013;25%), missForest and KNN continued to provide relatively robust imputations, while BR, PMM, and RF declined sharply, with RMSE values above 20 and MAE values exceeding 2.5.</p>
<p>Lahore consistently demonstrates the greatest imputation challenge, with markedly higher error magnitudes across all methods even at low missingness rates (RMSE &#x3e; 10 for several approaches) under MCAR and severe deterioration at 10%&#x2013;25%, where even missForest and KNN record RMSE values above 30 and MAE exceeding 3.0; these results underscore the city&#x2019;s volatile and irregular PM2.5 dynamics. Peshawar exhibits moderate levels of difficulty. Under all three mechanisms and at the 5% missingness rate, missForest and KNN again outperform other approaches, yielding the lowest errors. Conversely, BR, RF, and PMM demonstrate markedly poorer performance, which is in contrast to the study conducted by <xref ref-type="bibr" rid="B10">Choudhary et al. (2023)</xref>, who stated that RF is a more effective model for PM2.5 prediction and is superior to other data mining algorithms.</p>
<p>Mechanism-specific comparisons confirm that MAR consistently yields the lowest errors, whereas MCAR produces the highest errors across cities, reflecting the difficulty of imputing values tied to unobserved processes. Overall, the cross-city analysis highlights that Islamabad is the least challenging environment for imputation of air quality data, Peshawar and Karachi present moderate difficulty, and Lahore poses the most severe challenge, while across all locations, non-parametric ensemble and neighbor-based methods (missForest and KNN) are far more effective approaches for recovering incomplete PM2.5 time series. These findings corroborate previous research demonstrating the effectiveness of different imputation approaches for environmental data with high rates of missing values. For instance, the Kuwait air-quality data analysis by <xref ref-type="bibr" rid="B3">Alsaber et al. (2021)</xref> and <xref ref-type="bibr" rid="B34">Ritthewa and Samart (2024)</xref> stated that missForest demonstrated the lowest RMSE and MAE in various conditions of missing data. Similarly, <xref ref-type="bibr" rid="B41">Umar and Gray (2023)</xref>, in a different environmental context, also found that missForest and KNN performed strongly in a comparison of imputation approaches using RMSE and MAE for the evaluation of imputed time series water level data. A comparative summary (<xref ref-type="table" rid="T6">Table 6</xref>) concisely compares the performance of the various imputation methods.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Concise summary of multiple imputation approaches.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Methods</th>
<th align="left">RMSE/MAE accuracy</th>
<th align="left">Stability as missingness rate increases</th>
<th align="left">Sensitivity to missing mechanism</th>
<th align="left">Performance across locations</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">BR</td>
<td align="left">Moderate to higher errors; competitive with RF and PMM in a number of cases</td>
<td align="left">Stability declines at higher missingness</td>
<td align="left">Highly sensitive to MCAR</td>
<td align="left">Performance varies across all locations</td>
</tr>
<tr>
<td align="left">KNN</td>
<td align="left">Comparable to missForest in some cases</td>
<td align="left">Stability is reasonable</td>
<td align="left">Robust in some cases under MCAR, MNAR and MAR</td>
<td align="left">Good performance across all locations; in some cases even better results than missForest</td>
</tr>
<tr>
<td align="left">missForest</td>
<td align="left">Lowest/near lowest errors in most cases</td>
<td align="left">Higher stability</td>
<td align="left">Mostly robust across MCAR, MNAR and MAR</td>
<td align="left">Mostly strong and consistent across all locations</td>
</tr>
<tr>
<td align="left">PMM</td>
<td align="left">Moderate to high errors; overlaps with BR and RF</td>
<td align="left">Lower stability</td>
<td align="left">Sensitive to MNAR and MCAR</td>
<td align="left">Performance varies across all locations</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="left">Moderate accuracy and is competitive with BR and PMM depending on situation</td>
<td align="left">Lower stability</td>
<td align="left">Highly sensitive to MCAR, MNAR and MAR</td>
<td align="left">Performance varies across all locations</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To evaluate the reliability of different imputation methods, missing values were imputed into the original datasets and then the alignment with observed PM2.5 concentrations was assessed. <xref ref-type="fig" rid="F6">Figure 6</xref> shows that gaps in missing data were effectively filled in a manner consistent with historical patterns, indicating that advanced approaches can provide robust estimates. Across Islamabad, Karachi, Lahore, and Peshawar (2019&#x2013;2024), all methods preserved seasonal cycles and winter peaks, but their ability to capture variability differed, with KNN and missForest performing well by closely tracking observed fluctuations. Similarly, the density distributions of observed versus imputed values of PM2.5 for the different methods of imputation, shown in <xref ref-type="fig" rid="F9">Figure 9</xref>, illustrate that the values from all approaches are in broad correspondence with the observed data, validating their capabilities for estimating the missing values of PM2.5 concentration. Based on both visual inspection and error metrics (RMSE and MAE; <xref ref-type="table" rid="T2">Tables 2</xref>&#x2013;<xref ref-type="table" rid="T5">5</xref> and <xref ref-type="fig" rid="F5">Figures 5</xref>, <xref ref-type="fig" rid="F8">8</xref>), missForest emerged as the most accurate method, slightly more accurate than KNN, making missForest the most suitable technique for imputing daily PM2.5 concentrations in these environmental time series.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>MAE of imputation methods (BR, KNN, missForest, PMM, RF) for PM2.5 across four cities under MAR, MCAR, and MNAR mechanisms.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g005.tif">
<alt-text content-type="machine-generated">Line chart grid comparing mean absolute error (MAE) for Bayesian Regression, KNN, MissForest, PMM, and RandomForest imputation methods across missing rates from 5% to 25% in Islamabad, Karachi, Lahore, and Peshawar under MAR, MCAR, and MNAR conditions. All methods show increasing MAE with higher missing rates, with variation by city and method.</alt-text>
</graphic>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Daily concentration of PM2.5 values after estimating missing values by different imputation approaches.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g006.tif">
<alt-text content-type="machine-generated">Line charts display PM2.5 concentrations from 2020 to 2024 for Islamabad, Karachi, Lahore, and Peshawar, comparing five imputation methods (BR, KNN, MissForest, PMM, Random Forest) with observed versus imputed data.</alt-text>
</graphic>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Workflow of the PM2.5 data imputation process.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g007.tif">
<alt-text content-type="machine-generated">Flowchart illustrating the process of imputing missing PM2.5 data, starting from raw data and splitting into complete and incomplete cases, creating artificial missing rates, considering missing mechanisms, applying five imputation methods (BR, KNN, missForest, PMM, RF), evaluating results, and selecting the best imputed data set.</alt-text>
</graphic>
</fig>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>RMSE of imputation methods (BR, KNN, missForest, PMM, RF) for PM2.5 across four cities under MAR, MCAR, and MNAR mechanisms.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g008.tif">
<alt-text content-type="machine-generated">Line graph grid compares Root Mean Squared Error (RMSE) versus missing rate for five imputation methods across Islamabad, Karachi, Lahore, and Peshawar under MAR, MCAR, and MNAR missingness; MissForest and KNN generally have lower RMSE.</alt-text>
</graphic>
</fig>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Density plots of observed versus imputed values by different imputation approaches for the datasets from the four cities.</p>
</caption>
<graphic xlink:href="fenvs-14-1775982-g009.tif">
<alt-text content-type="machine-generated">Four-panel density plot compares observed and imputed PM2.5 concentration distributions in Islamabad, Karachi, Lahore, and Peshawar. Each panel overlays observed, KNN, PMM, BR, MissForest, and Random Forest methods, with similar peak patterns indicating consistent imputation across cities.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Limitations and future work</title>
<p>Although this study provides valuable insights into the performance of various imputation techniques for PM2.5 data, certain limitations should be acknowledged. The analysis focused primarily on daily averages from four urban monitoring stations, which may not capture short-term variability or spatial heterogeneity in air quality. Additionally, only five imputation methods were evaluated, excluding emerging deep learning&#x2013;based approaches that may further enhance accuracy. Future research should extend this framework to incorporate high-frequency and multi-pollutant datasets, explore hybrid or ensemble deep learning models, and examine the influence of meteorological and socioeconomic factors to develop more adaptive and scalable imputation strategies for environmental monitoring systems.</p>
</sec>
<sec sec-type="conclusion" id="s6">
<label>6</label>
<title>Conclusion</title>
<p>The results of this study provide critical insights into the comparative performance of statistical and machine learning-based imputation techniques applied to PM2.5 air quality datasets across major Pakistani cities. By systematically evaluating five methods&#x2014;Bayesian Regression (BR), K-Nearest Neighbors (KNN), missForest, Predictive Mean Matching (PMM), and Random Forest (RF)&#x2014;under varying missingness mechanisms (MCAR, MAR, MNAR) and rates (5%&#x2013;25%), the analysis reveals distinct patterns of imputation accuracy and reliability. The findings underscore the sensitivity of model performance to both the volume and mechanism of missing data, demonstrating that improper handling of missingness can lead to substantial estimation errors and distort temporal pollution trends. The study used data from four major cities of Pakistan from May 2019 to December 2024, which allowed for an in-depth analysis of the various missing data mechanisms (MCAR, MAR, MNAR) and missing data rates (5%&#x2013;25%) considered. The results indicate that as the amount of missing data increases, the quality of the imputed data also diminishes. Moreover, data that are Missing at Random (MAR) were found to have relatively lower imputation error than data that are Missing Completely at Random (MCAR) or Missing Not at Random (MNAR) because of the inherent temporal or spatial correlations within the datasets. Among all the methods, missForest had the lowest error overall, in terms of RMSE and MAE measures, and KNN also had relatively good results. These two techniques maintained the trends, cycles, and extremes of the time series data that are necessary for the robust analysis of environmental data. On the other hand, Bayesian Regression, PMM, and Random Forest, although more stable, lacked adequate reliability, especially at higher levels of missingness.
These results confirm the importance of customizing imputation strategies for the missingness mechanism and the attributes of the data set. In particular, imputation methods have not, to our knowledge, been studied for PM2.5 data from Pakistan, and the use of missForest is recommended for imputation of missing PM2.5 data in that context.</p>
<p>In short, this study demonstrates that, of the ensemble-based (missForest) and neighbor-based (KNN) methods, missForest is the most effective machine learning approach for imputing incomplete PM2.5 data sets, performing slightly better than KNN. Accurate imputations are necessary for improving the statistical credibility of environmental assessments and for ensuring effective policy formulation for air quality management.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. All datasets used in this study are publicly available through established online air-quality and meteorological data platforms.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>MK: Methodology, Resources, Writing &#x2013; original draft, Software, Investigation, Visualization, Formal Analysis, Data curation, Validation, Conceptualization, Writing &#x2013; review and editing. JP: Project administration, Validation, Supervision, Writing &#x2013; review and editing, Visualization, Investigation. AmA: Investigation, Writing &#x2013; review and editing, Validation. AhA: Writing &#x2013; review and editing, Methodology, Software, Formal Analysis, Validation. AG: Supervision, Writing &#x2013; review and editing, Project administration, Validation.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Albano</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>La Rocca</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Perna</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>On the imputation of missing values in univariate PM10 time series</article-title>,&#x201d; in <source>Computer aided systems theory &#x2013; EUROCAST 2017, 12&#x2013;19. Lecture notes in computer science</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>).</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Aljuaid</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sasi</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Proper imputation techniques for missing values in data sets</article-title>,&#x201d; in <source>2016 international conference on data science and engineering (ICDSE)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>5</lpage>.</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alsaber</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Al-Hurban</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Handling complex missing data using random forest approach for an air quality monitoring dataset: a case study of Kuwait environmental data (2012 to 2018)</article-title>. <source>Int. J. Environ. Res. Public Health</source> <volume>18</volume> (<issue>3</issue>), <fpage>1333</fpage>. <pub-id pub-id-type="doi">10.3390/ijerph18031333</pub-id>
<pub-id pub-id-type="pmid">33540610</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Armina</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zain</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>N. A.</given-names>
</name>
<name>
<surname>Sallehuddin</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>A review on missing value estimation using imputation algorithm</article-title>. <source>J. Phys. Conf. Ser.</source> <volume>892</volume> (<issue>September</issue>), <fpage>012004</fpage>. <pub-id pub-id-type="doi">10.1088/1742-6596/892/1/012004</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arnaut</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>&#x110;ur&#x111;evi&#x107;</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kolarski</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sre&#x107;kovi&#x107;</surname>
<given-names>V. A.</given-names>
</name>
<name>
<surname>Jevremovi&#x107;</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Improving air quality data reliability through Bi-Directional univariate imputation with the random forest algorithm</article-title>. <source>Sustainability</source> <volume>16</volume> (<issue>17</issue>), <fpage>7629</fpage>. <pub-id pub-id-type="doi">10.3390/su16177629</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>A&#xdf;mann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gaasch</surname>
<given-names>J.-C.</given-names>
</name>
<name>
<surname>Stingl</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A bayesian approach towards missing covariate data in multilevel latent regression models</article-title>. <source>Psychometrika</source> <volume>88</volume> (<issue>4</issue>), <fpage>1495</fpage>&#x2013;<lpage>1528</lpage>. <pub-id pub-id-type="doi">10.1007/s11336-022-09888-0</pub-id>
<pub-id pub-id-type="pmid">36418780</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Random forests</article-title>. <source>Mach. Learn.</source> <volume>45</volume> (<issue>1</issue>), <fpage>5</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1023/a:1010933404324</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Buuren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Groothuis-Oudshoorn</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>MICE: multivariate imputation by chained equations in R</article-title>. <source>J. Stat. Softw.</source> <volume>45</volume> (<issue>December</issue>), <fpage>1</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v045.i03</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chhabra</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Comparison of imputation methods for univariate time series</article-title>. <source>Int. J. Recent Innovation Trends Comput. Commun.</source> <volume>11</volume> (<issue>2s</issue>), <fpage>286</fpage>&#x2013;<lpage>292</lpage>. <pub-id pub-id-type="doi">10.17762/ijritcc.v11i2s.6148</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choudhary</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pradhan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Sahu</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Chaudhary</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Joshi</surname>
<given-names>P. K.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Evaluating air quality and criteria pollutants prediction disparities by data mining along a stretch of urban-rural agglomeration includes coal-mine belts and thermal power plants</article-title>. <source>Front. Environ. Sci.</source> <volume>11</volume> (<issue>November</issue>), <fpage>1132159</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2023.1132159</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Darji</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Biswas</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Padul</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gill</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Kesari</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ashili</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Efficient use of binned data for imputing univariate time series data</article-title>. <source>Front. Big Data</source> <volume>7</volume>, <fpage>1422650</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2024.1422650</pub-id>
<pub-id pub-id-type="pmid">39234189</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dhammapala</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Harrington</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Lamb</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Machine learning-based ozone and PM2.5 forecasting: application to multiple AQS sites in the Pacific northwest</article-title>. <source>Front. Big Data</source> <volume>6</volume>, <fpage>1124148</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2023.1124148</pub-id>
<pub-id pub-id-type="pmid">36910164</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Haile</surname>
<given-names>T. T.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>AlNemer</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Multiscale change point detection for univariate time series data with missing value</article-title>. <source>Mathematics</source> <volume>12</volume> (<issue>20</issue>), <fpage>3189</fpage>. <pub-id pub-id-type="doi">10.3390/math12203189</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hua</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>M.-S.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>H. D.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>B. T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>The impact of data imputation on air quality prediction problem</article-title>. <source>PLoS One</source> <volume>19</volume> (<issue>9</issue>), <fpage>e0306303</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0306303</pub-id>
<pub-id pub-id-type="pmid">39264957</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>J&#xe4;ger</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Allhorn</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bie&#xdf;mann</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A benchmark for data imputation methods</article-title>. <source>Front. Big Data</source> <volume>4</volume>, <fpage>693674</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2021.693674</pub-id>
<pub-id pub-id-type="pmid">34308343</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Junninen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Niska</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tuppurainen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Ruuskanen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kolehmainen</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Methods for imputation of missing values in air quality data sets</article-title>. <source>Atmos. Environ.</source> <volume>38</volume> (<issue>18</issue>), <fpage>2895</fpage>&#x2013;<lpage>2907</lpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2004.02.026</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kleinke</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Reinecke</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Salfr&#xe1;n</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Spiess</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Applied multiple imputation: advantages, pitfalls, new developments and applications in R</source>. <series>Statistics for social and behavioral sciences</series>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer Nature</publisher-name>.</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Prakash</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Machine learning-based prediction of hazards fine PM2.5 concentrations: a case study of Delhi, India</article-title>. <source>Discov. Geosci.</source> <volume>2</volume> (<issue>1</issue>), <fpage>34</fpage>. <pub-id pub-id-type="doi">10.1007/s44288-024-00043-z</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Jahan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bag</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bhatla</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2025a</year>). <article-title>Spatio-temporal analysis of air pollution and meteorological influences in Western Uttar Pradesh using geospatial techniques: insights for policy and management</article-title>. <source>Int. J. Remote Sens.</source> <volume>00</volume> (<issue>00</issue>), <fpage>1</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1080/01431161.2025.2529601</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Rana</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Choudhary</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2025b</year>). <article-title>Spatiotemporal variability and source attribution of PM2.5/PM10 ratios: aerosol type classification and AQI evaluation across seventy monitoring stations in Delhi and Haryana, India</article-title>. <source>Phys. Chem. Earth</source> <volume>140</volume>, <fpage>104005</fpage>. <pub-id pub-id-type="doi">10.1016/j.pce.2025.104005</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Choudhary</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Joshi</surname>
<given-names>P. K.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Bhatla</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Machine learning models for estimating criteria pollutants and health risk-based air quality indices over eastern coast coal mine complex belts</article-title>. <source>Front. Environ. Sci.</source> <volume>13</volume> (<issue>May</issue>), <fpage>1589991</fpage>. <pub-id pub-id-type="doi">10.3389/fenvs.2025.1589991</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S. X.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>PM<sub>2.5</sub> data reliability, consistency, and air quality assessment in five Chinese cities: CONSISTENCY IN CHINA&#x2019;S PM<sub>2.5</sub> DATA</article-title>. <source>J. Geophys. Res. Atmos.</source> <volume>121</volume> (<issue>17</issue>), <fpage>10220</fpage>&#x2013;<lpage>10236</lpage>. <pub-id pub-id-type="doi">10.1002/2016jd024877</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Libasin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ul-Saufie</surname>
<given-names>A. Z.</given-names>
</name>
<name>
<surname>Ahmat</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shaziayani</surname>
<given-names>W. N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Single and multiple imputation method to replace missing values in air pollution datasets: a review</article-title>. <source>IOP Conf. Ser. Earth Environ. Sci.</source> <volume>616</volume> (<issue>1</issue>), <fpage>012002</fpage>. <pub-id pub-id-type="doi">10.1088/1755-1315/616/1/012002</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lien</surname>
<given-names>P. L.</given-names>
</name>
<name>
<surname>Do</surname>
<given-names>T. T.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Data imputation for multivariate time-series data</article-title>,&#x201d; in <source>2023 15th international conference on knowledge and systems engineering (KSE)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Little</surname>
<given-names>R. J. A.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>Missing-data adjustments in large surveys</article-title>. <source>J. Bus. Econ. Stat.</source> <volume>6</volume> (<issue>3</issue>), <fpage>287</fpage>&#x2013;<lpage>296</lpage>. <pub-id pub-id-type="doi">10.1080/07350015.1988.10509663</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mendes</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Monjardino</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ferreira</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Air quality forecast by statistical methods: application to Portugal and Macao</article-title>. <source>Front. Big Data</source> <volume>5</volume>, <fpage>826517</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2022.826517</pub-id>
<pub-id pub-id-type="pmid">35360510</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mezoue</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Cedric Ngangmo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Choudhary</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Monkam</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Measurement of fine particle concentrations and estimation of air quality index (AQI) over northeast Douala, Cameroon</article-title>. <source>Environ. Monit. Assess.</source> <volume>195</volume> (<issue>8</issue>), <fpage>965</fpage>. <pub-id pub-id-type="doi">10.1007/s10661-023-11582-2</pub-id>
<pub-id pub-id-type="pmid">37462835</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moritz</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sard&#xe1;</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bartz-Beielstein</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zaefferer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Stork</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Comparison of different methods for univariate time series imputation in R</article-title>. <source>arXiv preprint arXiv:1510.03924</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1510.03924">http://arxiv.org/abs/1510.03924</ext-link>.</comment>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ngangmo</surname>
<given-names>Y. C.</given-names>
</name>
<name>
<surname>Mezoue Adiang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Choudhary</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Monkam</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Road traffic-induced particle matter dispersion in a calm wind environment at the main roundabout in Douala, Central Africa</article-title>. <source>J. Air Pollut. Health</source>. <pub-id pub-id-type="doi">10.18502/japh.v8i1.12030</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niako</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Melgarejo</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Maestre</surname>
<given-names>G. E.</given-names>
</name>
<name>
<surname>Vatcheva</surname>
<given-names>K. P.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Effects of missing data imputation methods on univariate blood pressure time series data analysis and forecasting with ARIMA and LSTM</article-title>. <source>BMC Med. Res. Methodol.</source> <volume>24</volume> (<issue>1</issue>), <fpage>320</fpage>. <pub-id pub-id-type="doi">10.1186/s12874-024-02448-3</pub-id>
<pub-id pub-id-type="pmid">39725886</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Noor</surname>
<given-names>M. N.</given-names>
</name>
<name>
<surname>Yahaya</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Ramli</surname>
<given-names>N. A.</given-names>
</name>
<name>
<surname>Al Bakri Abdullah</surname>
<given-names>M. M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Filling the missing data of air pollutant concentration using single imputation methods</article-title>. <source>Appl. Mech. Mater.</source> <volume>754&#x2013;755</volume> (<issue>April</issue>), <fpage>923</fpage>&#x2013;<lpage>932</lpage>. <pub-id pub-id-type="doi">10.4028/www.scientific.net/amm.754-755.923</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pereira</surname>
<given-names>R. C.</given-names>
</name>
<name>
<surname>Abreu</surname>
<given-names>P. H.</given-names>
</name>
<name>
<surname>Pereira Rodrigues</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Figueiredo</surname>
<given-names>M. A. T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Imputation of data missing not at random: artificial generation and benchmark analysis</article-title>. <source>Expert Syst. Appl.</source> <volume>249</volume>, <fpage>123654</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2024.123654</pub-id>
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="thesis">
<person-group person-group-type="author">
<name>
<surname>Rantou</surname>
<given-names>K. E.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Missing data in time series and imputation methods (Master&#x2019;s thesis)</article-title> (<publisher-loc>Samos, Greece</publisher-loc>: <publisher-name>University of the Aegean</publisher-name>).</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ribeiro</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Castro</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Missing data in time series: a review of imputation methods and case study</article-title>. <source>Learn. Nonlinear Models</source> <volume>20</volume> (<issue>1</issue>). <pub-id pub-id-type="doi">10.21528/lnlm-vol20-no1-art3</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ritthewa</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Samart</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Performance of different imputation methods in logistic regression with multicollinearity</article-title>. <source>Philipp. J. Sci.</source> <volume>153</volume> (<issue>3</issue>). <pub-id pub-id-type="doi">10.56899/153.03.05</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saheer</surname>
<given-names>L. B.</given-names>
</name>
<name>
<surname>Bhasy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Maktabdar</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zarrin</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Data-driven framework for understanding and predicting air quality in urban areas</article-title>. <source>Front. Big Data</source> <volume>5</volume>, <fpage>822573</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2022.822573</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Ghosh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mishra</surname>
<given-names>V. N.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Spatio-temporal variations and forecast of PM2.5 concentration around selected satellite cities of Delhi, India using ARIMA model</article-title>. <source>Phys. Chem. Earth</source> <volume>138</volume>, <fpage>103849</fpage>. <pub-id pub-id-type="doi">10.1016/j.pce.2024.103849</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stekhoven</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>B&#x00FC;hlmann</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>MissForest&#x2014;non-parametric missing value imputation for mixed-type data</article-title>. <source>Bioinformatics</source> <volume>28</volume> (<issue>1</issue>), <fpage>112</fpage>&#x2013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr597</pub-id>
<pub-id pub-id-type="pmid">22039212</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sugden</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Rubin</surname>
<given-names>D. B.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>Multiple imputation for nonresponse in surveys</article-title>. <source>J. R. Stat. Soc. Ser. A Stat. Soc.</source> <volume>151</volume> (<issue>3</issue>), <fpage>567</fpage>. <pub-id pub-id-type="doi">10.2307/2983027</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="report">
<person-group person-group-type="author">
<name>
<surname>Suleman</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>World Bank report</article-title>.</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Deep learning <italic>versus</italic> conventional methods for missing data imputation: a review and comparative study</article-title>. <source>Expert Syst. Appl.</source> <volume>227</volume>, <fpage>120201</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2023.120201</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tyagi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Koul</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mahajan</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Performance analysis of imputation methods on air quality dataset</article-title>,&#x201d; in <source>Smart computing</source> (<publisher-loc>London</publisher-loc>: <publisher-name>CRC Press</publisher-name>), <fpage>694</fpage>&#x2013;<lpage>701</lpage>.</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Umar</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Gray</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Comparing single and multiple imputation approaches for missing values in univariate and multivariate water level data</article-title>. <source>Water</source> <volume>15</volume> (<issue>8</issue>), <fpage>1519</fpage>. <pub-id pub-id-type="doi">10.3390/w15081519</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wijesekara</surname>
<given-names>W. M. L. K. N.</given-names>
</name>
<name>
<surname>Liyanage</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Comparison of imputation methods for missing values in air pollution data: case study on Sydney air quality index</article-title>,&#x201d; in <source>Advances in intelligent systems and computing</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>257</fpage>&#x2013;<lpage>269</lpage>.</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wijesekara</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liyanage</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Air quality data pre-processing: a novel algorithm to impute missing values in univariate time series</article-title>,&#x201d; in <source>2021 IEEE 33rd international conference on tools with artificial intelligence (ICTAI)</source> (<publisher-name>IEEE</publisher-name>), <fpage>996</fpage>&#x2013;<lpage>1001</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wijesekara</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liyanage</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mind the large gap: novel algorithm using seasonal decomposition and elastic net regression to impute large intervals of missing data in air quality data</article-title>. <source>Atmosphere</source> <volume>14</volume> (<issue>2</issue>), <fpage>355</fpage>. <pub-id pub-id-type="doi">10.3390/atmos14020355</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zainuddin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hairuddin</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Yassin</surname>
<given-names>A. I. M.</given-names>
</name>
<name>
<surname>Latiff</surname>
<given-names>Z. I. A.</given-names>
</name>
<name>
<surname>Azhar</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Time series data and recent imputation techniques for missing data: a review</article-title>,&#x201d; in <source>2022 international conference on green energy, computing and sustainable technology (GECOST)</source> (<publisher-name>IEEE</publisher-name>), <fpage>346</fpage>&#x2013;<lpage>350</lpage>.</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Thorburn</surname>
<given-names>P. J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Handling missing data in near real-time environmental monitoring: a system and a review of selected methods</article-title>. <source>Future Gener. Comput. Syst.</source> <volume>128</volume> (<issue>March</issue>), <fpage>63</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1016/j.future.2021.09.033</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1668201/overview">Zhongfan Zhu</ext-link>, Beijing Normal University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1690465/overview">Pradeep Kumar</ext-link>, Manipal University Jaipur, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1690506/overview">Arti Choudhary</ext-link>, Banaras Hindu University, India</p>
</fn>
</fn-group>
</back>
</article>