<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Environ. Sci.</journal-id>
<journal-title>Frontiers in Environmental Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Environ. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-665X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1223160</article-id>
<article-id pub-id-type="doi">10.3389/fenvs.2023.1223160</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Environmental Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Developing high-resolution PM<sub>2.5</sub> exposure models by integrating low-cost sensors, automated machine learning, and big human mobility data</article-title>
<alt-title alt-title-type="left-running-head">Yu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fenvs.2023.1223160">10.3389/fenvs.2023.1223160</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yu</surname>
<given-names>Manzhu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1082368/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Shiyan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Kai</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/129927/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yin</surname>
<given-names>Junjun</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Varela</surname>
<given-names>Matthew</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2313783/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Miao</surname>
<given-names>Jiheng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2319080/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Geography</institution>, <institution>The Pennsylvania State University</institution>, <addr-line>University Park</addr-line>, <addr-line>PA</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Environmental Health Sciences</institution>, <institution>School of Public Health</institution>, <institution>University at Albany</institution>, <institution>State University of New York</institution>, <addr-line>Albany</addr-line>, <addr-line>NY</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Social Science Research Institute</institution>, <institution>The Pennsylvania State University</institution>, <addr-line>University Park</addr-line>, <addr-line>PA</addr-line>, <country>United States</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>School of Meteorology</institution>, <institution>University of Oklahoma</institution>, <addr-line>Norman</addr-line>, <addr-line>OK</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1508502/overview">Yichun Xie</ext-link>, Eastern Michigan University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2314887/overview">Xining Yang</ext-link>, Eastern Michigan University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1973199/overview">Tao Hu</ext-link>, Oklahoma State University, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Manzhu Yu, <email>mqy5198@psu.edu</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>07</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>11</volume>
<elocation-id>1223160</elocation-id>
<history>
<date date-type="received">
<day>15</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>07</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Yu, Zhang, Zhang, Yin, Varela and Miao.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Yu, Zhang, Zhang, Yin, Varela and Miao</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Introduction:</bold> Traditional methods to estimate exposure to PM<sub>2.5</sub> (particulate matter less than 2.5&#x00a0;&#x00B5;m in diameter) have typically relied on limited regulatory monitors and do not consider human mobility and travel. However, the limited spatial coverage of regulatory monitors and the lack of consideration of mobility limit the ability to capture actual air pollution exposure.</p>
<p>
<bold>Methods:</bold> This study aims to improve traditional exposure assessment methods for PM<sub>2.5</sub> by incorporating the measurements from a low-cost sensor network (PurpleAir) and regulatory monitors, an automated machine learning modeling framework, and big human mobility data. We develop a monthly-aggregated hourly land use regression (LUR) model based on automated machine learning (AutoML) and assess the model performance across eight metropolitan areas within the US.</p>
<p>
<bold>Results:</bold> Our results show that integrating low-cost sensor with regulatory monitor measurements generally improves the AutoML-LUR model accuracy and produces higher spatial variation in PM<sub>2.5</sub> concentration maps compared to using regulatory monitor measurements alone. Feature importance analysis shows factors highly correlated with PM<sub>2.5</sub> concentrations, including satellite aerosol optical depth, meteorological variables, vegetation, and land use. In addition, we incorporate human mobility data on exposure estimates regarding where people visit to identify spatiotemporal hotspots of places with higher risks of exposure, emphasizing the need to consider both visitor numbers and PM<sub>2.5</sub> concentrations when developing exposure reduction strategies.</p>
<p>
<bold>Discussion:</bold> This research provides important insights for further public health studies on air pollution by comprehensively assessing the performance of AutoML-LUR models and incorporating human mobility into considering human exposure to air pollution.</p>
</abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>land use regression</kwd>
<kwd>human mobility</kwd>
<kwd>low-cost sensors</kwd>
<kwd>PM<sub>2.5</sub>
</kwd>
</kwd-group>
<contract-sponsor id="cn001">Pennsylvania State University<named-content content-type="fundref-id">10.13039/100008321</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Big Data, AI, and the Environment</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Exposure to air pollution can directly affect human health and increase healthcare use (<xref ref-type="bibr" rid="B44">Reid et al., 2016</xref>; <xref ref-type="bibr" rid="B5">Black et al., 2017</xref>). The World Health Organization (WHO) estimates that 4.2&#xa0;million deaths annually can be attributed to outdoor air pollution (<xref ref-type="bibr" rid="B49">Shaddick et al., 2020</xref>). Among various types of air pollutants, fine particle (PM<sub>2.5</sub>) is an air pollutant that is a concern for people&#x2019;s health as it penetrates the lungs and circulatory system, contributing &#x223c;98% to &#x223c;7&#xa0;million deaths globally (<xref ref-type="bibr" rid="B7">Butt et al., 2017</xref>; <xref ref-type="bibr" rid="B57">WHO, 2022</xref>). PM<sub>2.5</sub> often comes from various emission sources, including the combustion of gas, oil, diesel fuel, and wood, industrial processes, power generators, and natural phenomena, such as wildfires, dust storms, and volcanic eruptions (<xref ref-type="bibr" rid="B33">McDuffie et al., 2021</xref>). The accurate and frequent monitoring of PM<sub>2.5</sub> is crucial to notify citizens of potentially poor air quality, as the concentration of PM<sub>2.5</sub> at a particular location might change rapidly depending on the emission sources, wind speed, wind direction, and other meteorological factors (<xref ref-type="bibr" rid="B51">Sun et al., 2022</xref>). Regulatory monitors from local and U.S. Environmental Protection Agencies (EPA) have played an indispensable role in measuring local air quality but are limited in their sparse distribution and high cost of maintenance and deployment.</p>
<p>The development of low-cost air quality sensors, such as PurpleAir and Clarity sensors, provides new opportunities for capturing air quality dynamics in a high spatial and temporal resolution (<xref ref-type="bibr" rid="B18">Gupta et al., 2018</xref>; <xref ref-type="bibr" rid="B9">Caubel et al., 2019</xref>; <xref ref-type="bibr" rid="B16">Fowlie et al., 2020</xref>). These sensors are not only cost-effective and easy to deploy but can also wirelessly transmit the data they gather, providing a contrast to the traditional, complex, and expensive regulatory air monitoring stations. PurpleAir sensors use dual laser particle counters that can provide a more detailed view of particulate pollution. Clarity sensors work by measuring the attenuation of infrared radiation in the air. They consist of an infrared radiation source, a light-water tube, and an infrared detector with an appropriate filter. New insights have been discovered using the low-cost sensor networks regarding the spatial patterns of air quality on a local or neighborhood scale (<xref ref-type="bibr" rid="B56">Weissert et al., 2020</xref>; <xref ref-type="bibr" rid="B25">Kelly et al., 2021</xref>), local influences of emission sources (<xref ref-type="bibr" rid="B62">Zimmerman et al., 2020</xref>; <xref ref-type="bibr" rid="B30">Lu et al., 2021</xref>), and fine-scaled human exposure assessments (<xref ref-type="bibr" rid="B4">Bi et al., 2022</xref>). Calibrations and correction methods have been developed to improve the data plausibility compared to regulatory monitors (<xref ref-type="bibr" rid="B53">Tryner et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Barkjohn et al., 2021</xref>; <xref ref-type="bibr" rid="B55">Wallace et al., 2021</xref>).</p>
<p>Land use regression (LUR) models are commonly used in air pollution exposure assessment to produce averaged exposure risks in a temporal range with a high spatial resolution (<xref ref-type="bibr" rid="B3">Beelen et al., 2014</xref>; <xref ref-type="bibr" rid="B28">Li et al., 2020</xref>; <xref ref-type="bibr" rid="B46">Ren et al., 2020</xref>). LUR models are particularly useful for identifying spatial features that are important determinants of pollutant concentration variability and for enhancing our understanding of the spatial distribution of air pollutants (<xref ref-type="bibr" rid="B34">Meng et al., 2015</xref>; <xref ref-type="bibr" rid="B27">Lee et al., 2017</xref>; <xref ref-type="bibr" rid="B37">Muttoo et al., 2018</xref>). These models are based on the assumption that the average air quality within a specific area is linearly associated with geographic covariates such as land use, road density, and emission sources (<xref ref-type="bibr" rid="B20">Hoek et al., 2008</xref>). The LUR modeling process generally involves the preparation of such covariates, creating LUR models by linear regression while selecting variables that are highly correlated with the model and avoiding redundant variables, validating and selecting the final model, and applying the final model to a high-resolution grid for the area where predictions are to be made (<xref ref-type="bibr" rid="B36">Morley and Gulliver, 2018</xref>; <xref ref-type="bibr" rid="B31">Ma et al., 2020</xref>). The integration of low-cost sensors and regulatory monitors in LUR models also showed the potential of better capturing within-city variations (<xref ref-type="bibr" rid="B29">Lu et al., 2022</xref>). 
However, LUR models generally suffer from their limitations in area generalizability: a LUR model developed for a particular area within a specific time range cannot be easily adopted, and models have to be re-developed for any other spatiotemporal range (<xref ref-type="bibr" rid="B4">Bi et al., 2022</xref>). In addition, while timely-averaged (e.g., annual or multi-year) concentrations could reduce the biases resulting from a few high-level outliers, it is essential to develop PM<sub>2.5</sub> exposure models at finer temporal resolutions (e.g., daily or hourly) for a more frequent assessment of air pollution exposure (<xref ref-type="bibr" rid="B32">Masiol et al., 2018</xref>; <xref ref-type="bibr" rid="B30">Lu et al., 2021</xref>).</p>
<p>Recent studies have improved LUR models to address the limitations using generalized additive models (<xref ref-type="bibr" rid="B43">Ravindra et al., 2019</xref>), principal component analysis (<xref ref-type="bibr" rid="B12">de Souza et al., 2018</xref>), Least Absolute Shrinkage and Selection Operator (LASSO) (<xref ref-type="bibr" rid="B47">Roberts and Martin, 2005</xref>), and Bayesian inference (<xref ref-type="bibr" rid="B52">Thomas et al., 2007</xref>; <xref ref-type="bibr" rid="B40">Orun et al., 2018</xref>; <xref ref-type="bibr" rid="B19">Han et al., 2022</xref>). Spatial and temporal variations captured at a high spatial and temporal resolution can reveal conditions where air quality differs from the expected land-use effect (<xref ref-type="bibr" rid="B56">Weissert et al., 2020</xref>). Machine learning approaches, especially ensemble-based methods such as random forests, have provided a non-parametric solution without assuming a linear relationship between air pollutant concentration and the predictors (<xref ref-type="bibr" rid="B46">Ren et al., 2020</xref>; <xref ref-type="bibr" rid="B10">Coker et al., 2021</xref>; <xref ref-type="bibr" rid="B22">Jain et al., 2021</xref>; <xref ref-type="bibr" rid="B58">Wong et al., 2021</xref>); instead, complex relationships can be captured within such models. Specifically, <xref ref-type="bibr" rid="B56">Weissert et al. (2020)</xref> applied a random forest model to data from a low-cost sensor network to analyze the impact of land use on local air quality and to capture air quality variations on an hourly basis at a detailed spatial scale. <xref ref-type="bibr" rid="B10">Coker et al. (2021)</xref> explored various ML base-learner and ensemble algorithms to improve LUR predictions in monthly PM<sub>2.5</sub> in urban regions of central and eastern Uganda. <xref ref-type="bibr" rid="B25">Kelly et al. 
(2021)</xref> developed a Gaussian process model to accurately predict neighborhood-scale PM<sub>2.5</sub> concentrations during pollution events such as fireworks, wildfires, and persistent cold air pools.</p>
<p>In addition, integrating time-varying predictors (e.g., meteorological conditions and satellite-retrieved aerosol optical depth) has also improved LUR models&#x2019; temporal resolution to daily or hourly (<xref ref-type="bibr" rid="B32">Masiol et al., 2018</xref>; <xref ref-type="bibr" rid="B59">Yao et al., 2018</xref>; <xref ref-type="bibr" rid="B30">Lu et al., 2021</xref>). Combining the predictor selection procedure in LUR and the ML-based prediction model in estimating the non-linear relationships has been explored to leverage both advantages into an integrated framework (<xref ref-type="bibr" rid="B22">Jain et al., 2021</xref>; <xref ref-type="bibr" rid="B58">Wong et al., 2021</xref>). A particular challenge in hourly LUR modeling is the availability of satellite-observed aerosol optical depth (AOD). Most existing studies rely on the MODIS AOD product, which has an overpass of twice daily and in the afternoon. The limited availability of satellite-observed AOD hampers the ability of LUR models to predict hours that are outside these overpass times. This is also one of the reasons why LUR models are used mostly for monthly or yearly assessments of air pollution.</p>
<p>Human exposure to air pollution has been a longstanding concern in public policy. Epidemiologic evidence demonstrated causal relationships between particulate matter (PM) and health outcomes and revealed disparities between different groups regarding their risks of related health effects (<xref ref-type="bibr" rid="B23">Jbaily et al., 2022</xref>). Exposure risks are generally calculated based on pollutant concentrations and the population affected by the pollutants. Environmental agencies have been providing online dashboards for public showcases of environmental justice (EJ) related to air pollution, e.g., the U.S. EPA&#x2019;s EJScreen and CalEnviroScreen. These approaches fail to account for all components of exposure since 1) there might be a high spatial and temporal variability of air pollutant concentrations, and 2) people spend time both in places they live and visit (<xref ref-type="bibr" rid="B8">Canha et al., 2021</xref>). Human exposure studies have considered physical movements and activities of individuals and the environments in which they spend their time. Most of these studies rely on individual mobility patterns inferred from mobile phone data or Call Detail Record (CDR) data collected by mobile network operators (<xref ref-type="bibr" rid="B38">Nyhan et al., 2016</xref>; <xref ref-type="bibr" rid="B39">2019</xref>; <xref ref-type="bibr" rid="B61">Yu et al., 2020</xref>), but focused on a particular area and in a limited time window. Comprehensive studies across different spatial regions are necessary to accurately assess the risks of air pollution and inform public health policy. These studies can help to identify patterns of exposure and potential health risks and can inform the development of strategies to reduce or eliminate exposure to harmful substances.</p>
<p>This research proposes an empirical foundation by integrating machine learning and low-cost sensor measurements into estimating spatiotemporal variability of PM<sub>2.5</sub> concentrations in eight major metropolitan areas in the U.S. and assessing exposure to PM<sub>2.5</sub> based on where people visit. We integrate the measurements of PM<sub>2.5</sub> from a low-cost sensor network (PurpleAir) with EPA&#x2019;s regulatory monitors into LUR model development. We use a LUR model based on automated machine learning, i.e., AutoML-LUR, to capture the relationship between geographic covariates and PM<sub>2.5</sub> concentrations. In addition, we account for human mobility when assessing human exposure, based on where people visit, and investigate how mobility impacts exposure estimates. Details on the methods used in this study are presented in the next section, followed by the results of the study and a discussion of the potential of the methods and data, as well as associated limitations.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Study regions</title>
<p>Our analyses aimed to assess data and modeling performance from multi-city low-cost sensors and existing regulatory monitoring networks. We selected core-based statistical areas (CBSAs) with at least seven EPA regulatory monitors and seven PurpleAir sensors between 1 January 2018, and 31 December 2021 (<xref ref-type="table" rid="T1">Table 1</xref>). The choice of requiring at least seven sensors from each type (EPA monitors and PurpleAir sensors) in the study is based on our goal to ensure a sufficient spatial coverage for LUR models. The specific number, seven, is chosen in reference to <xref ref-type="bibr" rid="B29">Lu et al. (2022)</xref>, which deployed seven sensors of each type in six cities across the United States, and this configuration showed promising results in assessing data and modeling performance for low-cost sensors in different urban environments. Following this approach, we aimed to ensure a similar level of representativeness in our analyses.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Selected core-based statistical areas in the study, 2018&#x2013;2021.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Core-based statistical areas (CBSAs)</th>
<th align="left">Short name</th>
<th align="left">Number of EPA monitors</th>
<th align="left">Number of PurpleAir sensors</th>
<th align="left">Area (km<sup>2</sup>)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Chicago-Naperville-Elgin, IL-IN-WI</td>
<td align="right">Chicago</td>
<td align="right">10</td>
<td align="right">50</td>
<td align="right">18,920.52</td>
</tr>
<tr>
<td align="left">Riverside-San Bernardino-Ontario, CA</td>
<td align="right">Riverside</td>
<td align="right">7</td>
<td align="right">154</td>
<td align="right">70,988.93</td>
</tr>
<tr>
<td align="left">Las Vegas-Henderson-Paradise, NV</td>
<td align="right">Las Vegas</td>
<td align="right">11</td>
<td align="right">18</td>
<td align="right">20,876.66</td>
</tr>
<tr>
<td align="left">Los Angeles-Long Beach-Anaheim, CA</td>
<td align="right">Los Angeles</td>
<td align="right">8</td>
<td align="right">541</td>
<td align="right">12,691.04</td>
</tr>
<tr>
<td align="left">San Francisco-Oakland-Hayward, CA</td>
<td align="right">San Francisco</td>
<td align="right">7</td>
<td align="right">1,794</td>
<td align="right">6,637.35</td>
</tr>
<tr>
<td align="left">New York-Newark-Jersey City, NY-NJ-PA</td>
<td align="right">New York</td>
<td align="right">10</td>
<td align="right">106</td>
<td align="right">23,381.94</td>
</tr>
<tr>
<td align="left">Houston-The Woodlands-Sugar Land, TX</td>
<td align="right">Houston</td>
<td align="right">10</td>
<td align="right">56</td>
<td align="right">22,273.72</td>
</tr>
<tr>
<td align="left">Phoenix-Mesa-Scottsdale, AZ</td>
<td align="right">Phoenix</td>
<td align="right">13</td>
<td align="right">41</td>
<td align="right">37,809.73</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-2">
<title>2.2 EPA and PurpleAir PM<sub>2.5</sub> measurements</title>
<p>Hourly PM<sub>2.5</sub> measurements are downloaded from the EPA Air Quality System (AQS) database for Federal Reference Method (FRM) monitors and Federal Equivalent Method (FEM) monitors. In addition, we downloaded publicly available outdoor PM<sub>2.5</sub> measurements from the PurpleAir website <italic>via</italic> an open-source R package: AirSensor, developed by the South Coast Air Quality Management District (South Coast AQMD) and Mazama Science. The raw data was aggregated into hourly averages using the quality control function provided by AirSensor. The function creates a PM<sub>2.5</sub> time series by averaging the A and B channels and removing invalid data when 1) the measurement count is lower than 20, 2) the hourly difference between A and B channels is higher than 5, and 3) the hourly percent difference between A and B channels is higher than 70%. Samples with PM<sub>2.5</sub> measurements greater than 1,000&#xa0;&#x3bc;g/m<sup>3</sup> and with missing or abnormal temperature and humidity readings were removed (humidity readings should be within 0%&#x2013;100%; temperature readings should be within 20&#xb0;F to 140&#xb0;F). Approximately 15% of the raw PurpleAir measurements were removed due to these quality issues. We applied the correction to the PurpleAir PM<sub>2.5</sub> measurements based on the U.S.-wide calibration proposed by <xref ref-type="bibr" rid="B2">Barkjohn et al. (2021)</xref>.</p>
<p>Comparing the monthly hourly PM<sub>2.5</sub> concentrations between EPA and PurpleAir in the selected study regions (<xref ref-type="fig" rid="F1">Figure 1</xref>), measurements from the PurpleAir sensors generally have a lower median value than the EPA monitors. There could be several reasons why EPA air quality measurements are consistently higher than PurpleAir measurements. One reason could be that the EPA uses more sophisticated and expensive air quality monitoring equipment, which is subject to strict quality control and calibration procedures to ensure the accuracy of the measurements. In contrast, PurpleAir monitors are designed for personal use and are not subject to the same level of quality control and calibration, which can lead to a more significant margin of error in the readings. Another reason could be the placement of the monitors. The EPA often selects monitoring sites based on strict criteria to ensure that they represent the air quality in a given area (<xref ref-type="bibr" rid="B41">Raffuse et al., 2007</xref>). In contrast, PurpleAir monitors are typically installed by individuals in their own homes or businesses, and the placement of the monitors can vary widely (<xref ref-type="bibr" rid="B1">Ardon-Dryer et al., 2020</xref>). It is also worth noting that the EPA and PurpleAir use different methods to measure air quality. The EPA primarily measures PM<sub>2.5</sub> using gravimetric analysis, while PurpleAir uses a laser-based sensor technology. While both methods are considered reliable, they can produce slightly different measurements depending on factors such as humidity and temperature.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Boxplots of PM<sub>2.5</sub> concentrations measured by EPA monitors and PurpleAir monitors in eight metropolitan areas. Each box extends from the first quartile (Q1) to the third quartile (Q3) of the data, with a line at the median. The whiskers extend from the box by 1.5 times the interquartile range (IQR). Outlier points are those past the end of the whiskers.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g001.tif"/>
</fig>
</sec>
<sec id="s2-3">
<title>2.3 Land use regression modeling</title>
<p>Traditional LUR models are statistical methods developed to estimate PM<sub>2.5</sub> concentrations based on geographical features. The models can create an empirical relationship between PM<sub>2.5</sub> concentrations and land use variables such as traffic volume, distance to major roads, population density, and presence of emission sources. A typical LUR model can be formulated as:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the PM<sub>2.5</sub> concentration, <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the intercept term, <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, &#x2026;, <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the predictor variables, i.e., different land use, traffic, and geographical features, <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, &#x2026;, <inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the coefficients for these variables that indicate the strength and direction of the relationships between each predictor variable and PM<sub>2.5</sub> concentration, and <inline-formula id="inf9">
<mml:math id="m10">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the error term that captures the variation in PM<sub>2.5</sub> concentration not explained by the model. The &#x3b2; coefficients are estimated using regression techniques, most commonly multiple linear regression. The goal is to find the set of &#x3b2; values that minimizes the sum of the squared differences between the observed and predicted PM<sub>2.5</sub> concentrations.</p>
<p>In this study, we used automated machine learning (detailed in <xref ref-type="sec" rid="s2-3-3">Section 2.3.3</xref>) to construct more sophisticated versions of LUR models. Traditional LUR models, illustrated in Eq. <xref ref-type="disp-formula" rid="e1">1</xref>, use multiple linear regression, which assumes a linear relationship between predictor variables and the target outcome. However, real-world relationships between land-use variables and PM<sub>2.5</sub> concentrations may be nonlinear or involve complex interactions, which can be better captured with machine learning techniques.</p>
<p>We analyzed 4&#xa0;years of available PM<sub>2.5</sub> concentration data for LUR modeling. Hourly measurements of PM<sub>2.5</sub> concentration were used to compute monthly averages for each hour (e.g., January 2021 will have 24 hourly averages; &#x201c;monthly-hourly&#x201d; hereafter). These monthly-hourly averages were used to train and test the LUR models. To test the usefulness of low-cost sensor networks for developing AutoML-LUR models, we developed three types of models: 1) using the EPA measurements, 2) using the PurpleAir measurements, and 3) incorporating PM<sub>2.5</sub> measurements from EPA monitors and PurpleAir sensors for the eight CBSAs. For each monitoring location, we prepared the geographic covariates following the guidelines from the Multi-Ethnic Study of Atherosclerosis and Air Pollution (<xref ref-type="bibr" rid="B24">Keller et al., 2015</xref>; <xref ref-type="bibr" rid="B26">Kirwa et al., 2021</xref>).</p>
<sec id="s2-3-1">
<title>2.3.1 Geographic covariates</title>
<p>The initial set of geographic covariates for LUR modeling contains proximity and buffer variables regarding land-use categories, emission sources, and vegetation indices (<xref ref-type="bibr" rid="B24">Keller et al., 2015</xref>; <xref ref-type="bibr" rid="B26">Kirwa et al., 2021</xref>). Proximity variables include distances to major land use categories, roads and truck routes, airports, coastlines and railroads, ports, and point emission sources. Buffer variables include summary statistics of spatial features within a set of buffer radii (0.5&#x2013;30&#xa0;km). These features include the sum of road or truck route length, total emission, percentiles of NDVI, average imperviousness value, and average elevation, each calculated within the buffers. Beyond the time-invariant variables suggested in MESA Air, we also added time-varying variables that account for the complex dynamics of air pollutant concentrations, such as meteorological conditions and satellite-retrieved aerosol optical depth (AOD). The complete list of variables is given in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Geographical covariates.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Category</th>
<th align="left">Measure</th>
<th align="left">Variable description</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Location</td>
<td align="left">Location of the monitoring station</td>
<td align="left">Longitude and latitude</td>
</tr>
<tr>
<td align="left">Time</td>
<td align="left">Temporal indicators</td>
<td align="left">Hour, month, year (<italic>dt_year, dt_month, dt_hour</italic>)</td>
</tr>
<tr>
<td rowspan="2" align="left">Traffic</td>
<td align="left">
<xref ref-type="table-fn" rid="Tfn1">
<sup>a</sup>
</xref> Distance to the nearest road</td>
<td rowspan="2" align="left">Distances to primary roads (<italic>dist_A1</italic>), secondary roads (<italic>dist_A23</italic>), and truck route (<italic>dist_tr</italic>)</td>
</tr>
<tr>
<td align="left">Sum within buffers of 0.05&#x2013;15&#xa0;km</td>
</tr>
<tr>
<td align="left">Land use/land cover</td>
<td align="left">Percent within buffers of 0.05&#x2013;15&#xa0;km</td>
<td align="left">Developed low, medium, and high density; developed open space; agricultural land (cropland, groves, feeding); forest land (deciduous, evergreen, mixed); open water, <italic>etc.</italic> (<italic>lu_&#x3c;type_num&#x3e;</italic>)</td>
</tr>
<tr>
<td align="left">Source</td>
<td align="left">
<xref ref-type="table-fn" rid="Tfn1">
<sup>a</sup>
</xref> Distance to the nearest source</td>
<td align="left">Distances to coastline (<italic>dist_cl</italic>), railroad (<italic>dist_rail</italic>), airport (<italic>dist_airp</italic>), major airport (<italic>dist_l_airp</italic>), large port (<italic>dist_port</italic>)</td>
</tr>
<tr>
<td align="left">Emission</td>
<td align="left">Sum of site-specific facility emissions within buffers of 3&#x2013;30&#xa0;km</td>
<td align="left">Total emission in PM<sub>2.5</sub> (<italic>emission</italic>)</td>
</tr>
<tr>
<td align="left">Vegetation</td>
<td align="left">Quantiles within buffers of 0.5&#x2013;10&#xa0;km</td>
<td align="left">Normalized Difference Vegetation Index (<italic>ndvi</italic>)</td>
</tr>
<tr>
<td align="left">Imperviousness</td>
<td align="left">Percent within buffers of 0.05&#x2013;5&#xa0;km</td>
<td align="left">Impervious surface value (<italic>im</italic>)</td>
</tr>
<tr>
<td rowspan="2" align="left">Elevation</td>
<td align="left">Elevation above sea level</td>
<td rowspan="2" align="left">Elevation value (<italic>elevation</italic>)</td>
</tr>
<tr>
<td align="left">Counts of points above or below a threshold within buffers of 1&#x2013;5&#xa0;km</td>
</tr>
<tr>
<td align="left">Satellite-based</td>
<td align="left">Multi-Angle Implementation of Atmospheric Correction (MAIAC)</td>
<td align="left">Aerosol optical depth (<italic>aod</italic>)</td>
</tr>
<tr>
<td align="left">Reanalysis AOD</td>
<td align="left">Modern-Era Retrospective Analysis for Research and Applications, Version 2 (MERRA-2)</td>
<td align="left">Total Aerosol Extinction AOT [550&#xa0;nm]</td>
</tr>
<tr>
<td align="left">Meteorological variables</td>
<td align="left">ECMWF Reanalysis v5 (ERA5)</td>
<td align="left">2&#xa0;m dewpoint temperature (meteo_d2m), 2&#xa0;m temperature (meteo_t2m), 10&#xa0;m u-component of wind (meteo_u10), 10&#xa0;m v-component of wind (meteo_v10), surface pressure (meteo_sp), total precipitation (meteo_tp)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="Tfn1">
<label>
<sup>a</sup>
</label>
<p>Distances calculated to spatial features are truncated at 25&#xa0;km.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To ensure the geographic covariates included in the models provide sufficiently useful information, we apply the following criteria to filter out variables when 1) &#x3e;80% of monitors had the same value, 2) &#x3e;2% of observations were more than five standard deviations (SDs) away from the mean, 3) the SD of the distribution of values at cohort participant residences was more than five times the SD of the distribution of values at monitor locations, and 4) the maximum value of a percentage variable was 10% among all monitors. In addition, we removed correlated and redundant variables to avoid model overfitting by optimizing the selection of specific predictor variables with varying buffer sizes (e.g., land use type and NDVI). Specifically, using the complete set of candidate predictors, we ran an initial Random Forest model and computed variable importance scores (VIS). We then chose the buffer size for each spatial predictor as the most optimal predictor based on VIS ranking.</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Hourly AOD gap filling</title>
<p>MAIAC AOD has a high spatial resolution of &#x223c;1&#xa0;km with local spatial variation but is only available twice daily in the afternoon. To fill the gaps of hourly AOD, we followed the work of MERRA-2 AOD downscaling using elevation. The MERRA-2 aerosol reanalysis product is simulated by the Goddard Chemistry Aerosol Radiation and Transport (GOCART) model coupled with the Goddard Earth Observing System, Version 5 (GEOS-5) atmospheric general circulation model (<xref ref-type="bibr" rid="B35">Molod et al., 2015</xref>; <xref ref-type="bibr" rid="B17">Gelaro et al., 2017</xref>; <xref ref-type="bibr" rid="B42">Randles et al., 2017</xref>). However, the 0.5&#xb0; MERRA-2 AOD might not appropriately represent the spatial distribution of aerosol loading, especially over highly polluted areas with large gradients of AOD. Therefore, MERRA-2 AOD is further downscaled from 0.5&#xb0; to 1&#xa0;km based on elevation (<xref ref-type="bibr" rid="B48">Sengupta et al., 2018</xref>) and used to fill the gaps of hourly AOD when MAIAC AOD has missing information.</p>
</sec>
<sec id="s2-3-3">
<title>2.3.3 AutoML-LUR model development</title>
<p>We employed Automated Machine Learning (AutoML) to capture the non-linear relationships between geographic covariates and air pollutant concentrations (<xref ref-type="bibr" rid="B6">Breiman, 2001</xref>), as illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. AutoML is the process of automating various aspects of the machine learning workflow, including data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. AutoML is typically used to address the following challenges in machine learning: 1) time-consuming and repetitive tasks and 2) hyperparameter tuning. We used one of the AutoML open-source toolkits, Auto-sklearn (<xref ref-type="bibr" rid="B13">Feurer et al., 2022</xref>; <xref ref-type="bibr" rid="B14">2015</xref>), built on top of the scikit-learn machine learning library. Auto-sklearn uses a Bayesian optimization approach to search the hyperparameter space efficiently and identify the best-performing model for a given dataset. It employs ensemble methods to combine multiple models and enhance the overall performance.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>AutoML land use regression model architecture.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g002.tif"/>
</fig>
<p>During model training on a new dataset, the meta-features of the new dataset are first computed and compared with those of the reference datasets in the meta-feature space using the L1 distance, i.e., the sum of the absolute differences between the meta-features of the new dataset and those of each reference dataset. The reference datasets are then ranked based on their distance to the new dataset, and the top 25 nearest reference datasets are selected. The hyperparameters that gave the best performance on these datasets are then used to instantiate the Bayesian optimizer for the new dataset. This helps to reduce the search space for hyperparameters and can lead to faster and more efficient hyperparameter optimization.</p>
</sec>
<sec id="s2-3-4">
<title>2.3.4 Model evaluation</title>
<p>To test the model&#x2019;s overall capability, we conducted a 10-fold cross-validation and computed the root mean squared error (RMSE) and <italic>R</italic>
<sup>2</sup> (coefficient of determination) of the observed <italic>versus</italic> fitted values to assess the model&#x2019;s predictive accuracy. In addition, we evaluated the LUR results in the spatial and temporal dimensions using spatial cross-validation and temporal cross-validation separately, which involves partitioning the data into subsets and using one subset for training the LUR model and the remaining subset for testing. These evaluation metrics can help assess the ability of the LUR model to generalize to data in other spatial regions and new time periods and to estimate the spatial and temporal variations of air pollution exposure.</p>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 Exposure estimation</title>
<p>Quantifying human exposure to air pollution depends on two factors: 1) the population living within the area and 2) the air pollution concentration to which they are exposed. Combining the two factors, existing studies utilized population-weighted annual average concentration as a score to estimate population exposures, thus giving greater weight to the air pollution exposure where most people live (<xref ref-type="bibr" rid="B45">Reis et al., 2018</xref>). The population-weighted exposure is generally defined as <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the annual mean population exposure for a certain area, <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the respective population number in this area, and <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the annual mean concentration of the pollutant for this area.</p>
<p>To account for human mobility in the estimation of PM<sub>2.5</sub> exposure, we used the publicly available human mobility dataset SafeGraph to calculate population-weighted exposure. Specifically, we used the SafeGraph <italic>Patterns</italic> data product, which contains mobility patterns from approximately 47 million (around 10% of) mobile devices in the United States (<xref ref-type="bibr" rid="B50">Squire, 2019</xref>; <xref ref-type="bibr" rid="B11">Coston et al., 2021</xref>). The monthly <italic>Patterns</italic> data product used in this study provides anonymized counts of how many people visit commercial points of interest (POIs) each month, which can be broken down by the visitor&#x2019;s home census block group (CBG). Note that the term &#x201c;visitors&#x201d; is not used in the conventional sense to distinguish between residents and non-residents. Rather, it refers to individuals visiting POIs in a specific area. This can be helpful for understanding how mobility patterns may impact exposure to air pollution, as people who spend more time in areas with higher pollution concentrations may be at greater risk of exposure. Since the data is aggregated into monthly patterns, home CBG information is not directly linked to a specific device, differentiating SafeGraph mobility data from other individual mobility tracking datasets, such as Call Detail Records (CDR) or mobility survey datasets. Therefore, patterns are aggregated and approximated as community-level mobility rather than individual-level mobility. For the visitor population, we used the &#x2018;popularity by hour&#x2019; field from SafeGraph data for each POI and aggregated the values into each CBG.</p>
<p>Based on the estimated population, a mobility-based exposure (<italic>MBE</italic>) is calculated by matching visit locations with PM<sub>2.5</sub> concentrations estimated from the AutoML-LUR model. Specifically, <italic>MBE</italic> is calculated as follows:<disp-formula id="e1a">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>B</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the percentage of mobility-based visitor number estimated from SafeGraph for a particular census block group over all visitors in a particular month and hour, and <inline-formula id="inf15">
<mml:math id="m17">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the PM<sub>2.5</sub> concentration derived for the census block group for a particular month and hour from the 1&#xa0;km-resolution AutoML-LUR results.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 AutoML-LUR model performance</title>
<p>The model prediction performance varied temporally and spatially (<xref ref-type="table" rid="T3">Table 3</xref>). Replacing EPA data with PurpleAir data to train AutoML-LUR models resulted in significant decreases in the root mean squared error (RMSE), with a decrease of 0.51&#xa0;&#x3bc;g/m<sup>3</sup> (SD &#x3d; 0.22&#xa0;&#x3bc;g/m<sup>3</sup>) on average. In addition, using EPA &#x2b; PurpleAir data to train AutoML-LUR models also resulted in significant decreases in RMSE compared to using EPA data only, with a decrease of 0.32&#xa0;&#x3bc;g/m<sup>3</sup> (SD &#x3d; 0.26&#xa0;&#x3bc;g/m<sup>3</sup>) on average. However, diverging patterns of RMSE and <italic>R</italic><sup>2</sup> values are observed across different regions. For example, in regions such as Chicago and Houston, lower RMSE and higher <italic>R</italic>
<sup>2</sup> values were observed when only PurpleAir sensor data were used. However, in other regions such as Los Angeles, models trained on combined EPA and PurpleAir data outperformed those using only PurpleAir data. Several underlying factors may explain these observed variations. Firstly, the geographical placement of sensors is a significant determinant. PurpleAir sensors, predominantly purchased by private residents, are often situated within residential areas. In contrast, EPA monitors are strategically positioned near pollution emission sources or within areas characterized by particular land use types. Consequently, the heightened performance of models in regions using solely PurpleAir data (such as Chicago) could be attributed to their greater sensitivity in capturing the air quality variations within residential environments. Secondly, the spatial distribution and density of sensors is another potential influencer. In the regions where combined sensor data yielded higher performance (such as Los Angeles), the heterogeneous sensor locations and increased sensor density could be contributing factors. The amalgamation of data from both types of sensors provides a broader representation of air quality variations across distinct land uses and proximities to emission sources. Furthermore, the level of congruity between readings from different sensors may impact model performance. As supported by <xref ref-type="fig" rid="F1">Figure 1</xref>, PurpleAir sensors demonstrate greater consistency among themselves compared to their consistency with EPA monitors, thus resulting in enhanced performance when solely PurpleAir data is employed.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Model performances&#x2013;Auto-ML.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="3" align="left">Chicago-Naperville-Elgin, IL-IN-WI</th>
<th colspan="3" align="left">Houston-The Woodlands-Sugar Land, TX</th>
<th colspan="3" align="left">Las Vegas-Henderson-Paradise, NV</th>
<th colspan="3" align="left">Los Angeles-Long Beach-Anaheim, CA</th>
</tr>
<tr>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">EPA</td>
<td align="left">8,452</td>
<td align="left">0.81</td>
<td align="left">0.91</td>
<td align="left">7,211</td>
<td align="left">1.00</td>
<td align="left">0.86</td>
<td align="left">7,880</td>
<td align="left">0.62</td>
<td align="left">0.97</td>
<td align="left">6,960</td>
<td align="left">1.34</td>
<td align="left">0.94</td>
</tr>
<tr>
<td align="left">PurpleAir</td>
<td align="left">11,049</td>
<td align="left">0.46</td>
<td align="left">0.97</td>
<td align="left">12,961</td>
<td align="left">0.40</td>
<td align="left">0.92</td>
<td align="left">4,312</td>
<td align="left">0.43</td>
<td align="left">0.95</td>
<td align="left">243,394</td>
<td align="left">0.68</td>
<td align="left">0.96</td>
</tr>
<tr>
<td align="left">EPA &#x2b; PurpleAir</td>
<td align="left">19,501</td>
<td align="left">0.81</td>
<td align="left">0.91</td>
<td align="left">38,364</td>
<td align="left">0.67</td>
<td align="left">0.90</td>
<td align="left">12,192</td>
<td align="left">0.62</td>
<td align="left">0.96</td>
<td align="left">146,392</td>
<td align="left">0.67</td>
<td align="left">0.97</td>
</tr>
</tbody>
</table>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="3" align="left">New York-Newark-Jersey City, NY-NJ-PA</th>
<th colspan="3" align="left">Phoenix-Mesa-Scottsdale, AZ</th>
<th colspan="3" align="left">Riverside-San Bernardino-Ontario, CA</th>
<th colspan="3" align="left">San Francisco-Oakland-Hayward, CA</th>
</tr>
<tr>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
<th align="left">n</th>
<th align="left">RMSE</th>
<th align="left">
<italic>R</italic>
<sup>2</sup>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">EPA</td>
<td align="left">8,952</td>
<td align="left">0.74</td>
<td align="left">0.90</td>
<td align="left">11,279</td>
<td align="left">1.28</td>
<td align="left">0.92</td>
<td align="left">7,974</td>
<td align="left">1.22</td>
<td align="left">0.94</td>
<td align="left">5,547</td>
<td align="left">1.19</td>
<td align="left">0.90</td>
</tr>
<tr>
<td align="left">PurpleAir</td>
<td align="left">23,067</td>
<td align="left">0.58</td>
<td align="left">0.95</td>
<td align="left">10,558</td>
<td align="left">0.46</td>
<td align="left">0.90</td>
<td align="left">99,601</td>
<td align="left">0.66</td>
<td align="left">0.96</td>
<td align="left">603,499</td>
<td align="left">0.43</td>
<td align="left">0.95</td>
</tr>
<tr>
<td align="left">EPA &#x2b; PurpleAir</td>
<td align="left">32,019</td>
<td align="left">0.69</td>
<td align="left">0.93</td>
<td align="left">21,837</td>
<td align="left">1.04</td>
<td align="left">0.91</td>
<td align="left">79,965</td>
<td align="left">0.68</td>
<td align="left">0.96</td>
<td align="left">609,046</td>
<td align="left">0.46</td>
<td align="left">0.95</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The trained models allow the mapping of pollutant concentrations in the eight CBSAs. Local variations of PM<sub>2.5</sub> concentrations also vary by the input and can be significantly different among models (<xref ref-type="fig" rid="F3">Figures 3A&#x2013;C</xref>). AutoML-LUR models trained with EPA data tend to produce a higher concentration of PM<sub>2.5</sub> than models trained with PurpleAir data. Models trained with EPA &#x2b; PurpleAir data showed higher spatial variance than the other two models, with high PM<sub>2.5</sub> concentrations varying spatially according to the observations used to train the AutoML-LUR model. Examples of predicting PM<sub>2.5</sub> for different hours in January 2021 in Los Angeles are shown in <xref ref-type="fig" rid="F3">Figures 3D&#x2013;G</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>
<bold>(A&#x2013;C)</bold> AutoML-LUR results for PM<sub>2.5</sub> concentrations at Local Hour 0 January 2021 from models trained using EPA data, PurpleAir data, and EPA &#x2b; PurpleAir data, respectively. <bold>(D&#x2013;G)</bold> Hourly estimates of PM<sub>2.5</sub> concentrations in Los Angeles for January 2021. Unit: &#xb5;g/m<sup>3</sup>.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g003.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F4">Figure 4</xref> demonstrates the monthly trends of PM<sub>2.5</sub> predictions in the target CBSAs. Most CBSAs show a seasonal trend of PM<sub>2.5</sub> concentrations, but the levels of PM<sub>2.5</sub> concentrations vary significantly depending on location. For example, the levels of PM<sub>2.5</sub> concentrations tend to be the highest during the winter months in Los Angeles, primarily because of the increased emission and weather patterns in winter months, such as temperature inversions which can trap pollutants near the ground leading to higher PM<sub>2.5</sub> level (<xref ref-type="bibr" rid="B54">Wallace et al., 2010</xref>). On the other hand, PM<sub>2.5</sub> concentrations increased during the summer months in San Francisco, primarily because of natural sources such as wildfires. Some other CBSAs have higher PM<sub>2.5</sub> concentrations during summer months due to a variety of factors, such as increased heat and sunlight, which can lead to the formation of ground-level ozone, a major component of smog. Other factors that may contribute to the high PM<sub>2.5</sub> levels during summer include increased use of air conditioning and wildfires, which can release particles into the air.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Boxplots of monthly predictions in 2021 for AutoML-LUR models for each CBSA. Unit: &#xb5;g/m<sup>3</sup>.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g004.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Variable importance</title>
<p>Permutation feature importance was calculated to represent the increase in the prediction error of the model after permuting a feature&#x2019;s values, which breaks the relationship between the feature and the true outcome (<xref ref-type="bibr" rid="B15">Fisher et al., 2019</xref>). Analyses of feature importance revealed the key factors that were spatially correlated with PM<sub>2.5</sub> concentrations (<xref ref-type="fig" rid="F5">Figure 5</xref>). Spatially aggregated feature importance scores showed that satellite AOD, temporal indicators, and meteorological variables were highly correlated with PM<sub>2.5</sub> concentrations (<xref ref-type="fig" rid="F5">Figure 5A</xref>). City-wise feature importance scores reflect the city-wide features most relevant to air pollutant concentrations. In most CBSAs, satellite AOD, meteorological variables, temporal indicators, NDVI, and land use remain highly correlated with PM<sub>2.5</sub> concentrations. <xref ref-type="fig" rid="F5">Figures 5B&#x2013;I</xref> provide a decomposed analysis for each CBSA. Spatial heterogeneity is observed regarding each CBSA&#x2019;s most relevant sources of PM<sub>2.5</sub> other than AOD and meteorological conditions (e.g., distance to large airports for Chicago; imperviousness for Houston; nearby primary road length for Las Vegas; emission for Los Angeles, New York and Riverside; distance to coastline for New York, Riverside, and San Francisco; elevation for Phoenix and San Francisco).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>
<bold>(A)</bold> Top 20 feature importance scores averaged from all eight CBSAs for all time scales, and <bold>(B&#x2013;I)</bold> Top 15 feature importance scores separately for each CBSA for the yearly models. <italic>x</italic>-axes: feature importance scores, <italic>y</italic>-axes: feature abbreviations.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g005.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>3.3 Mobility-based exposure</title>
<p>Mobility-based exposure (MBE) for each census block group was calculated to analyze visitor exposure to air pollution, which was shown to vary significantly in terms of spatial distribution and temporal variation. Spatial patterns of MBE reveal hotspots of high visitor-number-weighted exposure to PM<sub>2.5</sub> and places where visitors are less exposed to this pollutant. For Los Angeles, the highest MBE values are found in areas near major transportation hubs and business districts, such as the Los Angeles International Airport, the Port of Los Angeles and Port of Long Beach, and Hollywood (<xref ref-type="fig" rid="F6">Figure 6A</xref>). These values can vary over the time of the day and month of the year, as illustrated in the enlarged areas (<xref ref-type="fig" rid="F6">Figures 6B&#x2013;F</xref>). The months chosen to illustrate the temporal variation in <xref ref-type="fig" rid="F6">Figures 6B&#x2013;F</xref> were selected to represent different seasons throughout the year, to capture the potential seasonal differences in PM<sub>2.5</sub> exposure and visitor patterns in various areas of Los Angeles. These months also provide a comprehensive picture of visitor exposure to PM<sub>2.5</sub> throughout the year. Most of the CBGs demonstrate a relatively consistent pattern of MBE throughout the day, which might result from the levels of PM<sub>2.5</sub> or a relatively low number of visitors throughout the day. CBGs that deviate significantly from the general temporal patterns show clear diurnal trends. One typical trend is that MBE increases during the day, peaking in the late morning to early afternoon as people arrive for work or other activities. MBE declines in the late afternoon and evening as people finish their activities and return home or to their lodging. Another typical trend is that MBE remains high at night and decreases during the day, which might result from high PM<sub>2.5</sub> concentrations or high visitor numbers at night.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>
<bold>(A)</bold> Spatial patterns of percentiles for the annual average MBE for Los Angeles <bold>(B&#x2013;F)</bold> Zoomed in high MBE-valued regions and their hourly MBE changes during a selected month. Each line corresponds to a particular CBG.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g006.tif"/>
</fig>
<p>MBE to PM<sub>2.5</sub> is a measure of the average exposure of visitors in a particular area, considering both the concentration of PM<sub>2.5</sub> in the air and the number of visitors present in the area. If either of these factors is exceptionally high, the MBE value may appear as a hotspot within the region, indicating a potential area of concern for air quality and public health. However, it is important to note that a high MBE value on its own does not necessarily indicate significant exposure risks without examining both factors separately. For example, a high MBE value may be due to a high number of visitors in an area with relatively low PM<sub>2.5</sub> concentrations (<xref ref-type="fig" rid="F7">Figure 7A</xref>, Airport), or it could be due to a relatively low number of visitors in an area with very high PM<sub>2.5</sub> concentrations (<xref ref-type="fig" rid="F7">Figure 7B</xref>, Long Beach). In the first case, the risk of exposure to PM<sub>2.5</sub> for visitors may be relatively low, while in the second case, the risk may be much higher. Therefore, examining the visitor numbers and PM<sub>2.5</sub> concentrations separately is important to get a complete picture of potential exposure risks. This can help policymakers and public health officials develop targeted strategies to reduce exposure to PM<sub>2.5</sub> and improve air quality in areas with high levels of visitor activity.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Spatial patterns of MBE, AutoML-LUR estimates of PM<sub>2.5</sub>, and visitor numbers at different hours of a particular month for two subset regions in Los Angeles. CBGs with black boundaries represent AutoML-LUR estimates of PM<sub>2.5</sub> higher than 12&#xa0;&#x3bc;g/m<sup>3</sup>.</p>
</caption>
<graphic xlink:href="fenvs-11-1223160-g007.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>This research aimed to improve the accuracy of LUR models for urban air pollution exposure assessments by integrating AutoML and low-cost sensor networks. AutoML-LUR models were developed and tested in eight CBSAs in the US, and results showed that integrating PurpleAir data into the models improves their prediction performance, particularly in areas with scarce regulatory monitoring stations. However, models developed using both EPA and PurpleAir data showed higher variance across different CBSAs compared to those developed using only EPA data. Based on the AutoML-LUR models, feature importance was calculated to identify the key factors that are spatially correlated with PM<sub>2.5</sub> concentrations. Results showed that in most CBSAs, satellite AOD, meteorological variables, temporal indicators, NDVI, and land use remain highly correlated with PM<sub>2.5</sub> concentrations, while the other important features vary in different spatial regions. Additionally, the study calculated mobility-based exposure (MBE) to PM<sub>2.5</sub> using aggregated human mobility data from SafeGraph to understand how these exposures vary spatially and temporally. The results showed that areas with higher MBE values are found in neighborhoods with a high number of major transportation hubs, industries, and businesses, which might result from a high PM<sub>2.5</sub> concentration or large visitor numbers.</p>
<p>This study has several implications for environmental science and public health. First, integrating AutoML and low-cost sensor networks can improve the accuracy of LUR models for air pollution exposure assessments, which provides more precise and reliable data for public health studies and decision-making. Second, spatial heterogeneity and temporal variations in the key factors relevant to PM<sub>2.5</sub> concentrations can be used to develop more effective strategies to reduce exposure to air pollution and improve public health. Public health officials and policymakers can use the outcome of this research to 1) develop targeted interventions, such as reducing emissions from major transportation hubs or promoting green spaces in areas with high PM<sub>2.5</sub> concentrations, at a finer temporal interval; 2) identify areas where vulnerable populations may be at higher risk of exposure to air pollution, 3) promote the use of low-cost sensor networks to improve air quality monitoring in their communities, and 4) support evidence-based decision-making on issues such as air quality regulations, land use planning, and transportation policies using AutoML algorithms to integrate multiple sources of data.</p>
<p>There are several limitations of this study. First, the AutoML-LUR method relies heavily on the data availability and accuracy of air quality monitors. Incorporating PurpleAir measurements improved overall accuracy, but their data quality needs screening and calibration. Limited sensor distribution in some areas can result in higher uncertainty in the predictions made by the model for these areas. Second, models with higher temporal resolution may suffer from higher uncertainty due to the greater complexity of the model and the need to account for more variables. This can be particularly challenging in estimating PM<sub>2.5</sub>, which can dramatically vary event by event due to scenarios such as wildfires, fireworks, and dust storms. To address this issue, it may be necessary to develop event-specific models or use more sophisticated models incorporating pattern shifts due to these events (<xref ref-type="bibr" rid="B60">Yu et al., 2022</xref>). Third, SafeGraph data represents approximate mobility patterns for a community rather than individual people, limiting its representativeness in estimating the visitor population for each census block group. Using data from the American Community Survey (ACS) can supplement the mobility data and improve representativeness. Although ACS is conducted annually and does not provide real-time individual-level data, it can provide valuable context to the SafeGraph mobility data. For example, using the ACS data, we can infer demographic and socioeconomic characteristics of individuals such as income, education level, race/ethnicity, and age. By combining this information with SafeGraph data, we can enhance our understanding of who is visiting these locations and thus refine our exposure estimates by accounting for these demographics, which can influence exposure susceptibility and behavior. Lastly, SafeGraph employs data suppression techniques to protect individual privacy. 
This can result in underrepresentation or inaccuracies in visitation data for locations with low foot traffic or device count, potentially impacting the accuracy of our exposure estimates in these areas (<xref ref-type="bibr" rid="B21">Hu et al., 2021</xref>). Future research should explore methods to counteract the effects of data suppression and investigate the specific impacts of these techniques on exposure estimates.</p>
<p>Future studies should address several challenges to improve our understanding of human exposure to air pollution and its impact on public health. First, socio-economic disparities in human exposure to PM<sub>2.5</sub> can be further analyzed to identify and address potential health inequities. Understanding how different groups are affected by air pollution, based on their socioeconomic and demographic characteristics, is essential for public health planning and policy-making. Second, exposure models integrating numerical and human mobility simulation have been valuable tools for understanding and predicting human exposure to air pollution and other environmental hazards. These simulations involve 1) physical processes that govern the movement and dispersion of pollutants in the environment and 2) the movements and activities of individuals to estimate their exposure to these pollutants. Using data on human mobility, such as that provided by SafeGraph, can help to improve the accuracy and reliability of these models by providing more detailed and realistic information on the movements and activities of individuals. By incorporating this information into the model, it may be possible to predict and understand how people are exposed to air pollution more accurately. Finally, follow-up research should investigate indoor air quality exposure and compare it to outdoor exposure in this study. Understanding the differences between indoor and outdoor exposure and their effects on public health outcomes can help to inform strategies for improving air quality and reducing exposure to pollutants.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The raw data that support the findings of this study are available from the corresponding author upon reasonable request.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>MY: data curation, conceptualization, methodology, code implementation, experiment, result analysis, paper writing. SZ: conceptualization, paper review and editing. KZ: conceptualization, paper writing, paper review and editing. JY: data curation, methodology, paper review and editing. MV: code implementation, experiment. JM: code implementation, experiment. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This research is funded by the Miller Faculty Fellow Award from the College of Earth and Mineral Sciences at Penn State University.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ardon-Dryer</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dryer</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>J. N.</given-names>
</name>
<name>
<surname>Moghimi</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Measurements of PM<sub>2.5</sub> with PurpleAir under atmospheric conditions</article-title>. <source>Atmos. Meas. Tech.</source> <volume>13</volume>, <fpage>5441</fpage>&#x2013;<lpage>5458</lpage>. <pub-id pub-id-type="doi">10.5194/amt-13-5441-2020</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barkjohn</surname>
<given-names>K. K.</given-names>
</name>
<name>
<surname>Gantt</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Clements</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Development and application of a United States-wide correction for PM<sub>2.5</sub> data collected with the PurpleAir sensor</article-title>. <source>Atmos. Meas. Tech.</source> <volume>14</volume>, <fpage>4617</fpage>&#x2013;<lpage>4637</lpage>. <pub-id pub-id-type="doi">10.5194/amt-14-4617-2021</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beelen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Raaschou-Nielsen</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Stafoggia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Andersen</surname>
<given-names>Z. J.</given-names>
</name>
<name>
<surname>Weinmayr</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hoffmann</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Effects of long-term exposure to air pollution on natural-cause mortality: An analysis of 22 European cohorts within the multicentre ESCAPE project</article-title>. <source>Lancet</source> <volume>383</volume>, <fpage>785</fpage>&#x2013;<lpage>795</lpage>. <pub-id pub-id-type="doi">10.1016/S0140-6736(13)62158-3</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Carmona</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Blanco</surname>
<given-names>M. N.</given-names>
</name>
<name>
<surname>Gassett</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Seto</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Szpiro</surname>
<given-names>A. A.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Publicly available low-cost sensor measurements for PM<sub>2.5</sub> exposure modeling: Guidance for monitor deployment and data selection</article-title>. <source>Environ. Int.</source> <volume>158</volume>, <fpage>106897</fpage>. <pub-id pub-id-type="doi">10.1016/j.envint.2021.106897</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Black</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tesfaigzi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bassein</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>L. A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Wildfire smoke exposure and human health: Significant gaps in research for a growing public health issue</article-title>. <source>Environ. Toxicol. Pharmacol.</source> <volume>55</volume>, <fpage>186</fpage>&#x2013;<lpage>195</lpage>. <pub-id pub-id-type="doi">10.1016/j.etap.2017.08.022</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Random forests</article-title>. <source>Mach. Learn.</source> <volume>45</volume>, <fpage>5</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Butt</surname>
<given-names>E. W.</given-names>
</name>
<name>
<surname>Turnock</surname>
<given-names>S. T.</given-names>
</name>
<name>
<surname>Rigby</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Reddington</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Yoshioka</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Johnson</surname>
<given-names>J. S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Global and regional trends in particulate air pollution and attributable health burden over the past 50 years</article-title>. <source>Environ. Res. Lett.</source> <volume>12</volume>, <fpage>104017</fpage>. <pub-id pub-id-type="doi">10.1088/1748-9326/aa87be</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Canha</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Diapouli</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Almeida</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Integrated human exposure to air pollution</article-title>. <source>Int. J. Environ. Res. Public Health</source> <volume>18</volume>, <fpage>2233</fpage>. <pub-id pub-id-type="doi">10.3390/ijerph18052233</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Caubel</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Cados</surname>
<given-names>T. E.</given-names>
</name>
<name>
<surname>Preble</surname>
<given-names>C. V.</given-names>
</name>
<name>
<surname>Kirchstetter</surname>
<given-names>T. W.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A distributed network of 100 black carbon sensors for 100 days of air quality monitoring in West Oakland, California</article-title>. <source>Environ. Sci. Technol.</source> <volume>53</volume>, <fpage>7564</fpage>&#x2013;<lpage>7573</lpage>. <pub-id pub-id-type="doi">10.1021/acs.est.9b00282</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Coker</surname>
<given-names>E. S.</given-names>
</name>
<name>
<surname>Amegah</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Mwebaze</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ssematimba</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bainomugisha</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A land use regression model using machine learning and locally developed low cost particulate matter sensors in Uganda</article-title>. <source>Environ. Res.</source> <volume>199</volume>, <fpage>111352</fpage>. <pub-id pub-id-type="doi">10.1016/j.envres.2021.111352</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Coston</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Guha</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ouyang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chouldechova</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>D. E.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Leveraging administrative data for bias audits: Assessing disparate coverage with mobility data for COVID-19 policy</article-title>,&#x201d; in <source>Proceedings of the 2021 ACM conference on fairness, accountability, and transparency</source>, <fpage>173</fpage>&#x2013;<lpage>184</lpage>. <pub-id pub-id-type="doi">10.1145/3442188.3445881</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>de Souza</surname>
<given-names>J. B.</given-names>
</name>
<name>
<surname>Reisen</surname>
<given-names>V. A.</given-names>
</name>
<name>
<surname>Franco</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Isp&#xe1;ny</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bondon</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Santos</surname>
<given-names>J. M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Generalized additive models with principal component analysis: An application to time series of respiratory disease and air pollution data</article-title>. <source>J. R. Stat. Soc. Ser. C (Appl. Statistics)</source> <volume>67</volume>, <fpage>453</fpage>&#x2013;<lpage>480</lpage>. <pub-id pub-id-type="doi">10.1111/rssc.12239</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Feurer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Eggensperger</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Falkner</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lindauer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <source>Auto-sklearn 2.0: Hands-free AutoML via meta-learning</source>.</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Feurer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Eggensperger</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Springenberg</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Blum</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Efficient and robust automated machine learning</article-title>,&#x201d; in <source>Advances in neural information processing systems</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates, Inc</publisher-name>).</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fisher</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rudin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Dominici</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <source>All models are wrong, but many are useful: Learning a variable&#x2019;s importance by studying an entire class of prediction models simultaneously</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1801.01489</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fowlie</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Walker</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wooley</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Climate policy, environmental justice, and local air pollution</article-title>. <source>Brookings Econ. Stud.</source> <volume>27</volume>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gelaro</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>McCarty</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Su&#xe1;rez</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Todling</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Molod</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Takacs</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>The modern-era retrospective analysis for research and applications, version 2 (MERRA-2)</article-title>. <source>J. Clim.</source> <volume>30</volume>, <fpage>5419</fpage>&#x2013;<lpage>5454</lpage>. <pub-id pub-id-type="doi">10.1175/JCLI-D-16-0758.1</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Doraiswamy</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Levy</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Pikelnaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Maibach</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Feenstra</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Impact of California fires on local and regional air quality: The role of a low-cost sensor network and satellite observations</article-title>. <source>GeoHealth</source> <volume>2</volume>, <fpage>172</fpage>&#x2013;<lpage>181</lpage>. <pub-id pub-id-type="doi">10.1029/2018GH000136</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lam</surname>
<given-names>J. C. K.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>V. O. K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A domain-specific Bayesian deep-learning approach for air pollution forecast</article-title>. <source>IEEE Trans. Big Data</source> <volume>8</volume>, <fpage>1034</fpage>&#x2013;<lpage>1046</lpage>. <pub-id pub-id-type="doi">10.1109/TBDATA.2020.3005368</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hoek</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Beelen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>de Hoogh</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Vienneau</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Gulliver</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2008</year>). <article-title>A review of land-use regression models to assess spatial variation of outdoor air pollution</article-title>. <source>Atmos. Environ.</source> <volume>42</volume>, <fpage>7561</fpage>&#x2013;<lpage>7578</lpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2008.05.057</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>She</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Human mobility data in the COVID-19 pandemic: Characteristics, applications, and challenges</article-title>. <source>Int. J. Digital Earth</source> <volume>14</volume>, <fpage>1126</fpage>&#x2013;<lpage>1147</lpage>. <pub-id pub-id-type="doi">10.1080/17538947.2021.1952324</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jain</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Presto</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Zimmerman</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Spatial modeling of daily PM<sub>2.5</sub>, NO<sub>2</sub>, and CO concentrations measured by a low-cost sensor network: Comparison of linear, machine learning, and hybrid land use models</article-title>. <source>Environ. Sci. Technol.</source> <volume>55</volume>, <fpage>8631</fpage>&#x2013;<lpage>8641</lpage>. <pub-id pub-id-type="doi">10.1021/acs.est.1c02653</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jbaily</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>T.-H.</given-names>
</name>
<name>
<surname>Kamareddine</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Verguet</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Air pollution exposure disparities across US population and income groups</article-title>. <source>Nature</source> <volume>601</volume>, <fpage>228</fpage>&#x2013;<lpage>233</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-021-04190-y</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Keller</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Olives</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.-Y.</given-names>
</name>
<name>
<surname>Sheppard</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Sampson</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Szpiro</surname>
<given-names>A. A.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>A unified spatiotemporal modeling approach for predicting concentrations of multiple air pollutants in the multi-ethnic study of Atherosclerosis and air pollution</article-title>. <source>Environ. Health Perspect.</source> <volume>123</volume>, <fpage>301</fpage>&#x2013;<lpage>309</lpage>. <pub-id pub-id-type="doi">10.1289/ehp.1408145</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kelly</surname>
<given-names>K. E.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>W. W.</given-names>
</name>
<name>
<surname>Sayahi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Mitchell</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Becnel</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gaillardon</surname>
<given-names>P.-E.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Community-based measurements reveal unseen differences during air pollution episodes</article-title>. <source>Environ. Sci. Technol.</source> <volume>55</volume>, <fpage>120</fpage>&#x2013;<lpage>128</lpage>. <pub-id pub-id-type="doi">10.1021/acs.est.0c02341</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kirwa</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Szpiro</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Sheppard</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Sampson</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Keller</surname>
<given-names>J. P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Fine-scale air pollution models for epidemiologic research: Insights from approaches developed in the multi-ethnic study of Atherosclerosis and air pollution (MESA air)</article-title>. <source>Curr. Environ. Health Rep.</source> <volume>8</volume>, <fpage>113</fpage>&#x2013;<lpage>126</lpage>. <pub-id pub-id-type="doi">10.1007/s40572-021-00310-y</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brauer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tsui</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Land use regression modelling of air pollution in high density high rise cities: A case study in Hong Kong</article-title>. <source>Sci. Total Environ.</source> <volume>592</volume>, <fpage>306</fpage>&#x2013;<lpage>315</lpage>. <pub-id pub-id-type="doi">10.1016/j.scitotenv.2017.03.094</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Girguis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lurmann</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Pavlovic</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>McClure</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Franklin</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Ensemble-based deep learning for estimating PM<sub>2.5</sub> over California with multisource big data including wildfire smoke</article-title>. <source>Environ. Int.</source> <volume>145</volume>, <fpage>106143</fpage>. <pub-id pub-id-type="doi">10.1016/j.envint.2020.106143</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bechle</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Presto</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Hankey</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Using crowd-sourced low-cost sensors in a land use regression of PM<sub>2.5</sub> in 6 US cities</article-title>. <source>Air Qual. Atmos. Health</source> <volume>15</volume>, <fpage>667</fpage>&#x2013;<lpage>678</lpage>. <pub-id pub-id-type="doi">10.1007/s11869-022-01162-7</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Giuliano</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Habre</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Estimating hourly PM<sub>2.5</sub> concentrations at the neighborhood scale using a low-cost air sensor network: A Los Angeles case study</article-title>. <source>Environ. Res.</source> <volume>195</volume>, <fpage>110653</fpage>. <pub-id pub-id-type="doi">10.1016/j.envres.2020.110653</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Longley</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Salmond</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>PyLUR: Efficient software for land use regression modeling the spatial distribution of air pollutants using GDAL/OGR library in Python</article-title>. <source>Front. Environ. Sci. Eng.</source> <volume>14</volume>, <fpage>44</fpage>. <pub-id pub-id-type="doi">10.1007/s11783-020-1221-5</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Masiol</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Z&#xed;kov&#xe1;</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Chalupa</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Rich</surname>
<given-names>D. Q.</given-names>
</name>
<name>
<surname>Ferro</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Hopke</surname>
<given-names>P. K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Hourly land-use regression models based on low-cost PM monitor data</article-title>. <source>Environ. Res.</source> <volume>167</volume>, <fpage>7</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1016/j.envres.2018.06.052</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McDuffie</surname>
<given-names>E. E.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>R. V.</given-names>
</name>
<name>
<surname>Spadaro</surname>
<given-names>J. V.</given-names>
</name>
<name>
<surname>Burnett</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>O&#x2019;Rourke</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Source sector and fuel contributions to ambient PM<sub>2.5</sub> and attributable mortality across multiple spatial scales</article-title>. <source>Nat. Commun.</source> <volume>12</volume>, <fpage>3594</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-23853-y</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.-F.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>A land use regression model for estimating the NO<sub>2</sub> concentration in Shanghai, China</article-title>. <source>Environ. Res.</source> <volume>137</volume>, <fpage>308</fpage>&#x2013;<lpage>315</lpage>. <pub-id pub-id-type="doi">10.1016/j.envres.2015.01.003</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Molod</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Takacs</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Suarez</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bacmeister</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Development of the GEOS-5 atmospheric general circulation model: Evolution from MERRA to MERRA2</article-title>. <source>Geosci. Model. Dev.</source> <volume>8</volume>, <fpage>1339</fpage>&#x2013;<lpage>1356</lpage>. <pub-id pub-id-type="doi">10.5194/gmd-8-1339-2015</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morley</surname>
<given-names>D. W.</given-names>
</name>
<name>
<surname>Gulliver</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A land use regression variable generation, modelling and prediction tool for air pollution exposure assessment</article-title>. <source>Environ. Model. Softw.</source> <volume>105</volume>, <fpage>17</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1016/j.envsoft.2018.03.030</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muttoo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ramsay</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Brunekreef</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Beelen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Meliefste</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Naidoo</surname>
<given-names>R. N.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Land use regression modelling estimating nitrogen oxides exposure in industrial south Durban, South Africa</article-title>. <source>Sci. Total Environ.</source> <volume>610&#x2013;611</volume>, <fpage>1439</fpage>&#x2013;<lpage>1447</lpage>. <pub-id pub-id-type="doi">10.1016/j.scitotenv.2017.07.278</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nyhan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Grauwin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Britter</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Misstear</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>McNabola</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Laden</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>&#x201c;Exposure track&#x201d;&#x2014;the impact of mobile-device-based mobility patterns on quantifying population exposure to air pollution</article-title>. <source>Environ. Sci. Technol.</source> <volume>50</volume>, <fpage>9671</fpage>&#x2013;<lpage>9681</lpage>. <pub-id pub-id-type="doi">10.1021/acs.est.6b02385</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nyhan</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Kloog</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Britter</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ratti</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Koutrakis</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Quantifying population exposure to air pollution using individual mobility patterns inferred from mobile phone data</article-title>. <source>J. Expo. Sci. Environ. Epidemiol.</source> <volume>29</volume>, <fpage>238</fpage>&#x2013;<lpage>247</lpage>. <pub-id pub-id-type="doi">10.1038/s41370-018-0038-9</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Orun</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Elizondo</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goodyer</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Paluszczyszyn</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Use of Bayesian inference method to model vehicular air pollution in local urban areas</article-title>. <source>Transp. Res. Part D Transp. Environ.</source> <volume>63</volume>, <fpage>236</fpage>&#x2013;<lpage>243</lpage>. <pub-id pub-id-type="doi">10.1016/j.trd.2018.05.009</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Raffuse</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sullivan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>McCarthy</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Penfold</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hafner</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2007</year>). <source>Ambient air monitoring network assessment guidance, analytical techniques for technical assessments of ambient air monitoring networks</source>. <comment>(Retrieved July 20, 2007)</comment>.</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Randles</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>da Silva</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Buchard</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Colarco</surname>
<given-names>P. R.</given-names>
</name>
<name>
<surname>Darmenov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Govindaraju</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>The MERRA-2 aerosol reanalysis, 1980 onward. Part I: System description and data assimilation evaluation</article-title>. <source>J. Clim.</source> <volume>30</volume>, <fpage>6823</fpage>&#x2013;<lpage>6850</lpage>. <pub-id pub-id-type="doi">10.1175/JCLI-D-16-0609.1</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ravindra</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Rattan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mor</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Aggarwal</surname>
<given-names>A. N.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Generalized additive models: Building evidence of air pollution, climate change and human health</article-title>. <source>Environ. Int.</source> <volume>132</volume>, <fpage>104987</fpage>. <pub-id pub-id-type="doi">10.1016/j.envint.2019.104987</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reid</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Brauer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Johnston</surname>
<given-names>F. H.</given-names>
</name>
<name>
<surname>Jerrett</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Balmes</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Elliott</surname>
<given-names>C. T.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Critical review of health impacts of wildfire smoke exposure</article-title>. <source>Environ. Health Perspect.</source> <volume>124</volume>, <fpage>1334</fpage>&#x2013;<lpage>1343</lpage>. <pub-id pub-id-type="doi">10.1289/ehp.1409277</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li&#x161;ka</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Vieno</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Carnell</surname>
<given-names>E. J.</given-names>
</name>
<name>
<surname>Beck</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Clemens</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>The influence of residential and workday population mobility on exposure to air pollution in the UK</article-title>. <source>Environ. Int.</source> <volume>121</volume>, <fpage>803</fpage>&#x2013;<lpage>813</lpage>. <pub-id pub-id-type="doi">10.1016/j.envint.2018.10.005</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Mi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Georgopoulos</surname>
<given-names>P. G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Comparison of Machine Learning and Land Use Regression for fine scale spatiotemporal estimation of ambient air pollution: Modeling ozone concentrations across the contiguous United States</article-title>. <source>Environ. Int.</source> <volume>142</volume>, <fpage>105827</fpage>. <pub-id pub-id-type="doi">10.1016/j.envint.2020.105827</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roberts</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>A critical assessment of shrinkage-based regression approaches for estimating the adverse health effects of multiple air pollutants</article-title>. <source>Atmos. Environ.</source> <volume>39</volume>, <fpage>6223</fpage>&#x2013;<lpage>6230</lpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2005.07.004</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sengupta</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lopez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Habte</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Maclaurin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shelby</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The national solar radiation data base (NSRDB)</article-title>. <source>Renew. Sustain. Energy Rev.</source> <volume>89</volume>, <fpage>51</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1016/j.rser.2018.03.003</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shaddick</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Thomas</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Mudu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ruggeri</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gumy</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Half the world&#x2019;s population are exposed to increasing air pollution</article-title>. <source>npj Clim. Atmos. Sci.</source> <volume>3</volume>, <fpage>23</fpage>. <pub-id pub-id-type="doi">10.1038/s41612-020-0124-2</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Squire</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>What about bias in the SafeGraph dataset?</article-title> <comment>[WWW Document]. Available at: <ext-link ext-link-type="uri" xlink:href="https://www.safegraph.com/blog/what-about-bias-in-the-safegraph-dataset">https://www.safegraph.com/blog/what-about-bias-in-the-safegraph-dataset</ext-link> (accessed July 13, 2022)</comment>.</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kong</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Meteorology impact on PM<sub>2.5</sub> change over a receptor region in the regional transport of air pollutants: Observational study of recent emission reductions in central China</article-title>. <source>Atmos. Chem. Phys.</source> <volume>22</volume>, <fpage>3579</fpage>&#x2013;<lpage>3593</lpage>. <pub-id pub-id-type="doi">10.5194/acp-22-3579-2022</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thomas</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Jerrett</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kuenzli</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Louis</surname>
<given-names>T. A.</given-names>
</name>
<name>
<surname>Dominici</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zeger</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>Bayesian model averaging in time-series studies of air pollution and mortality</article-title>. <source>J. Toxicol. Environ. Health, Part A</source> <volume>70</volume>, <fpage>311</fpage>&#x2013;<lpage>315</lpage>. <pub-id pub-id-type="doi">10.1080/15287390600884941</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tryner</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>L&#x2019;Orange</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Mehaffy</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Miller-Lionberg</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hofstetter</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Wilson</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Laboratory evaluation of low-cost PurpleAir PM monitors and in-field correction using co-located portable filter samplers</article-title>. <source>Atmos. Environ.</source> <volume>220</volume>, <fpage>117067</fpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2019.117067</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wallace</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Corr</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kanaroglou</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Topographic and spatial impacts of temperature inversions on air quality using mobile air pollution surveys</article-title>. <source>Sci. Total Environ.</source> <volume>408</volume>, <fpage>5086</fpage>&#x2013;<lpage>5096</lpage>. <pub-id pub-id-type="doi">10.1016/j.scitotenv.2010.06.020</pub-id>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wallace</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ott</surname>
<given-names>W. R.</given-names>
</name>
<name>
<surname>Sarnat</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Calibration of low-cost PurpleAir outdoor monitors using an improved method of calculating PM<sub>2.5</sub>
</article-title>. <source>Atmos. Environ.</source> <volume>256</volume>, <fpage>118432</fpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2021.118432</pub-id>
</citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weissert</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Alberti</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Miles</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Miskell</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Feenstra</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Henshaw</surname>
<given-names>G. S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Low-cost sensor networks and land-use regression: Interpolating nitrogen dioxide concentration at high temporal and spatial resolution in Southern California</article-title>. <source>Atmos. Environ.</source> <volume>223</volume>, <fpage>117287</fpage>. <pub-id pub-id-type="doi">10.1016/j.atmosenv.2020.117287</pub-id>
</citation>
</ref>
<ref id="B57">
<citation citation-type="web">
<collab>WHO</collab> (<year>2022</year>). <source>Billions of people still breathe unhealthy air: new WHO data</source>. <comment>[WWW Document]. Available at: <ext-link ext-link-type="uri" xlink:href="https://www.who.int/news/item/04-04-2022-billions-of-people-still-breathe-unhealthy-air-new-who-data">https://www.who.int/news/item/04-04-2022-billions-of-people-still-breathe-unhealthy-air-new-who-data</ext-link> (accessed 10 8, 22)</comment>.</citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wong</surname>
<given-names>P.-Y.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.-Y.</given-names>
</name>
<name>
<surname>Teo</surname>
<given-names>T.-A.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.-W.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>H.-R.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Incorporating land-use regression into machine learning algorithms in estimating the spatial-temporal variation of carbon monoxide in Taiwan</article-title>. <source>Environ. Model. Softw.</source> <volume>139</volume>, <fpage>104996</fpage>. <pub-id pub-id-type="doi">10.1016/j.envsoft.2021.104996</pub-id>
</citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Brauer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Raffuse</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Henderson</surname>
<given-names>S. B.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Machine learning approach to estimate hourly exposure to fine particulate matter for urban, rural, and remote populations during wildfire seasons</article-title>. <source>Environ. Sci. Technol.</source> <volume>52</volume>, <fpage>13239</fpage>&#x2013;<lpage>13249</lpage>. <pub-id pub-id-type="doi">10.1021/acs.est.8b01921</pub-id>
</citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Masrur</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Blaszczak-Boxe</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Predicting hourly PM2.5 concentrations in wildfire-prone areas using a SpatioTemporal Transformer model</article-title>. <source>Sci. Total Environ.</source> <volume>860</volume>, <fpage>160446</fpage>. <pub-id pub-id-type="doi">10.1016/j.scitotenv.2022.160446</pub-id>
</citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ivey</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Gurram</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sivaraman</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Quantifying the impact of daily mobility on errors in air pollution exposure estimation using mobile phone location data</article-title>. <source>Environ. Int.</source> <volume>141</volume>, <fpage>105772</fpage>. <pub-id pub-id-type="doi">10.1016/j.envint.2020.105772</pub-id>
</citation>
</ref>
<ref id="B62">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zimmerman</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H. Z.</given-names>
</name>
<name>
<surname>Ellis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hauryliuk</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>E. S.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Improving correlations between land use and air pollutant concentrations using wavelet analysis: Insights from a low-cost sensor network</article-title>. <source>Aerosol Air Qual. Res.</source> <volume>20</volume>, <fpage>314</fpage>&#x2013;<lpage>328</lpage>. <pub-id pub-id-type="doi">10.4209/aaqr.2019.03.0124</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>