<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdata.2024.1485344</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Exploring code portability solutions for HEP with a particle tracking test code</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Ather</surname> <given-names>Hammad</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Berkman</surname> <given-names>Sophie</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Cerati</surname> <given-names>Giuseppe</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2819162/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kortelainen</surname> <given-names>Matti J.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1139057/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kwok</surname> <given-names>Ka Hei Martin</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2871716/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lantz</surname> <given-names>Steven</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lee</surname> <given-names>Seyong</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2845474/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Norris</surname> <given-names>Boyana</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2862256/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Reid</surname> <given-names>Michael</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Reinsvold Hall</surname> <given-names>Allison</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2717696/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Riley</surname> <given-names>Daniel</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2862209/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Strelchenko</surname> <given-names>Alexei</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2871972/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Cong</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Computer and Information Science, University of Oregon</institution>, <addr-line>Eugene, OR</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Physics and Astronomy, Michigan State University</institution>, <addr-line>East Lansing, MI</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>Computational Science and AI Directorate, Fermi National Accelerator Laboratory</institution>, <addr-line>Batavia, IL</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Physics, Cornell University</institution>, <addr-line>Ithaca, NY</addr-line>, <country>United States</country></aff>
<aff id="aff5"><sup>5</sup><institution>Computer Science and Mathematics Division, Oak Ridge National Laboratory</institution>, <addr-line>Oak Ridge, TN</addr-line>, <country>United States</country></aff>
<aff id="aff6"><sup>6</sup><institution>Physics Department, United States Naval Academy</institution>, <addr-line>Annapolis, MD</addr-line>, <country>United States</country></aff>
<aff id="aff7"><sup>7</sup><institution>School of Computing, Clemson University</institution>, <addr-line>Clemson, SC</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Matt LeBlanc, Brown University, United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Davide Costanzo, The University of Sheffield, United Kingdom</p>
<p>Enrico Bothmann, University of G&#x000F6;ttingen, Germany</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Allison Reinsvold Hall <email>achall&#x00040;usna.edu</email></corresp>
<corresp id="c002">Giuseppe Cerati <email>cerati&#x00040;fnal.gov</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>10</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>7</volume>
<elocation-id>1485344</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>09</day>
<month>10</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Ather, Berkman, Cerati, Kortelainen, Kwok, Lantz, Lee, Norris, Reid, Reinsvold Hall, Riley, Strelchenko and Wang.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Ather, Berkman, Cerati, Kortelainen, Kwok, Lantz, Lee, Norris, Reid, Reinsvold Hall, Riley, Strelchenko and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Traditionally, high energy physics (HEP) experiments have relied on x86 CPUs for the majority of their significant computing needs. As the field looks ahead to the next generation of experiments such as DUNE and the High-Luminosity LHC, the computing demands are expected to increase dramatically. To cope with this increase, it will be necessary to take advantage of all available computing resources, including GPUs from different vendors. A broad landscape of code portability tools&#x02014;including compiler pragma-based approaches, abstraction libraries, and other tools&#x02014;allow the same source code to run efficiently on multiple architectures. In this paper, we use a test code taken from a HEP tracking algorithm to compare the performance and experience of implementing different portability solutions. While in several cases portable implementations perform close to the reference code version, we find that the performance varies significantly depending on the details of the implementation. Achieving optimal performance is not easy, even for relatively simple applications such as the test codes considered in this work. Several factors can affect the performance, such as the choice of the memory layout, the memory pinning strategy, and the compiler used. The compilers and tools are being actively developed, so future developments may be critical for their deployment in HEP experiments.</p></abstract>
<kwd-group>
<kwd>heterogeneous computing</kwd>
<kwd>portability solutions</kwd>
<kwd>heterogeneous architectures</kwd>
<kwd>code portability</kwd>
<kwd>particle tracking</kwd>
</kwd-group>
<contract-sponsor id="cn001">U.S. Department of Energy<named-content content-type="fundref-id">10.13039/100000015</named-content></contract-sponsor>
<counts>
<fig-count count="7"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="25"/>
<page-count count="13"/>
<word-count count="10400"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Big Data and AI in High Energy Physics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Modern high energy physics (HEP) experiments have to process enormous volumes of data in their search to probe extremely rare interactions between fundamental particles. The Compact Muon Solenoid (CMS) experiment (CMS Collaboration, <xref ref-type="bibr" rid="B11">2008</xref>) at the CERN Large Hadron Collider (LHC), for example, processed hundreds of petabytes of detector data and Monte Carlo (MC) simulations during Run 2 (2015&#x02013;2018) of the LHC (CMS Offline Software and Computing, <xref ref-type="bibr" rid="B13">2021</xref>, <xref ref-type="bibr" rid="B14">2022</xref>). Within the next decade, HEP experiments such as the High-Luminosity LHC (HL-LHC) (Apollinari et al., <xref ref-type="bibr" rid="B2">2015</xref>) at CERN and the Deep Underground Neutrino Experiment (DUNE) (DUNE Collaboration, <xref ref-type="bibr" rid="B15">2020</xref>) at Fermilab will pose significant additional computing challenges. The event rate at the LHC is expected to increase by a factor of 7.5, and the data volumes will grow to exabyte scale. Likewise, the expected data rate of a DUNE far-detector module is 9.4 PB per year, and so the total tape volume is expected to exceed the exabyte scale by 2040 (DUNE Collaboration, <xref ref-type="bibr" rid="B16">2022</xref>). To handle these data volumes without sacrificing the physics potential of each experiment, significant R&#x00026;D&#x02014;and accompanying shifts in traditional HEP computing paradigms&#x02014;are required.</p>
<p>One paradigm shift that will help HEP experiments prepare for upcoming computing challenges is the ability to utilize parallel heterogeneous computing resources. Historically, the LHC experiments have relied on traditional x86 CPUs for the vast majority of offline computing needs. The majority of the data processing capabilities for the LHC experiments are provided by the Worldwide LHC Computing Grid (WLCG) (Bird, <xref ref-type="bibr" rid="B7">2011</xref>), which connects 170 computing centers in over 40 countries. Increasingly, however, experiments are adapting their software frameworks to take advantage of computing resources at High Performance Computing (HPC) centers (Megino et al., <xref ref-type="bibr" rid="B24">2023</xref>; DUNE Collaboration, <xref ref-type="bibr" rid="B16">2022</xref>). All planned exascale platforms rely heavily on GPUs to achieve their anticipated compute performance, and HEP workflows will need to run on GPUs in order to efficiently utilize these resources (Albrecht et al., <xref ref-type="bibr" rid="B1">2019</xref>).</p>
<p>Adapting HEP algorithms to run on GPUs is not a trivial task. For example, CMSSW (Jones et al., <xref ref-type="bibr" rid="B19">2006</xref>), the CMS software framework, includes almost 8 million lines of code (CMS Offline Software and Computing, <xref ref-type="bibr" rid="B13">2021</xref>) and was written by hundreds of scientists with varying software backgrounds over the course of decades. Additionally, it is not clear what compute architectures will be prevalent in HPC centers or international scientific computing grids in a decade or two, when the HL-LHC and DUNE experiments are collecting and analyzing data. Even the planned exascale machines in the US use a variety of architectures: Aurora at Argonne National Laboratory uses CPUs and GPUs from Intel, while Frontier at Oak Ridge National Laboratory and El Capitan at Lawrence Livermore National Laboratory rely on CPUs and GPUs from AMD. Initial attempts to port HEP algorithms to GPUs typically involved rewriting the original C&#x0002B;&#x0002B; code using CUDA. This process is labor-intensive and only enables offloading to NVIDIA GPUs. Moreover, significant efforts are required to optimize the performance of the initial implementations. The HIP programming language is very similar to CUDA and supports both NVIDIA and AMD GPUs (with very early support for Intel GPUs as well), but neither HIP nor CUDA supports CPU architectures directly. Writing and maintaining different implementations for every individual computing platform would take much more expertise and personpower than any HEP experiment can provide.</p>
<p>This is a widely recognized challenge in scientific computing, and there is a broad, rapidly changing landscape of portability solutions that allow a single source code to be compiled and run on a variety of computing backends. The available portability solutions vary widely in terms of overall approach, performance, maturity, and support for different backends or compilers. It is clear, however, that taking advantage of these portability tools will be an essential part of modernizing HEP software. Ideally, a portability solution would achieve two important goals. First, the portability solution should enable straightforward adaptations of existing HEP algorithms, with minimal rewriting and optimization required. Second, the tool should enable algorithms to run efficiently on a variety of different computing architectures, including both CPU and GPU platforms from different manufacturers. The performance on different architectures needs to be reasonable, on the same order of magnitude, although it is unlikely to match that of a fully optimized native implementation. In this paper, we used a standalone benchmark algorithm to test different code portability solutions and evaluated each in terms of its computational performance and subjective ease of use.</p>
<p>Programming models and C<monospace>&#x0002B;&#x0002B;</monospace> libraries such as Kokkos (Edwards et al., <xref ref-type="bibr" rid="B17">2014</xref>; Trott et al., <xref ref-type="bibr" rid="B25">2022</xref>) and Alpaka (Matthes et al., <xref ref-type="bibr" rid="B23">2017</xref>) provide high level data structures and parallel execution options that can be adapted to produce optimized code for a variety of backends, including CPUs and GPUs from NVIDIA, AMD, or Intel (preliminary). Another portability solution is the <monospace>std::execution::par</monospace> (stdpar) interface, which has been included in the C<monospace>&#x0002B;&#x0002B;</monospace> standard since C<monospace>&#x0002B;&#x0002B;</monospace> 17. The application programming interface (API) allows for a high level description of concurrent loops, but does not allow for low level optimizations that can be used to enhance performance in native CUDA or HIP. Various C<monospace>&#x0002B;&#x0002B;</monospace> compilers and associated libraries&#x02014;such as the oneAPI DPC<monospace>&#x0002B;&#x0002B;</monospace>/C<monospace>&#x0002B;&#x0002B;</monospace> Compiler (dpcpp) from Intel, and nvc&#x0002B;&#x0002B; from NVIDIA&#x02014;provide support for offloading loops to GPUs, but these compilers are still relatively new. Similarly, SYCL is a programming model based on the ISO C<monospace>&#x0002B;&#x0002B;</monospace>17 standard that enables host and kernel (device) code to be included in the same source file. Finally, there is a category of directive-based portability solutions, which includes OpenMP and OpenACC: through the use of pragmas, developers can specify high level parallelization and memory management behaviors, with the compilers managing the low level optimizations. 
It should be noted that all these portability solutions, though they are based on open specifications and open-source libraries, generally rely on proprietary, vendor-supplied software stacks (and often compilers) in order to run on particular GPUs.</p>
<p>This paper is organized as follows: In Section 2, we describe the motivation and context for this work. In Section 3, the benchmark algorithm is described in more detail. The different implementations are covered in Section 4, including technical details and a subjective discussion of the experience porting the algorithm to each tool. Compute performance results are shown in Section 5, and Section 6 provides an overall discussion of our experience and lessons learned.</p>
</sec>
<sec id="s2">
<title>2 Background and related work</title>
<p>There are several processing steps involved in analyzing data from a HEP experiment. For example, analyzing a time window at the LHC that contains at least one proton-proton collision (referred to as an &#x0201C;event&#x0201D;), includes the initial data acquisition, &#x0201C;reconstruction&#x0201D; of the raw detector data into higher level information including what particles were observed and their energies, and the final data analysis and statistical interpretations. This process is similar for other HEP experiments. In CMS and ATLAS (ATLAS Collaboration, <xref ref-type="bibr" rid="B4">2008</xref>), the most computationally expensive reconstruction step is track finding, which is the combinatorial process of reconstructing the trajectories of charged particles from the energy deposits (&#x0201C;hits&#x0201D;) they leave in different layers of the detector (see CMS Collaboration, <xref ref-type="bibr" rid="B12">2014</xref> for a full description from CMS). The benchmarks used for the results in this paper represent the propagation and Kalman update steps (described below) of a traditional Kalman Filter (KF) tracking algorithm (Fruhwirth, <xref ref-type="bibr" rid="B18">1987</xref>). There are two test codes, referred to as the &#x0201C;propagate to z&#x0201D; or &#x0201C;propagate to r&#x0201D; benchmarks, denoted by <monospace>p2z</monospace> and <monospace>p2r</monospace>, respectively. These compact programs (&#x0201C;mini apps&#x0201D;), although including functions extracted from the CMS tracking code, are standalone applications that can be more easily replicated, profiled, and optimized for the purpose of this work.</p>
<p>The work described in this paper builds off efforts by several other groups working to modernize HEP reconstruction algorithms. The <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks are part of a larger algorithm development effort known as <sc>mk</sc>F<sc>it</sc> (Lantz et al., <xref ref-type="bibr" rid="B21">2020</xref>). The goal of the <sc>mk</sc>F<sc>it</sc> project is to rewrite the traditional KF tracking algorithms used by most major HEP experiments and develop a new CPU implementation that is efficient, vectorized, and multithreaded. Depending on the compiler, the <sc>mk</sc>F<sc>it</sc> algorithm achieves up to a factor of six speedup compared to previous KF tracking implementations, and it is now the default algorithm used to reconstruct the majority of tracks in the CMS experiment (Cerati et al., <xref ref-type="bibr" rid="B9">2023</xref>). The key insight of the <sc>mk</sc>F<sc>it</sc> project is that the KF calculations can be parallelized over the thousands of tracks that may be present within a single detector event. Moreover, if the small matrices and vectors holding the data for each track are arranged in memory so that matching elements from different tracks are stored in adjacent locations, then vector or SIMD (Single Instruction, Multiple Data) operations can be used to perform the KF calculations. Similar efforts have also been effective at speeding up code for Liquid Argon Time Projection Chamber (LArTPC) neutrino experiments (Berkman et al., <xref ref-type="bibr" rid="B5">2022</xref>).</p>
<p>The <sc>mk</sc>F<sc>it</sc> effort has so far targeted optimizations for Intel multicore CPU architectures such as the Intel Xeon and Intel Xeon Phi processors and coprocessors, but efficient implementations for other architectures will become increasingly important, especially during the HL-LHC era. Given that <sc>mk</sc>F<sc>it</sc> was explicitly designed to create opportunities for vector or SIMD operations, it seems that GPUs should also make a suitable target platform for the <sc>mk</sc>F<sc>it</sc> approach to parallelizing Kalman filtering. However, initial attempts to port <sc>mk</sc>F<sc>it</sc> to the NVIDIA Kepler GPU (K40) using CUDA were not very encouraging, both in terms of difficulty and in terms of observed performance [for a full discussion, see Section 4 of Cerati et al. (<xref ref-type="bibr" rid="B10">2017</xref>)]. The irregular patterns of memory access that are occasionally needed in order to reorganize the data coming from different tracks turned out to be particularly challenging to manage on GPUs. Even with well-structured data, however, translating standard C&#x0002B;&#x0002B; code to be compatible with NVIDIA CUDA required significant low level re-coding effort to achieve acceptable performance for the basic KF operations. Since it is not feasible to rewrite <sc>mk</sc>F<sc>it</sc> for every possible architecture, the <monospace>p2z</monospace> project was started to explore code portability tools in the context of charged particle tracking.</p>
<p>A broader effort with similar motivation is the HEP Computational Center for Excellence (HEP-CCE) collaboration&#x00027;s Portable Parallelization Strategies (PPS) activity (Bhattacharya et al., <xref ref-type="bibr" rid="B6">2022</xref>; Atif et al., <xref ref-type="bibr" rid="B3">2023</xref>). The HEP-CCE PPS project is exploring portability solutions using representative reconstruction algorithms from CMS and ATLAS as well as LArTPC neutrino experiments such as DUNE. Collaborators from the HEP-CCE PPS activity became involved in this project and have used the <monospace>p2r</monospace> mini app to evaluate GPU offloading via different technologies, such as CUDA, OpenACC, and stdpar, as described in detail below.</p>
<p>Having two teams working simultaneously on these two complementary mini apps has proven to be important to the project&#x00027;s success. For many of the implementations described in Section 4, we found that it was relatively straightforward to do an initial porting of the algorithm but fairly difficult to have a fully optimized version. Different initial strategies in porting the <monospace>p2z</monospace> or <monospace>p2r</monospace> benchmark meant that multiple approaches could be simultaneously developed and tested. In several cases, an issue was identified in a specific <monospace>p2z</monospace> or <monospace>p2r</monospace> implementation and the solution was propagated to both mini apps. Having two different teams with unique expertise also expanded the number of portability technologies we could test. Finally, as explained in Section 5, slightly different approaches were taken to measure the final results, giving additional insight into the performance of each tool.</p>
</sec>
<sec id="s3">
<title>3 Description of algorithm</title>
<p>Track finding (also known as track building) is the process of reconstructing a particle&#x00027;s trajectory by identifying which hits in an event likely came from the same particle. It requires testing many potential combinations of hits to find a set that is consistent with the expected helical trajectory of a charged particle in a magnetic field. Track fitting, on the other hand, is the process of taking a pre-determined set of hits and determining the final parameters of the track. The <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks include everything that would be needed for a realistic track <italic>fitting</italic> algorithm, but do not include the combinatorial selection required for track finding.</p>
<p>The CMS and ATLAS detectors are divided into two main sections: the cylindrical region coaxial with the beam pipe, known as the barrel, and a disk region on either end of the barrel, known as the endcaps. To first approximation, the individual tracker layers can be approximated as being located at constant radius <italic>r</italic> or constant <italic>z</italic> position for the barrel and endcap layers, respectively. Charged particles in a constant magnetic field will travel in a helix, so if the position and momentum are known on layer <italic>N</italic>, then the expected position can be calculated for layer <italic>N</italic> &#x0002B; 1.</p>
<p>The mini apps used in this analysis perform two key steps of KF tracking:</p>
<list list-type="order">
<list-item><p><bold>Track propagation:</bold> Propagate the track state&#x02014;including the track&#x00027;s momentum and position vectors and associated uncertainties in the form of a covariance matrix&#x02014;at layer <italic>N</italic> to a prediction at layer <italic>N</italic> &#x0002B; 1, which is specified by either a <italic>z</italic> coordinate or a radius <italic>r</italic> for the <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks, respectively.</p></list-item>
<list-item><p><bold>Kalman update:</bold> Update the track state on layer <italic>N</italic> &#x0002B; 1 by combining information about the propagated track state and the coordinates of a compatible hit on that layer.</p></list-item>
</list>
<p>These two steps are the most arithmetically intensive steps of both track finding and track fitting. They are relatively simple algorithms but have the biggest impact on the overall execution time, making them suitable for a standalone test code.</p>
<sec>
<title>3.1 Input data</title>
<p>The starting point for a KF track finding algorithm is a track &#x0201C;seed&#x0201D;, an initial guess at the track state. For the full <sc>mk</sc>F<sc>it</sc> algorithm, the input track seeds are built in an upstream algorithm using three or four hits from the innermost layers of the CMS detector. For simplicity, the <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks use an artificial standalone input consisting of a single track seed. The parameters of the initial seed are smeared according to a Gaussian distribution, in order to prevent the algorithm from performing identical numerical operations for each track.</p>
<p>The tracks are built by first propagating the initial seed parameters to the next layer. In the full combinatorial <sc>mk</sc>F<sc>it</sc> application, the next step is to search for compatible hits on that layer. In the <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks, only one hit per layer is considered, similar to what is required for track fitting. The hit parameters&#x02014;the hit locations and uncertainties on each layer&#x02014;are smeared per track according to the same procedure as the input track parameters.</p>
<p>The total number of tracks generated with this procedure is split into &#x0201C;events&#x0201D; with a fixed number of tracks per event.</p>
<p>For the GPU implementations of the benchmark, the track propagation and Kalman update steps are run as a single GPU kernel. The data for all tracks and hits are prepared on the CPU, transferred to the GPU for the computations in two bulk transfers, and the output data are transferred back to the CPU.</p>
</sec>
<sec>
<title>3.2 Computations and data structure</title>
<p>The &#x0201C;propagation to z&#x0201D; or <monospace>p2z</monospace> benchmark uses the expected helical trajectory of the charged particle to calculate the track parameters and the covariance matrix on endcap layer <italic>N</italic> &#x0002B; 1. In contrast, the &#x0201C;propagation to r&#x0201D; or <monospace>p2r</monospace> benchmark uses an iterative approach to propagation, advancing the track state from the initial barrel radius to the final radius in discrete steps. In practice, both approaches involve a series of operations involving sin() and cos() functions, as well as matrix multiplication of up to 6x6 matrices. The two benchmarks are expected to be similar in terms of arithmetic intensity.</p>
<p>The second task in both benchmarks is the Kalman update step, which &#x0201C;updates&#x0201D; the track state using the parameters of a hit on that layer. The hit parameters include three coordinates for the hit position and a 3x3 covariance matrix. Similar to the propagation step, the update step involves small matrix multiplication and matrix inverse operations as well as trigonometric functions.</p>
<p>Both benchmarks employ an Array-Of-Structures-Of-Arrays (AOSOA) data structure, as illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>. Each benchmark runs over a fixed number of events (<monospace>nevts</monospace>) and processes a fixed number of tracks in each event (<monospace>ntrks</monospace>). Within one event, tracks are grouped into batches of <monospace>bsize</monospace> tracks, and each batch of tracks is put into a Structure-Of-Arrays construct referred to as an <monospace>MPTRK</monospace>. Here, as in the full <sc>mk</sc>F<sc>it</sc> algorithm, the goal of organizing tracks into different batches is to enable SIMD operations across batch elements. The value of <monospace>bsize</monospace> can be optimized for different platforms; for example, on a GPU it might be the NVIDIA warp size of 32, while on a CPU it might be a multiple of the AVX-512 vector width of 16; for consistency we use 32 everywhere.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Representation of the data structure used in the <monospace>p2r</monospace> benchmark. The <monospace>p2z</monospace> data structure is similar, but has <monospace>ntrks</monospace> equal to 9,600.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0001.tif"/>
</fig>
<p>To maximize the opportunities for SIMD operations, the storage order in the AOSOA follows the same general scheme as in <sc>mk</sc>F<sc>it</sc>. Within an <monospace>MPTRK</monospace>, the first elements of <monospace>bsize</monospace> different vectors (or matrices) get stored in adjacent locations, followed by the second elements of <monospace>bsize</monospace> different vectors (or matrices), and so on, until the full structure representing a batch of <monospace>bsize</monospace> tracks is completely populated. Then the next <monospace>MPTRK</monospace> structure is written into memory, and the next, until all <monospace>ntrks</monospace> tracks (8192 for <monospace>p2r</monospace> and 9600 for <monospace>p2z</monospace>) for the first event are present in memory. This first event corresponds to the first row of SOAs in <xref ref-type="fig" rid="F1">Figure 1</xref>. It is followed by a second row of SOAs for the second event, etc. A similar memory layout applies to the hit data.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Implementations</title>
<p>The state-of-the-art of portability tools is a moving target, as many tools are undergoing very active development, with additional features, backends, and compiler support being added on a regular basis (<inline-formula><mml:math id="M1"><mml:mrow><mml:mi mathvariant="script">O</mml:mi></mml:mrow></mml:math></inline-formula>(monthly)). In total, we tested nine different parallelization tools on four different architectures, but testing the full matrix of possibilities was beyond the scope of this paper. The final set of <monospace>p2z</monospace> implementations is shown in <xref ref-type="table" rid="T1">Table 1</xref>, including which backends and compilers were used to test each implementation, and the full set of <monospace>p2r</monospace> implementations is shown in <xref ref-type="table" rid="T2">Table 2</xref>. Note that this does not include the full set of backends or compilers that each tool is capable of supporting.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Summary of <monospace>p2z</monospace> implementations used for the results shown in Section 5.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th/>
<th valign="top" align="left"><bold>NVIDIA GPU V100</bold></th>
<th valign="top" align="center"><bold>x86 CPU Xeon gold</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">TBB (oneTBB/2021.10.0)</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">CUDA</td>
<td valign="top" align="center">cuda/11.2 nvcc/11.0.221</td>
<td valign="top" align="center">-</td>
</tr> <tr>
<td valign="top" align="left">Alpaka (v0.8.0)</td>
<td valign="top" align="center">cuda/11.2 nvcc/11.0.221</td>
<td valign="top" align="center">Alpaka v0.9.0 oneTBB/2021.10.20 gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">Kokkos (v4.0)</td>
<td valign="top" align="center">cuda/11.2 nvcc/11.0.221</td>
<td valign="top" align="center">gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">stdpar</td>
<td valign="top" align="center">nvc&#x0002B;&#x0002B;/24.5</td>
<td valign="top" align="center">nvc&#x0002B;&#x0002B;/24.5</td>
</tr> <tr>
<td valign="top" align="left">OpenMPv4</td>
<td valign="top" align="center">OpenARC/0.76 (Lee and Vetter, <xref ref-type="bibr" rid="B22">2014</xref>) nvcc/11.0.221</td>
<td valign="top" align="center">gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">OpenACC</td>
<td valign="top" align="center">OpenARC/0.76 (Lee and Vetter, <xref ref-type="bibr" rid="B22">2014</xref>) nvcc/11.0.221</td>
<td valign="top" align="center">-</td>
</tr> <tr>
<td valign="top" align="left">Results shown in:</td>
<td valign="top" align="center"><xref ref-type="fig" rid="F2">Figure 2</xref>, top</td>
<td valign="top" align="center"><xref ref-type="fig" rid="F7">Figure 7</xref>, top</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The table lists the compiler versions used for each implementation.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Summary of <monospace>p2r</monospace> implementations used for the results shown in Section 5.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th/>
<th valign="top" align="center"><bold>NVIDIA GPU A100</bold></th>
<th valign="top" align="center"><bold>AMD GPU MI100</bold></th>
<th valign="top" align="center"><bold>Intel GPU A770</bold></th>
<th valign="top" align="center"><bold>x86 CPU Xeon Gold</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">TBB (oneTBB/2021.10.0)</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">CUDA</td>
<td valign="top" align="center">cuda/11.6.2 nvcc/11.6.124</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
</tr> <tr>
<td valign="top" align="left">HIP</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">rocm/5.2.0</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
</tr> <tr>
<td valign="top" align="left">Alpaka (v0.9.0)</td>
<td valign="top" align="center">cuda/11.6.2 nvcc/11.6.124</td>
<td valign="top" align="center">rocm/5.1.3 hipcc/3.5.0</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">oneTBB/2021.10.0 gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">Kokkos (v3.6.1)</td>
<td valign="top" align="center">cuda/11.6.2 nvcc/11.6.124</td>
<td valign="top" align="center">rocm/5.1.3 hipcc/3.5.0</td>
<td valign="top" align="center">Kokkos 4.0 dpcpp/2023.0.0</td>
<td valign="top" align="center">Kokkos 4.0.0 gcc/12.3.0</td>
</tr> <tr>
<td valign="top" align="left">SYCL</td>
<td valign="top" align="center">cuda/11.6.2 intel/llvm-sycl</td>
<td valign="top" align="center">rocm/5.1.3 intel/llvm-sycl</td>
<td valign="top" align="center">dpcpp/2023.0.0</td>
<td valign="top" align="center">dpcpp/2023.1.0</td>
</tr> <tr>
<td valign="top" align="left">stdpar</td>
<td valign="top" align="center">nvc&#x0002B;&#x0002B;/22.7</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">dpcpp/2023.0.0 dpl/2022.0.0</td>
<td valign="top" align="center">nvc&#x0002B;&#x0002B;/24.5</td>
</tr> <tr>
<td valign="top" align="left">Results shown in:</td>
<td valign="top" align="center"><xref ref-type="fig" rid="F2">Figure 2</xref>, bottom</td>
<td valign="top" align="center"><xref ref-type="fig" rid="F6">Figure 6</xref>, top</td>
<td valign="top" align="center"><xref ref-type="fig" rid="F6">Figure 6</xref>, bottom</td>
<td valign="top" align="center"><xref ref-type="fig" rid="F7">Figure 7</xref>, bottom</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The table lists the compiler versions used for each implementation. For SYCL, the intel/llvm-sycl repository is used to compile dpc&#x0002B;&#x0002B; tool chains to be compatible with NVIDIA/AMD GPUs. This is not the full list of all possible combinations of tools and backends. For example, HIP could be compiled for NVIDIA GPUs as well, but that was not tested in this work. Similarly, we note that the oneAPI toolkit recently included a plugin that enables stdpar to be used on AMD GPUs, and Alpaka has recently introduced experimental support for Intel GPUs in v0.9.0, but neither was tested here.</p>
</table-wrap-foot>
</table-wrap>
<p>For both test codes, the original CPU implementation was multithreaded using the oneAPI Threading Building Blocks (TBB) library and compiled with gcc, since this is the combination that most closely matches what the <sc>mk</sc>F<sc>it</sc> project uses for its highly optimized implementation. The reference GPU implementation is the one written using CUDA and compiled with nvcc. Below we describe each tool, its corresponding <monospace>p2r</monospace> and <monospace>p2z</monospace> implementations, and our subjective experience porting these benchmarks using the different portability solutions.</p>
<sec>
<title>4.1 TBB</title>
<p>Our reference CPU implementation is based on the oneAPI Threading Building Blocks (or oneTBB, or just TBB) library. TBB is a template library originated by Intel for parallel programming on multi-core processors that simplifies the definition and optimization of threaded applications. For a given executable, TBB collects all tasks that are specified to run in parallel and the library manages and schedules threads to execute them. TBB is used as the CPU thread manager in the software framework of the CMS experiment.</p>
<p>In our code, nested TBB <monospace>parallel_for</monospace> loops are over events and batches of tracks. Each batch of tracks is then vectorized, so that tracks in the batch are processed in a SIMD fashion. Vectorization is implemented following the approach used in <sc>mk</sc>F<sc>it</sc>, where groups of matrices are processed concurrently and loops over the matrix index in the group are decorated with <monospace>omp simd</monospace> pragmas. These pragmas were activated by the <monospace>-fopenmp</monospace> compiler option.</p>
</sec>
<sec>
<title>4.2 CUDA and HIP</title>
<p>Our reference GPU implementation is based on the CUDA programming model, which is a multi-threaded SIMD model for general purpose GPU programming, introduced by NVIDIA. The CUDA implementation is ported from the TBB version, which shares the same AOSOA data structure for input data. The main difference in CUDA is that each <monospace>MPTRK</monospace> is processed by a block of GPU threads, and each thread processes the computation for one track in the <monospace>MPTRK</monospace>. Since the computation of each track is independent of the others, we find that keeping the intermediate results in local registers provides the most efficient memory access. We explored using shared memory to store the intermediate results within an <monospace>MPTRK</monospace> for all the threads in the block, but it was shown to have significantly lower memory throughput in a detailed profiling study.</p>
<p>In relation to other portability technologies, CUDA provides a level of abstraction similar to the general accelerator execution model and memory model that is also employed by general GPU programming models such as OpenCL and SYCL. As a proprietary NVIDIA GPU programming model, however, it exposes several NVIDIA-GPU-specific features, which allows available architecture-specific features to be fully exploited, but it also means that the code is not portable across heterogeneous accelerators.</p>
<p>HIP is the vendor-supported, native programming model for AMD GPUs and is designed to be portable across NVIDIA and AMD GPUs. It is also designed to be syntactically similar to CUDA so that most API calls can be simple translations of names. In the case of <monospace>p2z</monospace> and <monospace>p2r</monospace>, the kernels only rely on the core functionalities of CUDA, such as memory allocation and kernel dispatches, which are supported in HIP. This leads to a straightforward port to the HIP version starting from the CUDA version.</p>
</sec>
<sec>
<title>4.3 Directive-based solutions, OpenMP and OpenACC</title>
<p>Directive-based, high-level programming models such as OpenMP and OpenACC use a set of directives (a special type of comments that a compiler can understand) that allow a programmer to provide the compilers with important characteristics of an application, such as available parallelism and data sharing/mapping rules, so that much of the low-level programming and optimization burdens are automatically handled by the directive compilers.</p>
<p>The biggest advantage of the directive programming model is that it allows incremental porting of existing sequential CPU applications written in standard programming languages such as C, C&#x0002B;&#x0002B;, and Fortran to parallel versions that can offload work to heterogeneous accelerators, without requiring major changes in the existing code structures. The initial OpenMP version was created by converting the reference TBB CPU implementation into an OpenMP CPU implementation, which is relatively straightforward due to the similarity between the TBB parallel syntax (using lambdas) and OpenMP parallel constructs. Then, the GPU offloading version was created by extending the OpenMP CPU implementation with OpenMP target-offloading directives and data mapping directives. The CPU version and the GPU version have different parallelism and data mapping strategies. For instance, on GPUs, the OpenMP target and data mapping directives are essential, but when targeting CPUs, they are unnecessary since the original host data and corresponding device data will share storage; in the latter case, how the OpenMP compiler implements those unnecessary directives on the CPUs is implementation-defined. Another issue is that on CPUs, team-level parallelism may or may not be ignored depending on the compiler implementations, possibly resulting in additional overheads due to loop serialization.</p>
<p>The initial conversion from the OpenMP GPU implementation into the OpenACC implementation was straightforward since both OpenMP and OpenACC are directive-based accelerator programming models and provide very similar execution and memory models. However, the main issue in converting between OpenMP and OpenACC was that different OpenMP/OpenACC compilers may choose different parallelism mapping strategies and vary in terms of maturity and supported features (see Section 5.1.1 for results and a more detailed discussion).</p>
</sec>
<sec>
<title>4.4 Alpaka</title>
<p>Alpaka (Matthes et al., <xref ref-type="bibr" rid="B23">2017</xref>) is a single-source, header-only C<monospace>&#x0002B;&#x0002B;</monospace> parallelization library. The API level is similar to CUDA, with an abstraction layer added between the application and the vendor-specific programming models to achieve portability. For example, the kernel functions in Alpaka are templated with an accelerator type, which is resolved at compile time for different execution backends of the kernel. One difference between Alpaka and CUDA is that Alpaka has an additional abstraction level called <italic>elements</italic> in the parallel hierarchy model, where multiple elements can be processed in a thread. Having an explicit level allows compilers to take advantage of the SIMD vector registers when compiling for the CPU backends. Each SOA (<monospace>MPTRK</monospace>) is processed by a block, and the thread/element level is mapped differently between CPU and GPU backends to take full advantage of the parallel hierarchy. For GPU backends, blocks of <monospace>bsize</monospace> threads are assigned to process each <monospace>MPTRK</monospace>, whereas one thread with <monospace>bsize</monospace> elements is assigned to process each <monospace>MPTRK</monospace> for CPU backends. In each case, enough threads are defined so that all <monospace>ntrks</monospace> tracks are processed for an event. Profiling results confirm the use of vectorized instructions in the regions where the original <sc>mk</sc>F<sc>it</sc> implementation was also able to vectorize.</p>
<p>The overall conversion from CUDA to Alpaka is relatively smooth due to the similarity between the two programming models, except that the heavy use of templating often leads to a more verbose coding style and convoluted error messages during debugging. Nevertheless, Alpaka versions are often able to produce close-to-native performance after some effort of optimization. A particularly relevant note for HEP experiments is that CMS has chosen to use Alpaka as its supported portability solution for GPU usage in LHC Run 3 (Kortelainen et al., <xref ref-type="bibr" rid="B20">2021</xref>; Bocci et al., <xref ref-type="bibr" rid="B8">2023</xref>).</p>
</sec>
<sec>
<title>4.5 Kokkos</title>
<p>Similar to Alpaka, Kokkos (Edwards et al., <xref ref-type="bibr" rid="B17">2014</xref>; Trott et al., <xref ref-type="bibr" rid="B25">2022</xref>) serves as a single-source C<monospace>&#x0002B;&#x0002B;</monospace> template metaprogramming (TMP) library, intended to achieve architecture agnosticism and alleviate programmers from the complexities of vendor- or target-specific programming paradigms and heterogeneous hardware architectures. By embracing TMP methodologies, Kokkos facilitates device-specific code generation and optimizations via template specialization. To cater to diverse computing environments, Kokkos offers multiple device-specific backends, implemented as template libraries atop various HPC programming models like CUDA, HIP, OpenMP, HPX, SYCL, and OpenACC. These backends are tailored to adhere to advancements in the C<monospace>&#x0002B;&#x0002B;</monospace> standard, ensuring compatibility and efficacy.</p>
<p>A notable departure from Alpaka is Kokkos&#x00027; emphasis on descriptive rather than prescriptive parallelism. Kokkos prompts developers to articulate algorithms in general parallel programming concepts, which are subsequently mapped to hardware by the Kokkos framework. The Kokkos programming model revolves around two fundamental abstractions: the first being the user data abstraction (<monospace>Kokkos::View</monospace>), a template library facilitating the representation of multidimensional arrays while managing efficient data layout for both CPU and GPU. The second abstraction revolves around parallel execution patterns (<monospace>parallel_for</monospace>, <monospace>parallel_reduce</monospace>, and <monospace>parallel_scan</monospace>), which can be executed under three distinct execution policies: <monospace>RangePolicy</monospace> for mapping single parallel loops, <monospace>MDRangePolicy</monospace> for mapping directly nested parallel loops, and <monospace>TeamPolicy</monospace> for hierarchical mapping of multiple nested parallel loops.</p>
<p>In this study, Kokkos versions were developed by translating CUDA code to Kokkos using the Kokkos <monospace>View</monospace> and parallel dispatch abstractions. The initial translation process was relatively straightforward, owing to the striking similarities between the Kokkos and CUDA execution models and memory models, except for the added complexity due to Kokkos-specific restrictions on C<monospace>&#x0002B;&#x0002B;</monospace> template programming. Efficient execution on both CPU and GPU took further optimization and was achieved by configuring the <monospace>teamSize</monospace> and <monospace>vectorSize</monospace> in the <monospace>TeamPolicy</monospace> used in the <monospace>parallel_for</monospace> execution pattern, as the defaults were found not to be optimal.</p>
</sec>
<sec>
<title>4.6 Standard parallelization using stdpar in C<monospace>&#x0002B;&#x0002B;</monospace></title>
<p>The C&#x0002B;&#x0002B; programming language is often the preferred choice for implementing high performance scientific applications. The recent revisions of the ISO C&#x0002B;&#x0002B; standard introduced a suite of algorithms capable of being executed on accelerators. Although this approach may not yield the best performance, it can present a viable balance between code productivity and computational efficiency. Numerous production-grade compilers are available, such as clang and its variants from various providers, or the recently released NVHPC from NVIDIA.</p>
<p>With the introduction of the C&#x0002B;&#x0002B;17 standard, the C<monospace>&#x0002B;&#x0002B;</monospace> Standard Template Library (STL) underwent a substantial overhaul of its suite of algorithms, now updated with execution policies to adapt across various computing architectures, including multi-core x86 systems and GPUs. These parallel algorithms extended most of the existing STL algorithms with an additional argument, which is an execution policy. The policy enables programmers to specify the intended parallelism of an algorithm, which can result in performance improvements for the computational task. In particular, the execution policies in C&#x0002B;&#x0002B;17 include:</p>
<list list-type="bullet">
<list-item><p><monospace>std::execution::seq</monospace></p></list-item>
<list-item><p><monospace>std::execution::unseq</monospace></p></list-item>
<list-item><p><monospace>std::execution::par</monospace></p></list-item>
<list-item><p><monospace>std::execution::par_unseq</monospace></p></list-item>
</list>
<p>The first option forces the algorithm to run sequentially, while the remaining three options allow the algorithm to be vectorized or run in parallel (with additional vectorization). Both the <monospace>p2z</monospace> and <monospace>p2r</monospace> stdpar implementations use the <monospace>std::execution::par_unseq</monospace> execution policy. Currently, only the nvc&#x0002B;&#x0002B; compiler offers support for stdpar algorithms to be offloaded on NVIDIA GPUs. It leverages CUDA Unified Memory to handle automatic data movement between CPU and GPU. On the systems that do not support Heterogeneous Memory Management, only data that is dynamically allocated in CPU code compiled by nvc&#x0002B;&#x0002B; is automatically managed, whereas memory allocated in GPU code is exclusively for GPU use and remains unmanaged. Thus, on such systems, CPU and GPU stack memory, along with global objects, are outside nvc&#x0002B;&#x0002B;&#x00027;s automatic management scope. Even data allocated on the CPU heap outside units compiled by nvc&#x0002B;&#x0002B; is not managed. When dealing with parallel algorithm invocations, pointers and objects must refer to data on the managed CPU heap to avoid errors. Any dereferencing of pointers to CPU stack or global objects in GPU code can lead to memory violations. These aspects encapsulate the nvc&#x0002B;&#x0002B; compiler&#x00027;s precise approach to memory management across CPU and GPU, emphasizing careful allocation and reference handling to ensure efficient operations. In our experience, developing code for this application is largely similar to standard C&#x0002B;&#x0002B; programming, with the primary distinction being the need to consider the previously mentioned limitations.</p>
</sec>
<sec>
<title>4.7 SYCL</title>
<p>SYCL represents a cross-platform abstraction layer that enables code for heterogeneous processors to be written in a &#x0201C;single-source&#x0201D; style using completely standard C&#x0002B;&#x0002B;. This approach aims to enhance the efficiency and accessibility of programming for a variety of compute architectures. While the SYCL programming language was promoted by the Khronos Group, it is predominantly advocated for by Intel, so the primary focus is on the optimization for Intel GPUs.</p>
<p>One of the key advantages of SYCL is the ability to handle regular C&#x0002B;&#x0002B; code for the host CPU and a subset of C&#x0002B;&#x0002B; for the device code within the same source file. This ability paves the way for an integrated and simplified development process. It also enables parallelism and the usage of memory hierarchies through a class template library. This effectively allows the expression of parallel STL, thus further integrating SYCL with standard C&#x0002B;&#x0002B; features. Since it was designed to be fully compatible with standard C&#x0002B;&#x0002B;, it allows developers to utilize any C&#x0002B;&#x0002B; library within a SYCL application. This compatibility with standard C&#x0002B;&#x0002B; makes SYCL a versatile tool for developers. Moreover, SYCL&#x00027;s design prioritizes performance portability, aiming to provide high performance across a wide range of hardware architectures. Its abstractions are constructed to allow optimization but do not require a particular architecture or kernel language.</p>
<p>For the benchmarks described in this paper, the programming approach for SYCL is nearly a direct replication of the CUDA approach. In our implementation, we utilized SYCL&#x00027;s Unified Shared Memory feature for data management. To compile SYCL for NVIDIA and AMD GPUs, we compiled the dpc&#x0002B;&#x0002B; tool chain following the instructions in Intel&#x00027;s open-source llvm-sycl repository.<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref></p>
</sec>
</sec>
<sec sec-type="results" id="s5">
<title>5 Results</title>
<p>The most important computing metric, from the point of view of HEP computing as a whole, is the algorithm&#x00027;s throughput, defined here as the number of tracks that can be processed per second. To measure the throughput, all versions processed approximately 800k tracks in a single kernel. The kernel, together with the data movement operations, are then iterated a fixed number of times to sustain a total program run time of around one minute. The measurements were repeated multiple times to ensure stability of results. We tested the performance of different tools on a number of different hardware systems, including NVIDIA GPUs (Section 5.1), Intel and AMD GPUs (Section 5.2), and Intel CPUs (Section 5.3).</p>
<p>The code used for the results in this paper is directly extracted from the <sc>mk</sc>F<sc>it</sc> application. In order to reduce overheads, it does not have its own setup for validating the results in terms of physics output. For a discussion of the <sc>mk</sc>F<sc>it</sc> physics performance results, see the <sc>mk</sc>F<sc>it</sc> papers (Cerati et al., <xref ref-type="bibr" rid="B10">2017</xref>; Lantz et al., <xref ref-type="bibr" rid="B21">2020</xref>). Numerical reproducibility and consistency of results across different <monospace>p2z</monospace> and <monospace>p2r</monospace> versions are verified with summary printouts. Since the artificial sample is produced by smearing the inputs from a single track, printouts report the mean and RMS values of the track parameters computed over the full sample. Output results from different versions are numerically identical when the same compiler is used. Compilers may introduce differences in the average parameter values at or below the ppm level. Such differences are due to numerical precision in floating point operations and the different levels of optimization used by default by different compilers. The level of agreement in the output guarantees that all versions perform the same operations, making the comparisons in terms of computing performance meaningful.</p>
<sec>
<title>5.1 NVIDIA GPU results</title>
<p>The throughput of the <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks was measured using two different systems with two different NVIDIA GPUs. To test the <monospace>p2r</monospace> implementations, we used the Joint Laboratory for System Evaluation (JLSE), a collection of HPC testbeds hosted at Argonne National Lab. The NVIDIA GPU that was used for testing is an A100 GPU with an AMD 7532 CPU as the host machine. For the <monospace>p2z</monospace> implementations, the measurements were performed on a test node for the Summit HPC system at Oak Ridge National Laboratory. Each Summit node includes six NVIDIA V100 GPUs and two IBM Power9 CPUs, although only one GPU was utilized in our tests.</p>
<p>The throughput measurements on NVIDIA GPUs are shown in <xref ref-type="fig" rid="F2">Figure 2</xref> for both the <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks. In both cases, the native CUDA implementation is used as the reference implementation, and the measurement time includes the kernel execution only. Several different compilers are used for the different implementations: the CUDA, Alpaka, and Kokkos implementations are compiled with the nvcc compiler; OpenMP and OpenACC are compiled with the OpenARC (Lee and Vetter, <xref ref-type="bibr" rid="B22">2014</xref>) compiler; the stdpar versions are compiled with nvc&#x0002B;&#x0002B;; and for SYCL, the intel/llvm-sycl repository is used to compile dpc&#x0002B;&#x0002B; tool chains to be compatible with NVIDIA/AMD GPUs. For evaluation, whenever possible, we use the same launch parameters, including number of blocks and number of threads per block. In the OpenACC and OpenMP versions, different compilers varied in how they enforced the user-specified configurations (see Section 5.1.1). The launch parameters need to be manually specified for Alpaka and Kokkos, otherwise the libraries choose suboptimal values and the performance is about 30% worse, taking those versions even further from the native CUDA version. Setting the number of registers per thread is another approximately 10% effect. These parameters cannot be manually specified in the stdpar implementation. In the SYCL implementations, we specified the execution volume manually but relied on the defaults for the other launch parameters.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Throughput measurements for the <monospace>p2z</monospace> <bold>(top)</bold> and <monospace>p2r</monospace> <bold>(bottom)</bold> benchmarks on NVIDIA GPUs. The <monospace>p2z</monospace> measurements were performed on a V100 GPU and the <monospace>p2r</monospace> measurements were performed on an A100 GPU. In both cases, only the kernel execution time was considered in the throughput.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0002.tif"/>
</fig>
<p>In general, most of the different portability solutions managed to produce close-to-native performance. The stdpar implementation did not perform well for either benchmark, mainly because the stdpar implemented in the nvc&#x0002B;&#x0002B; compiler relies on CUDA Unified Memory for all data movements between CPU and GPU memory, which fetches data needed by the GPU kernel on demand, exposing memory transfer overheads to the kernel execution. CUDA Unified Memory provides APIs to prefetch or migrate data between CPU and GPU memory to hide or reduce the memory transfer overheads, but the current stdpar does not include such functionalities. In order to mitigate the effects of data transfers while measuring the kernel execution time, we introduced implicit prefetching using the parallel <monospace>std::copy</monospace> algorithm. For the <monospace>p2r</monospace> benchmark, the worst performing version is the SYCL implementation. Detailed profiling using NVIDIA Nsight Compute shows significant branching when using SYCL, but preliminary investigations have not revealed an obvious explanation for the branching.</p>
<p>For the <monospace>p2z</monospace> benchmark, most of the different implementations achieved similar performance except for the stdpar version. While similar to other <monospace>p2z</monospace> implementations, the relative performance of <monospace>p2z</monospace> Kokkos version is lower than that of the <monospace>p2r</monospace> Kokkos version, when compared to their corresponding CUDA versions. Detailed profiling shows that the Kokkos version of <monospace>p2z</monospace> uses more registers than the CUDA version, while the Kokkos version of <monospace>p2r</monospace> uses a similar number of registers compared to the CUDA version. Further investigations of the branching of the <monospace>p2r</monospace> SYCL version and of the difference in register usage in the <monospace>p2z</monospace> and <monospace>p2r</monospace> Kokkos version are beyond the scope of the present paper.</p>
<p><xref ref-type="fig" rid="F3">Figure 3</xref> shows the throughput of the <monospace>p2z</monospace> implementations on the NVIDIA V100 GPUs, this time including both the kernel execution time and the memory transfer times. The transfer times are generally 2 to 5 times larger than the kernel execution times, which means that much of the variability between implementations is concealed. With the exception of stdpar, all implementations have close to identical performance because they provide memory management features to facilitate explicit data transfers, and in some cases memory prepinning as discussed in Section 5.1.2.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Throughput measurement for the <monospace>p2z</monospace> benchmark on NVIDIA V100 GPUs, including kernel execution time as well as data transfer times.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0003.tif"/>
</fig>
<sec>
<title>5.1.1 Compiler dependence</title>
<p>Different compilers can yield very different timing results, especially for the directive-based portability solutions. <xref ref-type="fig" rid="F4">Figure 4</xref> shows the throughput performance for the OpenACC and OpenMP <monospace>p2z</monospace> versions on NVIDIA V100 GPUs, including both kernel execution time and data transfer times. The left bars in <xref ref-type="fig" rid="F4">Figure 4</xref> show that the OpenARC-compiled OpenMP version performs better than the versions compiled with llvm, gcc, or IBM. Detailed profiling shows that the llvm, gcc, and IBM compiled versions use different launch parameters (the number of threads in a thread block and the number of thread blocks) than those specified in the OpenMP program, while the OpenARC-compiled version literally follows the user-specified configuration. Different launch parameters in the llvm/gcc/IBM compiled versions adversely affected the concurrency. The OpenMP version of <monospace>p2z</monospace> allocates temporary user data in the team-private memory: the OpenARC-compiled version allocates the team-private data in the CUDA shared memory, but the llvm/gcc/IBM compiled versions only partially use the CUDA shared memory, which incurs more device global memory accesses than the OpenARC-generated version. Lower concurrency and more global memory accesses seem to be the main reasons for the lower performance of the llvm/gcc/IBM compiled OpenMP versions.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Comparison of the throughput performance on a V100 GPU using different compilers for the OpenMP <bold>(left)</bold> and OpenACC <bold>(right)</bold> versions of the <monospace>p2z</monospace> benchmark. Measurements include both kernel execution time and data transfer times.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0004.tif"/>
</fig>
<p>In the OpenACC version of <monospace>p2z</monospace> (the right bars in <xref ref-type="fig" rid="F4">Figure 4</xref>), both the nvc&#x0002B;&#x0002B; and OpenARC compiled versions achieved similar kernel computation times, but the OpenARC-compiled version had better memory transfer performance than the version compiled with nvc&#x0002B;&#x0002B;. OpenARC literally translates the OpenACC data clauses into corresponding memory transfer APIs (one transfer API call per list item in a memory transfer data clause), but the nvc&#x0002B;&#x0002B; compiler automatically splits memory transfers into multiple, small asynchronous memory transfer calls. Splitting memory transfers may expose more communication and computation overlapping opportunities, but in the <monospace>p2z</monospace> case, too many small asynchronous memory transfers in the nvc&#x0002B;&#x0002B; version perform worse than the simple memory transfers implemented in OpenARC, which is the main reason for the better performance of the OpenARC version than the nvc&#x0002B;&#x0002B; version. Like OpenARC, gcc also generates one memory transfer API call per list item in a memory transfer data clause, but the gcc version achieves lower memory transfer throughput than the OpenARC version, which may be caused by the host memory pre-pinning optimization, which is supported by OpenARC but not by gcc, as shown in the next section.</p>
</sec>
<sec>
<title>5.1.2 Effect of memory prepinning</title>
<p>For GPU versions, implementing host memory pinning was found to greatly improve the performance, which is shown in <xref ref-type="fig" rid="F5">Figure 5</xref> for the Kokkos and OpenACC <monospace>p2z</monospace> versions, including both kernel execution time and data transfer times. In the system with NVIDIA GPUs, prepinning the host memory enables direct-memory access (DMA) transfers, which achieve better memory transfer bandwidth than non-DMA transfers.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Throughput comparison on a V100 GPU showing the effect of turning memory prepinning on or off for different OpenACC <bold>(top)</bold> and Kokkos implementations <bold>(bottom)</bold> of the <monospace>p2z</monospace> benchmark: batch shared memory with synchronous or asynchronous transfers, and thread-private data on local memory with synchronous or asynchronous transfers. Measurements include both kernel execution time and data transfer times. The percentages in the figure refer to the throughput with memory prepinning on compared to the corresponding non-prepinned version.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0005.tif"/>
</fig>
<p>We compared the performance with and without explicit prepinning for three OpenACC versions (<xref ref-type="fig" rid="F5">Figure 5</xref>, top). The first uses batched shared memory and synchronous transfers. The second implementation uses batched shared memory and asynchronous transfers, and the final version uses thread-private data on local memory with asynchronous transfers. Like the CUDA and HIP implementations (see Section 4.2), the results show that keeping the intermediate results in the local register (thread-private data version) performs better than using the shared memory. When host memory prepinning is on, all of the host data appearing in OpenACC clauses are prepinned. In all cases, prepinning is observed to improve the performance. The asynchronous versions have on-demand host-memory prepinning, which means the host memory is prepinned before each asynchronous memory transfer if it is not already prepinned. Therefore, a smaller impact from explicit prepinning is observed than for the synchronous transfer version. The performance improvement from the host memory prepinning is more pronounced in the Kokkos results (<xref ref-type="fig" rid="F5">Figure 5</xref>, bottom), because the internal implementation of the Kokkos library requires additional memory transfers, such as copying the functor objects, while the OpenACC/OpenMP versions do not.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Throughput measurements for the <monospace>p2r</monospace> benchmark on an AMD MI-100 GPU <bold>(top)</bold> and on an Intel A770 GPU <bold>(bottom)</bold>. The performance of each implementation is compared to the performance of the native version (HIP for the AMD GPU and SYCL for the Intel GPU). Only the kernel execution time was included in the measurements.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0006.tif"/>
</fig>
</sec>
</sec>
<sec>
<title>5.2 AMD and Intel GPU results</title>
<p>Support for other GPU architectures is in general less mature than the support for NVIDIA GPUs, although this is an area of rapid expansion. We explored the preliminary performance of the <monospace>p2r</monospace> benchmark on both AMD and Intel GPUs, and the kernel-only throughput measurements are shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. This is an out-of-the-box comparison of each tool&#x00027;s portability; no dedicated effort was made to optimize for AMD or Intel GPU architectures. For example, the HIP implementation for the AMD GPU is a carbon copy of the native CUDA version. For both Alpaka and Kokkos, switching backends is relatively seamless and does not require any code changes.</p>
<p>The results on the AMD GPU are shown in the top plot of <xref ref-type="fig" rid="F6">Figure 6</xref>. The AMD GPU tests were performed on the JLSE testbed, which includes two AMD EPYC 7543 32c (Milan) CPUs and four AMD MI100 32GB GPUs. Only one GPU was used to perform the measurements. Both Kokkos and Alpaka include HIP backends which achieve reasonable performance: Alpaka actually outperforms the HIP version that was ported from CUDA, and Kokkos is within a factor of about 2. The same launch parameters as the CUDA implementation are used for the native HIP, Alpaka:HIP, and Kokkos:HIP measurements.</p>
<p>The throughput measurements on an Intel A770 GPU are shown in the bottom plot of <xref ref-type="fig" rid="F6">Figure 6</xref>. Since the A770 is not an HPC-class GPU, all calculations were converted to single-precision operations. Relying on double-precision emulation results in performance that is 3 to 30 times slower, depending on the implementation. The SYCL backend for Alpaka has only experimental support (introduced in v0.9.0) and was not tested here. The Kokkos SYCL backend is still under active development: we observed a factor of 2 improvement in the throughput of the Kokkos:SYCL implementation when we updated from Kokkos 3.6.1 to Kokkos 4.1.0.</p>
</sec>
<sec>
<title>5.3 CPU results</title>
<p>The original <sc>mk</sc>F<sc>it</sc> application is parallelized using the Threading Building Blocks (TBB) library from Intel, so we also used TBB as the reference native implementation on the CPU. This implementation is multithreaded and vectorized. It is worth noting that the original version was initially developed based on the &#x0201C;classic&#x0201D; Intel C&#x0002B;&#x0002B; Compiler (icc version 19), which led to improved vectorization performance compared to more recent compilers, resulting in <monospace>p2z</monospace> execution times that were approximately 2.7x faster. However, since this version is not supported anymore, we choose not to include it in our main results. <xref ref-type="fig" rid="F7">Figure 7</xref> shows the throughput performance of the <monospace>p2z</monospace> and <monospace>p2r</monospace> benchmarks on a two-socket system equipped with Intel Xeon Gold 6248 CPUs. All implementations were compiled with gcc except stdpar, which was compiled with nvc&#x0002B;&#x0002B;, and SYCL, which was compiled with dpcpp. Nonstandard options such as <monospace>-ffast-math</monospace> are not included in the compilation since, while they may help vectorizing loops including trigonometric functions, they do not guarantee numerical reproducibility.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Throughput measurements for the <monospace>p2z</monospace> <bold>(top)</bold> and <monospace>p2r</monospace> <bold>(bottom)</bold> benchmarks on a two-socket system equipped with Intel Xeon Gold 6248 CPUs. The performance of each implementation is compared to the performance of the native TBB version.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1485344-g0007.tif"/>
</fig>
<p>Using the portability layers, we are able to achieve throughput equal to or better than 70% of the original, native performance on the CPU for most implementations. The Alpaka implementation of the <monospace>p2z</monospace> benchmark actually outperformed the TBB reference implementation. The reason for the better performance of <monospace>p2z</monospace> Alpaka is not currently known, and the results might be platform dependent. Note that the <monospace>p2z</monospace> Alpaka implementation uses the OpenMP execution backend, while the <monospace>p2r</monospace> Alpaka implementation relies on the TBB execution backend. The <monospace>p2r</monospace> SYCL implementation only achieves 27% of the reference implementation, but unlike many of the other implementations, SYCL is a language extension and depends heavily on compiler optimizations rather than a performance-tuned library. When optimizing these implementations, considerations included deciding on an optimal data layout that works for both CPUs and GPUs and ensuring that the loops are properly vectorized when run on the CPU.</p>
</sec>
</sec>
<sec id="s6">
<title>6 Summary</title>
<p>In the project described in this paper, we ported two benchmark applications for charged-particle track reconstruction using state-of-the-art portability tools and compared the results to the native implementations. These benchmarks could form the backbone of a realistic track fitting algorithm. We have tested our ports on Intel CPUs and on GPUs from different vendors. In developing and testing these benchmarks, we found that the performance can vary significantly depending on the details of the implementation. Achieving optimal performance was not easy, even for relatively simple applications such as these. Each implementation took several iterations of profiling and development to achieve the results shown here. Ranking the difficulty of each porting effort would not be straightforward, since the lessons learned when implementing one solution would often apply to subsequent solutions, without the need for rediscovery. Even so, the steps that were taken to improve performance on one type of accelerator (NVIDIA GPUs, for example) did not necessarily translate into analogous gains on other types of GPUs or CPUs.</p>
<p>Several factors were found to have large effects on the final performance. We found that optimizing the memory layout and enabling explicit memory prepinning (in the case of NVIDIA GPUs) led to big improvements in the performance of each implementation, up to a factor of six speedup. The choice of compiler also changed the throughput performance on NVIDIA GPUs by an order of magnitude or more for OpenMP and OpenACC implementations. Because these compilers are undergoing very active development, regularly checking performance with the latest versions is important. The same is true for the tools themselves, especially with respect to support for AMD and Intel GPUs. For example, we observed a factor of two speedup on an Intel GPU when updating to a newer version of the Kokkos library.</p>
<p>In summary, we explored major portability solutions using two benchmark applications from HEP, including implementations using Alpaka, Kokkos, SYCL, stdpar, OpenMP, and OpenACC. Most of these solutions can give reasonable performance, on the same order of magnitude, on different architectures, but most cases required significant optimization. The ability to run algorithms on GPUs from different vendors will allow HEP experiments to take advantage of a variety of computing resources, including current, planned, and future HPCs. This paper demonstrates that while tools exist to effectively port existing CPU algorithms to GPUs, reaching the desired performance is not yet straightforward. Future development of these portability solutions and their application, or alternative methods, will be necessary for the successful operation and data analysis of these experiments.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://github.com/cerati/p2z-tests/releases/tag/v1.0">https://github.com/cerati/p2z-tests/releases/tag/v1.0</ext-link>; <ext-link ext-link-type="uri" xlink:href="https://github.com/cerati/p2r-tests/releases/tag/v1.0">https://github.com/cerati/p2r-tests/releases/tag/v1.0</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>HA: Investigation, Software, Writing &#x02013; review &#x00026; editing. SB: Investigation, Writing &#x02013; review &#x00026; editing. GC: Investigation, Project administration, Software, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. MK: Investigation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing, Software. KK: Investigation, Software, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SLa: Investigation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SLe: Investigation, Software, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. BN: Investigation, Software, Writing &#x02013; review &#x00026; editing. MR: Investigation, Software, Writing &#x02013; review &#x00026; editing. AR: Investigation, Project administration, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. DR: Investigation, Writing &#x02013; review &#x00026; editing. AS: Investigation, Software, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. CW: Investigation, Software, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. We thank the Joint Laboratory for System Evaluation (JLSE) for providing the resources for the performance measurements used in this work. This research used resources of the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725. This material is based upon work by the RAPIDS Institute and the &#x0201C;HEP Event Reconstruction with Cutting Edge Computing Architectures&#x0201D; project, supported by the U.S. Department of Energy, Office of Science, Office of Advanced Scientific Computing Research and Office of High-Energy Physics, Scientific Discovery through Advanced Computing (SciDAC) program. This work was supported by the National Science Foundation under Cooperative Agreements OAC-1836650 and PHY-2323298; by the U.S. Department of Energy, Office of Science, Office of High Energy Physics under Award Number 89243023SSC000116; and by the U.S. Department of Energy, Office of Science, Office of High Energy Physics, High Energy Physics Center for Computational Excellence (HEP-CCE) at Fermi National Accelerator Laboratory under B&#x00026;R KA2401045. This manuscript has been authored by UT-Battelle, LLC under Contract No. DE-AC05-00OR22725 with the U.S. Department of Energy and by Fermi Research Alliance, LLC under Contract No. DE-AC02-07CH11359 with the U.S. Department of Energy, Office of Science, Office of High Energy Physics. The publisher, by accepting the article for publication, acknowledges that the U.S. Government retains a non-exclusive, paid up, irrevocable, world-wide license to publish or reproduce the published form of the manuscript, or allow others to do so, for U.S. Government purposes. 
The DOE will provide public access to these results in accordance with the DOE Public Access Plan (<ext-link ext-link-type="uri" xlink:href="http://energy.gov/downloads/doe-public-access-plan">http://energy.gov/downloads/doe-public-access-plan</ext-link>).</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn0001"><p><sup>1</sup>Intel llvm/sycl branch, <ext-link ext-link-type="uri" xlink:href="https://github.com/intel/llvm/tree/70c2dc6dcf73f645248aa7c70c8cefdabf37e9b7">https://github.com/intel/llvm/tree/70c2dc6dcf73f645248aa7c70c8cefdabf37e9b7</ext-link>.</p></fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Albrecht</surname> <given-names>J.</given-names></name> <name><surname>Alves</surname> <given-names>A. A.</given-names></name> <name><surname>Amadio</surname> <given-names>G.</given-names></name> <name><surname>Andronico</surname> <given-names>G.</given-names></name> <name><surname>Anh-Ky</surname> <given-names>N.</given-names></name> <name><surname>Aphecetche</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>A roadmap for HEP software and computing R&#x00026;D for the 2020s</article-title>. <source>Comp. Softw. Big Sci</source>. <volume>3</volume>:<fpage>8</fpage>. <pub-id pub-id-type="doi">10.1007/s41781-018-0018-8</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Apollinari</surname> <given-names>G.</given-names></name> <name><surname>Br&#x000FC;ning</surname> <given-names>O.</given-names></name> <name><surname>Nakamoto</surname> <given-names>T.</given-names></name> <name><surname>Rossi</surname> <given-names>L.</given-names></name></person-group> (<year>2015</year>). <source>High Luminosity Large Hadron Collider HL-LH. Technical Report FERMILAB-PUB-15-699-TD</source>, CERN Yellow Rep. Available at: <ext-link ext-link-type="uri" xlink:href="https://cds.cern.ch/record/2116337">https://cds.cern.ch/record/2116337</ext-link></citation>
</ref>
<ref id="B3">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Atif</surname> <given-names>M.</given-names></name> <name><surname>Battacharya</surname> <given-names>M.</given-names></name> <name><surname>Calafiura</surname> <given-names>P.</given-names></name> <name><surname>Childers</surname> <given-names>T.</given-names></name> <name><surname>Dewing</surname> <given-names>M.</given-names></name> <name><surname>Dong</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2023</year>). <source>Evaluating Portable Parallelization Strategies for Heterogeneous Architectures in High Energy Physics</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://inspirehep.net/literature/2672556">https://inspirehep.net/literature/2672556</ext-link></citation>
</ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><collab>ATLAS Collaboration</collab></person-group> (<year>2008</year>). <article-title>The ATLAS experiment at the CERN Large Hadron Collider</article-title>. <source>JINST</source> <volume>3</volume>:<fpage>S08003</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/3/08/S08003</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Berkman</surname> <given-names>S.</given-names></name> <name><surname>Cerati</surname> <given-names>G.</given-names></name> <name><surname>Knoepfel</surname> <given-names>K.</given-names></name> <name><surname>Mengel</surname> <given-names>M.</given-names></name> <name><surname>Reinsvold Hall</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Optimizing the hit finding algorithm for liquid argon TPC neutrino detectors using parallel architectures</article-title>. <source>JINST</source> <volume>17</volume>:<fpage>P01026</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/17/01/P01026</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Bhattacharya</surname> <given-names>M.</given-names></name> <name><surname>Calafiura</surname> <given-names>P.</given-names></name> <name><surname>Childers</surname> <given-names>T.</given-names></name> <name><surname>Dewing</surname> <given-names>M.</given-names></name> <name><surname>Dong</surname> <given-names>Z.</given-names></name> <name><surname>Gutsche</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Portability: a necessary approach for future scientific software,&#x0201D;</article-title> in <source>Proceedings of the US Community Study on the Future of Particle Physics (Snowmass 2021</source>). Available at: <ext-link ext-link-type="uri" xlink:href="https://inspirehep.net/literature/2054702">https://inspirehep.net/literature/2054702</ext-link> <pub-id pub-id-type="pmid">32470916</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bird</surname> <given-names>I.</given-names></name></person-group> (<year>2011</year>). <article-title>Computing for the large Hadron Collider</article-title>. <source>Ann. Rev. Nucl. Part. Sc</source>. <volume>61</volume>, <fpage>99</fpage>&#x02013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-nucl-102010-130059</pub-id></citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bocci</surname> <given-names>A.</given-names></name> <name><surname>Czirkos</surname> <given-names>A.</given-names></name> <name><surname>Pilato</surname> <given-names>A. D.</given-names></name> <name><surname>Pantaleo</surname> <given-names>F.</given-names></name> <name><surname>Hugo</surname> <given-names>G.</given-names></name> <name><surname>Kortelainen</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Performance portability for the CMS reconstruction with Alpaka</article-title>. <source>J. Phys. Conf. Ser</source>. <volume>2438</volume>:<fpage>012058</fpage>. <pub-id pub-id-type="doi">10.1088/1742-6596/2438/1/012058</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Cerati</surname> <given-names>G.</given-names></name> <name><surname>Elmer</surname> <given-names>P.</given-names></name> <name><surname>Gartung</surname> <given-names>P.</given-names></name> <name><surname>Giannini</surname> <given-names>L.</given-names></name> <name><surname>Kortelainen</surname> <given-names>M.</given-names></name> <name><surname>Krutelyov</surname> <given-names>V.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Generalizing mkFit and its application to HL-LHC,&#x0201D;</article-title> in <source>Proceedings of the International Conference on Computing in High Energy and Nuclear Physics</source> (<publisher-loc>CHEP23</publisher-loc>). Available at: <ext-link ext-link-type="uri" xlink:href="https://www.epj-conferences.org/articles/epjconf/abs/2024/05/epjconf_chep2024_03019/epjconf_chep2024_03019.html">https://www.epj-conferences.org/articles/epjconf/abs/2024/05/epjconf_chep2024_03019/epjconf_chep2024_03019.html</ext-link></citation>
</ref>
<ref id="B10">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cerati</surname> <given-names>G.</given-names></name> <name><surname>Elmer</surname> <given-names>P.</given-names></name> <name><surname>Krutelyov</surname> <given-names>S.</given-names></name> <name><surname>Lantz</surname> <given-names>S.</given-names></name> <name><surname>Lefebvre</surname> <given-names>M.</given-names></name> <name><surname>Masciovecchio</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Parallelized Kalman-filter-based reconstruction of particle tracks on many-core processors and GPUs,&#x0201D;</article-title> in <source>Proceedings of Connecting The Dots/Intelligent Trackers 2017 (CTD/WIT 2017)</source> (<publisher-loc>Les Ulis</publisher-loc>: <publisher-name>EDP Sciences</publisher-name>), <fpage>00006</fpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><collab>CMS Collaboration</collab></person-group> (<year>2008</year>). <article-title>The CMS Experiment at the CERN LHC</article-title>. <source>JINST</source> <volume>3</volume>:<fpage>S08004</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/3/08/S08004</pub-id></citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><collab>CMS Collaboration</collab></person-group> (<year>2014</year>). <article-title>Description and performance of track and primary-vertex reconstruction with the CMS tracker</article-title>. <source>JINST</source> <volume>9</volume>:<fpage>P10009</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/9/10/P10009</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="book"><person-group person-group-type="author"><collab>CMS Offline Software, and Computing.</collab></person-group> (<year>2021</year>). <article-title>&#x0201C;Evolution of the CMS computing model towards Phase-2,&#x0201D;</article-title> in <source>Technical Report CMS-NOTE-2021-001, CERN-CMS-NOTE-2021-001</source> (<publisher-loc>Geneva</publisher-loc>: <publisher-name>CERN</publisher-name>).</citation>
</ref>
<ref id="B14">
<citation citation-type="book"><person-group person-group-type="author"><collab>CMS Offline Software and Computing</collab></person-group> (<year>2022</year>). <article-title>&#x0201C;CMS Phase-2 computing model: update document,&#x0201D;</article-title> in <source>Technical Report CMS-NOTE-2022-008, CERN-CMS-NOTE-2022-008</source> (<publisher-loc>Geneva</publisher-loc>: <publisher-name>CERN</publisher-name>).</citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><collab>DUNE Collaboration</collab></person-group> (<year>2020</year>). <article-title>Deep underground neutrino experiment (DUNE), far detector technical design report, volume I introduction to DUNE</article-title>. <source>JINST</source> <volume>15</volume>:<fpage>T08008</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/15/08/T08008</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="web"><person-group person-group-type="author"><collab>DUNE Collaboration</collab></person-group> (<year>2022</year>). <article-title>&#x0201C;DUNE offline computing conceptual design report,&#x0201D;</article-title> in <source>Technical Report FERMILAB-DESIGN-2022-01, Fermilab</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://inspirehep.net/literature/2171912">https://inspirehep.net/literature/2171912</ext-link></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Edwards</surname> <given-names>H. C.</given-names></name> <name><surname>Trott</surname> <given-names>C. R.</given-names></name> <name><surname>Sunderland</surname> <given-names>D.</given-names></name></person-group> (<year>2014</year>). <article-title>Kokkos: enabling manycore performance portability through polymorphic memory access patterns</article-title>. <source>J. Parall. Distrib. Comp</source>. <volume>74</volume>, <fpage>3202</fpage>&#x02013;<lpage>3216</lpage>. <pub-id pub-id-type="doi">10.1016/j.jpdc.2014.07.003</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fr&#x000FC;hwirth</surname> <given-names>R.</given-names></name></person-group> (<year>1987</year>). <article-title>Application of Kalman filtering to track and vertex fitting</article-title>. <source>Nucl. Instrum. Meth</source>. <volume>A262</volume>, <fpage>444</fpage>&#x02013;<lpage>450</lpage>. <pub-id pub-id-type="doi">10.1016/0168-9002(87)90887-4</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Jones</surname> <given-names>C. D.</given-names></name> <name><surname>Paterno</surname> <given-names>M.</given-names></name> <name><surname>Kowalkowski</surname> <given-names>J.</given-names></name> <name><surname>Sexton-Kennedy</surname> <given-names>L.</given-names></name> <name><surname>Tanenbaum</surname> <given-names>W.</given-names></name></person-group> (<year>2006</year>). <article-title>&#x0201C;The new CMS event data model and framework,&#x0201D;</article-title> in <source>Proceedings of the International Conference on Computing in High Energy and Nuclear Physics (CHEP06)</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://indico.cern.ch/event/408139/contributions/979800/attachments/815724/1117731/FrameworkPaper.pdf">https://indico.cern.ch/event/408139/contributions/979800/attachments/815724/1117731/FrameworkPaper.pdf</ext-link></citation>
</ref>
<ref id="B20">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kortelainen</surname> <given-names>M. J.</given-names></name> <name><surname>Kwok</surname> <given-names>M.</given-names></name> <name><surname>Childers</surname> <given-names>T.</given-names></name> <name><surname>Strelchenko</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Porting CMS heterogeneous pixel reconstruction to Kokkos,&#x0201D;</article-title> in <source>25th International Conference on Computing in High Energy and Nuclear Physics (CHEP 2021)</source>, 03034. Available at: <ext-link ext-link-type="uri" xlink:href="https://www.epj-conferences.org/articles/epjconf/abs/2021/05/epjconf_chep2021_03034/epjconf_chep2021_03034.html">https://www.epj-conferences.org/articles/epjconf/abs/2021/05/epjconf_chep2021_03034/epjconf_chep2021_03034.html</ext-link></citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lantz</surname> <given-names>S.</given-names></name> <name><surname>McDermott</surname> <given-names>K.</given-names></name> <name><surname>Reid</surname> <given-names>M.</given-names></name> <name><surname>Riley</surname> <given-names>D.</given-names></name> <name><surname>Wittich</surname> <given-names>P.</given-names></name> <name><surname>Berkman</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Speeding up particle track reconstruction using a parallel Kalman filter algorithm</article-title>. <source>J. Instrument</source>. <volume>15</volume>, <fpage>P09030</fpage>&#x02013;<lpage>P09030</lpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/15/09/P09030</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>S.</given-names></name> <name><surname>Vetter</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;OpenARC: Open accelerator research compiler for directive-based, efficient heterogeneous computing,&#x0201D;</article-title> in <source>HPDC&#x00027;14: Proceedings of the ACM Symposium on High-Performance Parallel and Distributed Computing, Short Paper</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>). <pub-id pub-id-type="doi">10.1145/2600212.2600704</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Matthes</surname> <given-names>A.</given-names></name> <name><surname>Widera</surname> <given-names>R.</given-names></name> <name><surname>Zenker</surname> <given-names>E.</given-names></name> <name><surname>Worpitz</surname> <given-names>B.</given-names></name> <name><surname>Huebl</surname> <given-names>A.</given-names></name> <name><surname>Bussmann</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Tuning and optimization for a variety of many-core architectures without changing a single line of implementation code using the Alpaka library,&#x0201D;</article-title> in <source>High Performance Computing. ISC High Performance 2017. Lecture Notes in Computer Science, 10524</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>). <pub-id pub-id-type="doi">10.1007/978-3-319-67630-2_36</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Megino</surname> <given-names>F. B.</given-names></name> <name><surname>Bryant</surname> <given-names>L.</given-names></name> <name><surname>Hufnagel</surname> <given-names>D.</given-names></name> <name><surname>Anampa</surname> <given-names>K. H.</given-names></name></person-group> (<year>2023</year>). <source>US ATLAS and US CMS HPC and Cloud Blueprint</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://inspirehep.net/literature/2651770">https://inspirehep.net/literature/2651770</ext-link></citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Trott</surname> <given-names>C. R.</given-names></name> <name><surname>Lebrun-Grandi</surname> <given-names>D.</given-names></name> <name><surname>Arndt</surname> <given-names>D.</given-names></name> <name><surname>Ciesko</surname> <given-names>J.</given-names></name> <name><surname>Dang</surname> <given-names>V.</given-names></name> <name><surname>Ellingwood</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Kokkos 3: programming model extensions for the exascale era</article-title>. <source>IEEE Transactions on Parall. Distrib. Syst</source>. <volume>33</volume>, <fpage>805</fpage>&#x02013;<lpage>817</lpage>. <pub-id pub-id-type="doi">10.1109/TPDS.2021.3097283</pub-id></citation>
</ref>
</ref-list>
</back>
</article>