<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. High Perform. Comput.</journal-id>
<journal-title-group>
<journal-title>Frontiers in High Performance Computing</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. High Perform. Comput.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2813-7337</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fhpcp.2026.1664774</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Toward energy-efficiency: CNTD_MERIC approach for energy-aware MPI applications</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Ad Dooja</surname> <given-names>Kashaf</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<uri xlink:href="https://loop.frontiersin.org/people/2994138"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yasal</surname> <given-names>Osman</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<uri xlink:href="https://loop.frontiersin.org/people/3227436"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Vysocky</surname> <given-names>Ondrej</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Riha</surname> <given-names>Lubomir</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Cesarini</surname> <given-names>Daniele</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Bartolini</surname> <given-names>Andrea</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Data Science and Computation, University of Bologna</institution>, <city>Bologna</city>, <country country="it">Italy</country></aff>
<aff id="aff2"><label>2</label><institution>Infrastructure Research Lab, IT4Innovations, VSB &#x02013; Technical University of Ostrava</institution>, <city>Ostrava</city>, <country country="cz">Czechia</country></aff>
<aff id="aff3"><label>3</label><institution>HPC Department, CINECA</institution>, <city>Casalecchio di Reno</city>, <country country="it">Italy</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Electrical, Electronic and Information Engineering &#x0201C;Guglielmo Marconi&#x0201D;, University of Bologna</institution>, <city>Bologna</city>, <country country="it">Italy</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Kashaf Ad Dooja, <email xlink:href="mailto:kashaf.addooja2@unibo.it">kashaf.addooja2@unibo.it</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-04-08">
<day>08</day>
<month>04</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>4</volume>
<elocation-id>1664774</elocation-id>
<history>
<date date-type="received">
<day>12</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>13</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>03</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Ad Dooja, Yasal, Vysocky, Riha, Cesarini and Bartolini.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Ad Dooja, Yasal, Vysocky, Riha, Cesarini and Bartolini</copyright-holder>
<license>
<ali:license_ref start_date="2026-04-08">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Energy efficiency is a major challenge in High-Performance Computing (HPC) systems, constraining their scale, performance, and sustainability. Despite technological and research progress, there is still a lack of software methods to measure and assess the energy efficiency of computing codes at scale. This is further exacerbated by the emergence of newer ISAs in the HPC computing spectrum with non-unified interfaces for power and energy monitoring. In this work, we present CNTD_MERIC, which integrates two state-of-the-art energy monitoring and optimization libraries for HPC systems, COUNTDOWN and MERIC. COUNTDOWN is an energy-aware runtime system for MPI applications. MERIC is a platform-agnostic runtime system and energy measurement library that optimizes energy efficiency by adjusting hardware configurations. CNTD_MERIC combines the benefits of these two approaches with low overhead, resulting in a portable power management runtime system for MPI applications. We evaluated the integrated library on both ARM and x86 compute nodes in the production environment of the IT4Innovations supercomputing center (IT4I). The results show that CNTD_MERIC achieves similar performance to the original COUNTDOWN and MERIC implementations in terms of energy optimization and power/energy measurement, with negligible overheads within &#x02212;5% to &#x0002B;3% compared to the original COUNTDOWN configurations. We also used CNTD_MERIC for a multi-architecture (x86 and ARM) comparison between Intel Sapphire Rapids and A64FX processors. The results indicate that A64FX achieves significantly lower execution time, reduced energy-to-solution, and lower average power consumption (110&#x02013;132 vs. 400&#x02013;590 W), confirming its efficiency for energy-efficient HPC systems.</p></abstract>
<kwd-group>
<kwd>A64FX</kwd>
<kwd>Energy efficient computing</kwd>
<kwd>HPC</kwd>
<kwd>MPI</kwd>
<kwd>power management</kwd>
<kwd>profiling</kwd>
</kwd-group>
<funding-group>
 <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was partly supported by the EuroHPC JU Pilot for the exascale EUPEX project (g.a. 101033975), the EuroHPC JU SEANERGYS project (g.a. 101177590), and the SPOKE 1: Future HPC and Big Data by PNRR. This work was also supported by the Ministry of Education, Youth and Sports of the Czech Republic through the e-INFRA CZ (ID:90254).</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="2"/>
<equation-count count="1"/>
<ref-count count="30"/>
<page-count count="13"/>
<word-count count="8374"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Architecture and Systems</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>High-Performance Computing (HPC) is entering an era where energy efficiency, once considered a secondary design objective, has become a primary factor in system design and operations. Over the last two decades, this has meant: (i) the design of computing architectures and systems with a higher FLOPS/Watt ratio<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref>, (ii) the development of hardware extensions to support energy proportionality, such as performance states, dynamic voltage and frequency scaling, and low-power idle states (<xref ref-type="bibr" rid="B17">Kodama et al., 2020</xref>; <xref ref-type="bibr" rid="B14">Jadhav et al., 2022</xref>), (iii) the development of integrated hardware extensions for monitoring power and energy consumption at runtime (<xref ref-type="bibr" rid="B27">Vysocky et al., 2018</xref>; <xref ref-type="bibr" rid="B10">Hackenberg et al., 2014</xref>), and (iv) the development of system-level and user-level power management runtime systems to control and optimize energy and power consumption during HPC production (<xref ref-type="bibr" rid="B2">Cesarini et al., 2021</xref>; <xref ref-type="bibr" rid="B6">Corbal&#x000E1;n and Brochard, 2018</xref>; <xref ref-type="bibr" rid="B19">Madella et al., 2025</xref>).</p>
<p>In the pursuit of higher peak performance amid the stagnation of lithographic technology scaling gains, HPC systems have increasingly prioritized efficiency by favoring designs with simpler cores and SIMD acceleration. Over the years, this trend has led to heterogeneous node architectures and the introduction of RISC ISAs into the HPC race. The Fujitsu A64FX (<xref ref-type="bibr" rid="B29">Yoshida, 2018</xref>) was the first ARM-based HPC processor to be deployed at a large scale, powering the Fugaku supercomputer. Fugaku ranked &#x00023;1 in the Top500 list from June 2020 to June 2022 and, as of June 2025, is ranked 7th.<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref> The A64FX is built on the Armv8.2-A architecture with the Scalable Vector Extension (SVE) and features 48 compute cores, second-generation high-bandwidth memory (HBM2), and vector processing units. Designed for the Fugaku supercomputer, the A64FX offers competitive power efficiency for its workloads, providing a promising basis for the continued development of energy-efficient HPC architectures (<xref ref-type="bibr" rid="B24">Sreepathi and Taylor, 2021</xref>).</p>
<p>In HPC, performance and capability are attained by distributing applications across multiple compute nodes interconnected by low-latency networks. The Message Passing Interface (MPI) is the de facto standard runtime for exchanging data and synchronizing workloads (<xref ref-type="bibr" rid="B12">Hsu and Feng, 2005</xref>; <xref ref-type="bibr" rid="B28">Walker, 1994</xref>). In recent years, several power management runtime systems have been proposed for HPC systems (<xref ref-type="bibr" rid="B2">Cesarini et al., 2021</xref>; <xref ref-type="bibr" rid="B6">Corbal&#x000E1;n and Brochard, 2018</xref>). Following the HPC PowerStack initiative architecture, these runtime systems can be categorized into three layers: job-manager, system-manager, and node-manager (<xref ref-type="bibr" rid="B19">Madella et al., 2025</xref>). The node manager controls and monitors hardware at the node level and applies power management policies for the processor and node. The system manager is responsible for the system-wide power budget and constraints. It monitors performance and power telemetry in real time and implements power-shifting and reallocation algorithms to track the power demand and redistribute capacity to maximize job performance under a power cap. The job manager focuses on lowering the energy consumption of each job by instrumenting parallel applications to isolate their computational phases and applying application-aware power-management policies.</p>
<p>The primary goal of this layered approach is to convert application inefficiencies into energy savings while minimizing performance loss. Indeed, recent works have studied the sustainability of HPC production from the perspective of carbon footprint, showing that both Scope2 and Scope3 emissions are equally important during an HPC system&#x00027;s lifetime (<xref ref-type="bibr" rid="B18">Li et al., 2023</xref>; <xref ref-type="bibr" rid="B4">Chadha et al., 2023</xref>). Scope2 relates to the carbon emissions from energy consumption, while Scope3 relates to the carbon emissions from the fabrication and installation of the computing system. Given the similar weight of Scope2 and Scope3 carbon emissions in today&#x00027;s supercomputer systems and electricity mix composition, reducing energy consumption at the cost of longer execution time increases the Scope3-accounted cost and may result in a net loss in sustainability. It is thus essential to monitor the actual energy consumption (energy-to-solution) together with the actual carbon intensity and to minimize the overhead of power management policies in terms of the application&#x00027;s performance (time-to-solution).</p>
<p>To minimize this overhead, these power-management runtime systems employ policies that identify inefficiencies during application execution phases and selectively reduce hardware performance/resources when workloads are constrained by communication, memory, or I/O. During such phases, scaling back hardware resources does not impair application performance. COUNTDOWN (<xref ref-type="bibr" rid="B2">Cesarini et al., 2021</xref>, <xref ref-type="bibr" rid="B3">2020</xref>) is one such runtime system; it focuses on detecting and isolating communication-bound phases in MPI applications and reducing the performance of the computing unit during these moments. The MERIC (<xref ref-type="bibr" rid="B27">Vysocky et al., 2018</xref>) runtime system and energy measurement library, which optimizes energy efficiency by adjusting hardware configurations, focuses on identifying application phases and reporting energy consumption, enabling searches for optimal operating points that minimize energy consumption for each phase.</p>
<p>Energy consumption can be obtained by reading vendor-specific hardware interfaces such as RAPL (for Intel and AMD CPUs), NVML (for NVIDIA GPUs), or Fujitsu-provided APIs for the A64FX (ARM). It is important to note that the accuracy and reliability of these monitoring interfaces are not uniform and can vary between CPU/GPU architectures and vendors. This variability presents a challenge for porting runtime systems developed for one architecture, such as Intel-specific interfaces, to newer architectures such as the ARM-based A64FX processor. For instance, MERIC already supports the A64FX, while COUNTDOWN only targets Intel processors and NVIDIA GPUs.</p>
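<p>As a concrete illustration of such an interface, on Linux x86 systems the RAPL package energy is exposed through the powercap sysfs as a cumulative microjoule counter. The following minimal C&#x0002B;&#x0002B; sketch, under the assumption that the package domain is mounted at the usual <monospace>intel-rapl:0</monospace> node (the path can differ between systems, and counter wraparound is ignored for brevity), differences two readings to estimate energy over an interval:</p>
<preformat>// Minimal sketch: estimate package energy over an interval via Linux powercap.
// Assumption: the RAPL package domain is exposed at this sysfs path; the node
// name (intel-rapl:0) may differ, and wraparound of the cumulative counter
// (bounded by max_energy_range_uj) is ignored here for brevity.
#include &lt;chrono>
#include &lt;fstream>
#include &lt;iostream>
#include &lt;thread>

static unsigned long long read_energy_uj() {
    std::ifstream f("/sys/class/powercap/intel-rapl:0/energy_uj");
    unsigned long long uj = 0;
    f >> uj;  // cumulative package energy in microjoules
    return uj;
}

int main() {
    const unsigned long long begin = read_energy_uj();
    std::this_thread::sleep_for(std::chrono::seconds(1));
    const unsigned long long end = read_energy_uj();
    std::cout &lt;&lt; "Package energy: " &lt;&lt; (end - begin) / 1e6 &lt;&lt; " J over 1 s\n";
}</preformat>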
<p>To address this limitation, in this paper, we introduce CNTD_MERIC, a framework that integrates the MERIC runtime system and energy measurement library (<xref ref-type="bibr" rid="B27">Vysocky et al., 2018</xref>) with the COUNTDOWN runtime system. CNTD_MERIC brings architecture-independent power and energy monitoring capabilities to MPI applications running on heterogeneous systems.</p>
<p>The following are the main contributions of this paper:</p>
<list list-type="order">
<list-item><p>We extended COUNTDOWN&#x00027;s internal structures to access MERIC&#x00027;s architecture-specific energy counters. We named this integration CNTD_MERIC. This integration introduces a new abstraction layer that reconciles the MERIC C&#x0002B;&#x0002B; monitoring framework with COUNTDOWN&#x00027;s C-based runtime, enabling synchronized and architecture-independent energy monitoring across heterogeneous HPC systems.</p></list-item>
<list-item><p>To assess the overhead of the proposed integration, we deployed CNTD_MERIC on state-of-the-art (SoA) compute nodes based on Intel&#x00027;s Sapphire Rapids microarchitecture at IT4I and compared it to the vanilla COUNTDOWN implementation. When executing the NAS benchmarks, CNTD_MERIC shows negligible time-to-solution overheads with respect to the original COUNTDOWN. This proves that integrating MERIC in COUNTDOWN abstracts the power monitoring hardware (HW) interface without introducing significant overhead.</p></list-item>
<list-item><p>We instantiated CNTD_MERIC on the A64FX compute nodes at IT4I to perform a comparative analysis between state-of-the-art (SoA) Intel Sapphire Rapids processors and the A64FX using the NAS parallel benchmarks. This analysis compares the Fujitsu A64FX processor with one of Intel&#x00027;s most advanced server CPUs, the Xeon Max 9468 from the Sapphire Rapids family. The A64FX achieved significantly lower execution time, reduced energy-to-solution, and lower average power consumption, operating between 110 and 132 W, compared to 400&#x02013;590 W on the Intel processors. This performance highlights its suitability for power-aware high-performance computing and demonstrates the ability of CNTD_MERIC to enable such monitoring on ARM platforms.</p></list-item>
</list>
<p>The paper is organized as follows. Section 2 reviews related work and the limitations of existing power management libraries. Section 3 provides the design details and implementation of the CNTD_MERIC integration. Section 4 presents our experimental setup and results on the A64FX and Intel Xeon Max 9468 systems using the NAS parallel applications.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Power and energy efficiency have become primary design goals in high-performance computing (HPC), particularly as we progress toward exascale systems. Many runtime solutions have been proposed to reduce energy consumption while maintaining acceptable performance levels. Key threads of research include <bold>runtime energy management</bold> and <bold>energy-aware frameworks</bold>. Most of them rely on exploiting <bold>communication slack, workload imbalance</bold>, or application phases using <bold>dynamic voltage and frequency scaling (DVFS)</bold> and <bold>low-power state transitions</bold>.</p>
<p>Early software-based energy optimization techniques, such as Adagio (<xref ref-type="bibr" rid="B22">Rountree et al., 2009</xref>) and EAR (<xref ref-type="bibr" rid="B5">Corbalan et al., 2020</xref>), leverage DVFS to reduce energy consumption in MPI applications by exploiting communication slack and load imbalance. Adagio relies on <bold>offline profiling</bold> and <bold>manual phase annotation</bold> to model frequency scaling, limiting its adaptability to dynamic workloads. EAR, in contrast, introduces a transparent runtime approach through its DynAIS algorithm, which automatically detects iterative application regions (e.g., loops) and computes dynamic application signatures to drive DVFS decisions without requiring prior application knowledge or user intervention. However, both systems primarily target long-running iterative phases, leaving fine-grained power management opportunities unexplored.</p>
<p>To overcome these limitations, <xref ref-type="bibr" rid="B2">Cesarini et al. (2021)</xref> introduced COUNTDOWN, a tool for fine-grained, performance-neutral energy saving. COUNTDOWN specifically targets MPI communication and synchronization phases. It addresses the problem of overheads in transitioning to low-power states by using a timeout strategy that distinguishes between short and long MPI phases. It dynamically intercepts MPI calls and injects DVFS and other power-state transitions (e.g., P-, C-, and T-states) only when beneficial. A notable strength of COUNTDOWN is its transparency: it requires no modifications to the application source code and works seamlessly through dynamic linking. Evaluation on the NAS benchmarks and Quantum ESPRESSO showed up to 50% energy savings with less than 5% performance overhead. However, its energy-saving capabilities are currently limited to Intel-based platforms with RAPL support, restricting its portability to emerging architectures such as ARM-based processors (e.g., A64FX). In a follow-up work, COUNTDOWN Slack (<xref ref-type="bibr" rid="B3">Cesarini et al., 2020</xref>), the authors specifically focus on exploiting <bold>slack time</bold>. While the original COUNTDOWN applied power saving to all MPI communication phases, COUNTDOWN Slack introduces artificial barriers to isolate slack time and only reduces power during non-critical waiting periods. This enhancement avoids slowing down data transfers and preserves performance, with less than 3% overhead, while still achieving significant energy savings (up to 22% in large-scale runs).</p>
<p>READEX (<xref ref-type="bibr" rid="B23">Schuchart et al., 2017</xref>) does not focus on a single specific type of workload, but splits a complete application execution into phases of a similar workload type (similar hardware resource requirements) at a granularity of tens up to hundreds of milliseconds. In a complex application, this change in boundedness can be expected to occur frequently. The MERIC runtime system exploits energy savings by changing hardware and system-software parameters to fit each application phase, delivering power savings without extending the application runtime.</p>
<p>Another library, EVeREST-C (<xref ref-type="bibr" rid="B30">Yue et al., 2025</xref>), is a runtime tool that helps save CPU energy in HPC applications by adjusting core and uncore frequencies using DVFS. It relies on a simple metric, <bold>Instructions Per Second (IPS)</bold>, to detect whether code is compute- or memory-bound, and applies energy-saving changes without modifying the application. It achieved up to 11% energy savings on SPEC benchmarks and an 8% improvement over other MPI-based methods, with less than 3% performance overhead. While the IPS-based design is intended to be multi-architecture, the current results are limited to x86 platforms, and uncore frequency control is only available on Intel processors. The approach may also miss very short phases, requires users to specify an acceptable performance degradation, and can be less effective with irregular communication patterns in MPI applications.</p>
<p>In the literature, various DVFS-aware runtime frameworks have been proposed. Each targets different trade-offs between energy efficiency and application performance. For instance, CPU MISER (<xref ref-type="bibr" rid="B8">Ge et al., 2007</xref>) is a runtime system that leverages user-defined performance thresholds and adjusts CPU frequency based on observed performance counters. While it achieves significant energy savings, especially in memory-bound applications, its effectiveness is highly dependent on platform-specific counter configuration and static thresholds, making it less adaptable across diverse architectures or dynamic workloads. The Uncore Power Scavenger (UPS) (<xref ref-type="bibr" rid="B9">Gholkar et al., 2019</xref>) targets uncore frequency scaling, adjusting this power domain to save energy during compute-bound phases, but it relies on hardware-specific monitoring such as DRAM power counters and is limited to Intel processors. <xref ref-type="bibr" rid="B26">Venkatesh et al. (2015)</xref> propose EAM (Energy-Aware MPI), an application-oblivious runtime that reduces power during MPI slack time using DVFS and idling, integrated into the MVAPICH2 library. It achieves up to 41% energy savings under 4% performance loss across many applications. However, its dependence on accurate slack prediction limits performance with irregular or fine-grained communication.</p>
<p>Recently, there has been increased focus on optimizing performance and energy efficiency on ARM-based HPC systems such as Fugaku, which is powered by the Fujitsu A64FX processors. <xref ref-type="bibr" rid="B17">Kodama et al. (2020)</xref> performed a foundational analysis of the Fugaku supercomputer&#x00027;s A64FX processor by exploring three key power control modes: eco, boost, and core retention, using micro-benchmarks (STREAM and DGEMM). Their study showed that combining eco mode with core retention can significantly reduce power without performance loss in memory-bound workloads, while boost mode improves compute-bound performance. However, eco mode only saves power for memory-bound tasks, and boosting the performance of compute-bound tasks results in a higher energy cost. The analysis was limited to synthetic workloads. Moreover, power consumption varies considerably across different parts of the supercomputer, making uniform system-wide optimization difficult. Expanding this work, the authors evaluated Fugaku&#x00027;s A64FX processor using the SPEC CPU and SPEC OMP benchmark suites to compare its performance and power behavior with Intel Xeon processors. The comparison confirmed the A64FX&#x00027;s advantages in memory-intensive workloads and validated the earlier findings on power-saving modes. In this study, the authors also highlighted the higher performance of the Fujitsu compiler over GCC. Even so, power-saving modes reduce energy but often sacrifice performance (<xref ref-type="bibr" rid="B15">Kodama et al., 2021</xref>).</p>
<p><xref ref-type="bibr" rid="B16">Kodama et al. (2022)</xref> further explored this work on real-world HPC workloads from the SPEC suite. They demonstrate that a combination of eco and boost mode could reduce energy consumption by 17% along with slight performance gains 2%, validating earlier works in practical contexts. However, it excelled in scalable, memory-intensive HPC tasks with strong power efficiency, but its reliance on manual tuning highlighted challenges for broader adoption. <xref ref-type="bibr" rid="B1">Alappat et al. (2021)</xref> proposed the Execution-Cache-Memory (ECM) model for A64FX to predict and analyze energy-performance tradeoffs. It shows offline tuning of frequency and memory bandwidth parameters can help to save energy from 18 to 31%. However, their models are static and do not support dynamic workloads.</p>
<p>A review of the studies presented above demonstrates significant progress in understanding and controlling energy usage on A64FX systems. However, they often rely on offline profiling, static policies, or coarse-grained runtime controls. They do not provide MPI communication awareness, automated DVFS control, or direct energy monitoring on ARM-based supercomputers. To address these limitations, CNTD_MERIC leverages vendor-specific interfaces (e.g., Fujitsu A64FX APIs) through MERIC to provide hardware-level energy measurement capabilities. This unified framework combines the runtime adaptive DVFS and timeout-based logic of COUNTDOWN with MERIC&#x00027;s monitoring support, targeting x86 and ARM-based architectures. It addresses a critical gap in MPI runtime energy/power management by enabling dynamic, system-level power and energy monitoring specifically tailored for ARM-based HPC architectures such as the A64FX.</p></sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<p>This section outlines the methodology we propose to develop a unified framework called CNTD_MERIC, which integrates the MERIC energy monitoring library with the COUNTDOWN power-aware runtime system. This integration allows platform-independent energy measurement and profiling of HPC applications. To describe the integration process, we first introduce the internal structure and key functions of MERIC and COUNTDOWN that are relevant to this work. We then present the steps followed to create the CNTD_MERIC framework.</p>
<sec>
<label>3.1</label>
<title>MERIC key components</title>
<p>MERIC is a C&#x0002B;&#x0002B;-based energy monitoring library designed to provide access to domain/architecture-specific energy counters such as RAPL, A64FX, OCC, or HDEEM. MERIC uses C&#x0002B;&#x0002B; templates to create specialized logic for each power monitoring system, which we will refer to later as a domain. These templates encapsulate domain-specific implementation under a unified generic interface that can be specified at compile time based on the target domain. For instance, the generic function <monospace>measureEnergy&#x0003C;DomainType, CounterType&#x0003E;</monospace> can be instantiated as <monospace>measureEnergy&#x0003C;RAPL, uint64_t&#x0003E;</monospace> or <monospace>measureEnergy&#x0003C;A64FX, unsigned long long int&#x0003E;</monospace> depending on the domain. Each domain class encapsulates the logic for accessing energy data from its respective energy counters. This design allows MERIC to support heterogeneous architectures while providing a unified interface for energy measurement. In this work, we utilize five core functions of MERIC, for which we provide a C-compatible wrapper called <bold>MERICext</bold>, compiled into a static library (libmeric_ext.a) and linked into COUNTDOWN at build time. These MERIC core functions and the associated MERICext functions are listed in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>COUNTDOWN_MERIC integration: MERICext API description.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left">Function</th>
<th valign="top" align="left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">extlib_init()</td>
<td valign="top" align="left">Initializes enabled domains and sets up domain-specific APIs (e.g., RAPL, A64FX).</td>
</tr>
<tr>
<td valign="top" align="left">extlib_read_energy_measurements()</td>
<td valign="top" align="left">Collects timestamped energy readings from all active domains.</td>
</tr>
<tr>
<td valign="top" align="left">extlib_calc_energy_consumption()</td>
<td valign="top" align="left">Computes energy consumed between two timestamps using domain-specific logic</td>
</tr>
<tr>
<td valign="top" align="left">extlib_free_energy_timestamp()</td>
<td valign="top" align="left">Frees memory allocated to store timestamped data.</td>
</tr>
<tr>
<td valign="top" align="left">extlib_close()</td>
<td valign="top" align="left">Finalizes the domains and releases resources.</td>
</tr></tbody>
</table>
</table-wrap>
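<p>To make this bridging pattern concrete, the following C&#x0002B;&#x0002B; sketch shows a domain-templated measurement function re-exported through an <monospace>extern "C"</monospace> entry point that a C runtime such as COUNTDOWN can call. The names <monospace>RaplDomain</monospace> and <monospace>extlib_read_rapl_uj</monospace> are hypothetical illustrations of the pattern, not MERIC&#x00027;s actual internals; the real C-compatible surface is the MERICext API of <xref ref-type="table" rid="T1">Table 1</xref>:</p>
<preformat>// Illustrative sketch of the MERICext bridging pattern. RaplDomain and
// extlib_read_rapl_uj are hypothetical names; the real C-compatible surface
// is the MERICext API listed in Table 1.
#include &lt;cstdint>

struct RaplDomain {
    // Stub standing in for a real read of a RAPL energy register.
    static uint64_t readCounter() { return 0; }
};

// Generic measurement routine, specialized per power-monitoring domain
// at compile time, as in measureEnergy&lt;RAPL, uint64_t>().
template &lt;typename DomainType, typename CounterType>
CounterType measureEnergy() {
    return static_cast&lt;CounterType>(DomainType::readCounter());
}

// C-compatible wrapper compiled into the static library: C++ templates
// cannot cross a C boundary, so each explicit instantiation is hidden
// behind a plain function that COUNTDOWN's C code can call.
extern "C" uint64_t extlib_read_rapl_uj() {
    return measureEnergy&lt;RaplDomain, uint64_t>();
}</preformat>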
</sec>
<sec>
<label>3.2</label>
<title>COUNTDOWN key components</title>
<p>COUNTDOWN is a runtime library designed to save energy in an MPI application without compromising performance. It implements a timeout strategy that is purely reactive and is triggered by the MPI primitives invoked by the application. This avoids changing the power states of the cores during rapid switches between application and MPI contexts, which would incur performance overhead without any reduction in energy or power. It is implemented in C and dynamically linked to MPI-based applications via a shared library (libcntd.so). COUNTDOWN periodically invokes sampling routines through the time-based sampling engine, implemented in sampling.c and driven by the time_sample() function. This function is invoked at regular, user-defined intervals during program execution. The sampling interval, which defines how often energy and performance data are collected, is determined during initialization. It can be explicitly set through the environment variable CNTD_SAMPLING_TIME (e.g., CNTD_SAMPLING_TIME = 2 for a 2-second interval); otherwise, it falls back to the compile-time default (DEFAULT_SAMPLING_TIME_REPORT, typically 1 s). Once the interval is set, COUNTDOWN configures a high-resolution POSIX timer, and each expiration triggers the time_sample() function via a registered signal handler. On each call, time_sample() collects a comprehensive sample of system metrics, including hardware performance counters, platform-specific energy values (e.g., RAPL), I/O statistics, and timestamps, using a double-buffering scheme to compute the difference between successive samples. These per-interval measurements are immediately processed for time-series reporting if enabled and later aggregated at program completion to produce summary statistics that provide a complete picture of the application&#x00027;s behavior and energy profile.</p>
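<p>A minimal C&#x0002B;&#x0002B; sketch of this time-based sampling mechanism is shown below. It assumes only what is described above: a POSIX interval timer whose expirations drive a stand-in for time_sample() at a period taken from CNTD_SAMPLING_TIME. It is a simplified illustration, not COUNTDOWN&#x00027;s actual sampling.c:</p>
<preformat>// Simplified sketch of a time-based sampling engine: a POSIX timer raises
// SIGALRM every interval, and the handler stands in for time_sample().
// Illustration of the described mechanism, not COUNTDOWN's actual code.
#include &lt;cstdlib>
#include &lt;signal.h>
#include &lt;time.h>
#include &lt;unistd.h>

static volatile sig_atomic_t samples_taken = 0;

// A real runtime would read counters into the second buffer and diff them
// against the previous sample (double buffering); since a signal handler
// must stay async-signal-safe, real code defers the heavy work.
static void time_sample(int) { ++samples_taken; }

int main() {
    // Sampling interval from CNTD_SAMPLING_TIME, defaulting to 1 s if unset.
    const char *env = std::getenv("CNTD_SAMPLING_TIME");
    const long interval_s = env ? std::atol(env) : 1;

    struct sigaction sa = {};
    sa.sa_handler = time_sample;
    sigaction(SIGALRM, &amp;sa, nullptr);

    // High-resolution POSIX timer; each expiration triggers the handler.
    timer_t timer_id;
    struct sigevent sev = {};
    sev.sigev_notify = SIGEV_SIGNAL;
    sev.sigev_signo = SIGALRM;
    timer_create(CLOCK_MONOTONIC, &amp;sev, &amp;timer_id);

    struct itimerspec its = {};
    its.it_value.tv_sec = interval_s;      // first expiration
    its.it_interval.tv_sec = interval_s;   // then periodic
    timer_settime(timer_id, 0, &amp;its, nullptr);

    while (samples_taken &lt; 5) pause();     // application work would run here
    return 0;
}</preformat>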
<p>COUNTDOWN supports two operational modalities to control the runtime behavior, configured through the CNTD_ENABLE environment variable.</p>
<list list-type="bullet">
<list-item><p><bold>Enable mode:</bold> it activates COUNTDOWN&#x00027;s power-saving algorithm to reduce energy consumption.</p></list-item>
<list-item><p><bold>Analysis mode:</bold> it performs energy-aware profiling of MPI applications, allowing users to collect runtime energy statistics for performance and efficiency analysis.</p></list-item>
</list>
</sec>
<sec>
<label>3.3</label>
<title>Implementation: CNTD_MERIC</title>
<p>We present the integration of COUNTDOWN and MERIC, called CNTD_MERIC, in <xref ref-type="fig" rid="F1">Figure 1</xref>. CNTD_MERIC exposes the same interface as the standard COUNTDOWN. As described earlier, to enable seamless integration, we developed a new wrapper layer called MERICext (extlib.cpp, meric_ext.h), which exposes a C-compatible interface and encapsulates MERIC&#x00027;s internal architecture-specific logic. MERICext acts as a bridge between MERIC and COUNTDOWN. It is compiled into a static library (libmeric_ext.a), which is linked into COUNTDOWN&#x00027;s shared library (libcntd.so) during the build process. This ensures that any MPI-based application linked with COUNTDOWN can be transparently profiled for energy consumption via the MERICext interface, without requiring modification of the application code. The integration proceeds as follows:</p>
<list list-type="order">
<list-item><p><bold>Wrapper initialization:</bold> in our implementation, during COUNTDOWN&#x00027;s initialization phase, the extlib_init() function is called. This function is developed to initialize the energy monitoring domains requested by the user (e.g., A64FX, RAPL, NVML, etc.). For each domain, this function uses preprocessor checks (e.g., <monospace>&#x00023;ifdef __aarch64__</monospace>) to verify support at compile time, and it calls the appropriate domain-specific <monospace>::init(is_detailed)</monospace> method to verify hardware presence at runtime. The core feature of this function is the assignment of function pointers (<monospace>energyMeasurementFns[index]</monospace>, <monospace>closeFns[index]</monospace>) to the corresponding explicitly instantiated template functions (e.g., <monospace>measureEnergy&#x0003C;meric::A64FX, unsigned long long int&#x0003E;</monospace>) for storing and processing timestamp samples. This initialization phase establishes the foundation for the runtime sampling.</p></list-item>
<list-item><p><bold>Energy sampling:</bold> after initialization, CNTD_MERIC enters COUNTDOWN&#x00027;s time-based sampling phase. The sampling interval, typically set to 1 s (configurable via an environment variable), determines how often energy data is collected. At the expiration of each interval, the <monospace>time_sample()</monospace> function is triggered, which calls MERICext&#x00027;s <monospace>extlib_read_energy_measurements()</monospace> function to record the raw measurements. During this process, two timestamps are recorded:</p></list-item></list>
<list list-type="bullet">
<list-item><p><bold>ts_begin:</bold> a pointer to a structure holding raw counter values at the start of the interval.</p></list-item>
<list-item><p><bold>ts_end:</bold> a pointer to a structure holding raw counter values at the end of the interval.</p></list-item>
</list>
<p>Each timestamp stores data in an ExtlibEnergyTimeStamp structure that contains the raw counter values for the enabled energy domains. The function <monospace>extlib_read_energy_measurements()</monospace> reads these counters using hardware-specific routines that access the low-level energy sensors. At this stage, the energy consumption for the interval is not calculated. This phase is solely responsible for gathering the counter data that will later be used to compute the difference between ts_end and ts_begin in the next phase.</p>
<list list-type="simple">
<list-item><p><bold>3. Energy computation:</bold> once both timestamps (ts_end, ts_begin) are available, CNTD_MERIC transitions to the computation phase, where the <monospace>extlib_calc_energy_consumption()</monospace> function is invoked to compute the energy consumed during the interval. Its implementation begins with validation checks to ensure that the provided ts_end and ts_begin timestamps are compatible. It then iterates through every counter in each enabled energy domain. For every counter, domain-specific logic embedded in the <monospace>::getResultValue()</monospace> method is applied. This method contains the essential domain-specific logic to calculate energy consumption in Joules, typically by computing the difference between the end and start counter values. This value is then propagated into COUNTDOWN&#x00027;s internal cumulative energy tracking structure.</p></list-item>
<list-item><p><bold>4. Memory cleanup and ts_begin update:</bold> after each interval, the result structure is de-allocated using the <monospace>extlib_free_energy_timestamp()</monospace> function. The ts_end value is then assigned to ts_begin to prepare for the next sampling interval.</p></list-item>
<list-item><p><bold>5. Finalization:</bold> at finalization, CNTD_MERIC calls <monospace>extlib_close()</monospace>, which invokes domain-specific cleanup routines (e.g., <monospace>A64FX::close()</monospace>) and de-allocates all memory associated with the energy measurement domains.</p></list-item>
</list>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Integration framework of COUNTDOWN and MERIC.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0001.tif">
<alt-text content-type="machine-generated">Flowchart diagram showing the interaction between MERIC, the MERICext wrapper, COUNTDOWN, and an application (App.x). MERICext (extlib.cpp, meric_ext.h) exposes a C-compatible interface that encapsulates MERIC&#x02019;s architecture-specific logic, acting as a bridge between MERIC and COUNTDOWN. It is compiled into a static library (libmeric_ext.a), which is linked into COUNTDOWN&#x02019;s shared library (libcntd.so) at build time. COUNTDOWN&#x02019;s Sampling.c uses the MERICext interface to perform energy measurements: it initializes the energy domain, takes begin and end readings, calculates energy consumption, and stores the results. The application (App.x) is an MPI-based program that initializes and finalizes the MPI environment, retrieves process rank and size. It is dynamically linked with COUNTDOWN&#x02019;s shared library, enabling transparent energy profiling without any modification to the application code. </alt-text>
</graphic>
</fig>
<p>The integration occurs inside COUNTDOWN&#x00027;s time_sample() function (<xref ref-type="statement" rid="algo1">Algorithm 1</xref>), where the periodic energy measurement is transparently delegated to MERICext. The logic proceeds as follows:</p>
<statement content-type="algorithm" id="algo1">
<label>Algorithm 1</label>
<title>Energy measurement and accumulation in COUNTDOWN.</title>
<p>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-i0001.tif"/>
</p>
</statement>
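<p>Because Algorithm 1 is reproduced as an image, the following C&#x0002B;&#x0002B;-style sketch restates its per-interval measurement and accumulation logic using the MERICext calls of <xref ref-type="table" rid="T1">Table 1</xref>. The layout of ExtlibEnergyTimeStamp and the exact signatures are assumptions for illustration; only the function names and the order of operations come from the description above:</p>
<preformat>// Sketch of Algorithm 1's per-interval logic via the MERICext API of Table 1.
// The ExtlibEnergyTimeStamp layout and these signatures are illustrative
// assumptions; only the function names are taken from the paper.
extern "C" {
    struct ExtlibEnergyTimeStamp;   // raw counter values of the enabled domains
    void extlib_init(void);
    struct ExtlibEnergyTimeStamp *extlib_read_energy_measurements(void);
    double extlib_calc_energy_consumption(struct ExtlibEnergyTimeStamp *end,
                                          struct ExtlibEnergyTimeStamp *begin);
    void extlib_free_energy_timestamp(struct ExtlibEnergyTimeStamp *ts);
    void extlib_close(void);
}

static double total_energy_j = 0.0;                // cumulative energy tracker
static ExtlibEnergyTimeStamp *ts_begin = nullptr;

void on_init() {
    extlib_init();                                 // set up the enabled domains
    ts_begin = extlib_read_energy_measurements();  // first counter reading
}

// Invoked at each expiration of the sampling timer (see Section 3.2).
void time_sample() {
    ExtlibEnergyTimeStamp *ts_end = extlib_read_energy_measurements();
    // Domain-specific diff of the end vs. begin counters, in Joules.
    total_energy_j += extlib_calc_energy_consumption(ts_end, ts_begin);
    extlib_free_energy_timestamp(ts_begin);        // release the old sample
    ts_begin = ts_end;                             // slide the window forward
}

void on_finalize() {
    extlib_free_energy_timestamp(ts_begin);
    extlib_close();                                // e.g., A64FX::close()
}</preformat>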
<p>This sampling process is repeated periodically throughout the application execution, allowing CNTD_MERIC to build a detailed profile of energy consumption over time. Linking CNTD_MERIC to an application is straightforward: it requires configuring the environment variable LD_PRELOAD with the path of the CNTD_MERIC runtime library, selecting the required domain, and launching the application as usual.</p>
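<p>For illustration, a launch might look as follows; the library path is a placeholder, and the CNTD_ENABLE value is left open since only the variable names are given above:</p>
<preformat># Hypothetical launch line; adjust paths and values to the actual installation.
export LD_PRELOAD=/path/to/libcntd.so   # CNTD_MERIC runtime library
export CNTD_SAMPLING_TIME=1             # sampling interval in seconds
export CNTD_ENABLE=...                  # selects Analysis or Enable mode (Section 3.2)
mpirun -np 48 ./app.x</preformat></sec></sec>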
<sec id="s4">
<label>4</label>
<title>Experimental results</title>
<p>This section presents the experimental setup and tests that we conducted.</p>
<sec>
<label>4.1</label>
<title>Experimental setup</title>
<p>The integration and evaluation of the CNTD_MERIC framework were performed on two different high-performance computing nodes provided by the IT4Innovations National Supercomputing Center. The first node is based on Intel&#x00027;s Xeon Max 9468 processor from the Sapphire Rapids family, launched in 2023 and manufactured using the Intel 7 (10 nm) process (<xref ref-type="bibr" rid="B25">Tiffany Trader, 2023</xref>). It supports Intel&#x00027;s Advanced Matrix Extensions (AMX) and is optimized for HPC and AI workloads (<xref ref-type="bibr" rid="B13">Intel Corporation, 2025</xref>). The second node features Fujitsu&#x00027;s ARM-based A64FX processors, launched in 2019 and built on TSMC&#x00027;s 7 nm technology (<xref ref-type="bibr" rid="B7">Fujitsu Global, 2020</xref>). The key specifications of each target system are listed in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>System architecture comparison.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left">Feature</th>
<th valign="top" align="left">Intel Xeon Max 9468</th>
<th valign="top" align="left">Fujitsu A64FX&#x00040;2.0GHz</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Architecture</td>
<td valign="top" align="left">x86_64</td>
<td valign="top" align="left">ARMv8.2-A SVE</td>
</tr>
<tr>
<td valign="top" align="left">Processor family</td>
<td valign="top" align="left">Sapphire rapids with HBM</td>
<td valign="top" align="left">A64FX</td>
</tr>
<tr>
<td valign="top" align="left">Cores per socket</td>
<td valign="top" align="left">48 (4 NUMA subdomains)</td>
<td valign="top" align="left">48 (4 NUMA subdomains)</td>
</tr>
<tr>
<td valign="top" align="left">Sockets per node</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td valign="top" align="left">Total cores per node</td>
<td valign="top" align="left">96</td>
<td valign="top" align="left">48</td>
</tr>
<tr>
<td valign="top" align="left">Threads per core</td>
<td valign="top" align="left">1 (Hyperthreading disabled)</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td valign="top" align="left">Base/max frequency</td>
<td valign="top" align="left">3.5 GHz (2.1 GHz nominal)</td>
<td valign="top" align="left">2.0 GHz</td>
</tr>
<tr>
<td valign="top" align="left">L2 cache</td>
<td valign="top" align="left">2 MB per core</td>
<td valign="top" align="left">8 MB shared per NUMA node</td>
</tr>
<tr>
<td valign="top" align="left">L3 cache</td>
<td valign="top" align="left">105 MB per socket (shared)</td>
<td valign="top" align="left">N/A</td>
</tr>
<tr>
<td valign="top" align="left">Memory architecture</td>
<td valign="top" align="left">DDR5 &#x0002B; High-bandwidth memory (HBM2e), integrated per socket</td>
<td valign="top" align="left">HBM2, tightly integrated</td>
</tr>
<tr>
<td valign="top" align="left">Vector extensions</td>
<td valign="top" align="left">AVX-512</td>
<td valign="top" align="left">Scalable vector extension (SVE)</td>
</tr>
<tr>
<td valign="top" align="left">Energy measurement Support</td>
<td valign="top" align="left">Fixed performance counters</td>
<td valign="top" align="left">Native energy counters integrated in hardware</td>
</tr>
<tr>
<td valign="top" align="left">HPC optimization</td>
<td valign="top" align="left">Designed for data-intensive and memory-bound workloads</td>
<td valign="top" align="left">HPC-optimized SoC with memory, compute, and I/O on-die</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<label>4.2</label>
<title>NAS parallel benchmarks</title>
<p>To evaluate our proposed framework, we used the NAS Parallel Benchmarks suite (NPB) (<xref ref-type="bibr" rid="B21">NASA Advanced Supercomputing Division, 2024</xref>), a set of popular HPC benchmarks developed by the NASA Advanced Supercomputing Division. The NPB consists of 8 benchmarks, BT, CG, FT, LU, SP, EP, MG, and IS, which cover different scientific areas and are widely recognized in the HPC community for comparing and characterizing the performance of parallel systems. We used NPB version 3.4.3, compiled and run on both Intel x86_64 and Fujitsu A64FX nodes with 96 and 48 cores per node, respectively. These benchmarks were executed with the class D problem size, which is suitable for large-scale parallel systems.</p>
<p>Out of the eight benchmarks, we successfully executed seven on Intel (BT, CG, LU, SP, EP, MG, and IS) and six on A64FX (BT, EP, LU, SP, CG, and IS). The FT (Fast Fourier Transform) and MG (MultiGrid) benchmarks could not be executed on a single A64FX node due to memory size limitations. Both FT and MG are highly memory-intensive. Therefore, to maintain a fair and consistent basis for comparison across architectures, we restricted our evaluation to these six benchmarks that could be executed on both systems under the same class and comparable resource settings.</p>
</sec>
<sec>
<label>4.3</label>
<title>Results and discussions</title>
<p>To evaluate the proposed framework, we performed: (i) an overhead analysis of CNTD_MERIC against COUNTDOWN, (ii) an evaluation of pure power monitoring on the ARM-based A64FX processor, and (iii) a performance and energy efficiency analysis of three representative workloads: NAS BT (<xref ref-type="bibr" rid="B21">NASA Advanced Supercomputing Division, 2024</xref>), ResNet50 (<xref ref-type="bibr" rid="B11">He et al., 2016</xref>), and STREAM (<xref ref-type="bibr" rid="B20">McCalpin, 1991</xref>).</p>
<p>The first set of experiments evaluates the performance overhead introduced by integrating MERIC into COUNTDOWN and compares the original runtime CNTD_X86 with the enhanced CNTD_MERIC_X86 across seven NAS Parallel Benchmarks in both Analysis and Enable modes. Each benchmark was executed six times to ensure statistical reliability, and the resulting averages, variances, and standard deviations were used to derive the error bars shown in <xref ref-type="fig" rid="F2">Figures 2</xref>&#x02013;<xref ref-type="fig" rid="F4">4</xref>. <xref ref-type="fig" rid="F2">Figures 2A</xref>&#x02013;<xref ref-type="fig" rid="F2">C</xref> present the <bold>Analysis Mode</bold> results, where CNTD_MERIC_X86 performs energy-aware profiling without modifying application execution. The execution times in <xref ref-type="fig" rid="F2">Figure 2A</xref> indicate that CNTD_MERIC_X86 closely matches CNTD_X86 across all benchmarks, with very small variations reflected in the low standard deviations. Energy consumption and average power in <xref ref-type="fig" rid="F2">Figures 2B</xref>, <xref ref-type="fig" rid="F2">C</xref> likewise align closely with the original runtime.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Comparison of CNTD_X86 and CNTD_MERIC_X86 across seven NAS benchmarks in Analysis <bold>(A&#x02013;C)</bold> and Enable <bold>(D&#x02013;F)</bold> modes.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0002.tif">
<alt-text content-type="machine-generated">Six grouped bar charts compare CNTD_MERIC_X86 and CNTD_X86 (blue and pink) across six NAS benchmarks for time, energy, and power. Panels A-C show results for the first scenario, D-F for the second. Each panel has error bars and labeled x-axes, with CNTD_MERIC_X86 generally performing slightly better in time and energy, while power measurements are similar between implementations. A legend appears below.</alt-text>
</graphic>
</fig>
<p>To quantify overhead, we computed the percentage error using CNTD_MERIC_X86 as the measured (new) value and CNTD_X86 as the reference (old) value as depicted in Equation 1, and the results are shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Overhead (%)</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Measured</mml:mtext><mml:mo>-</mml:mo><mml:mtext class="textrm" mathvariant="normal">Reference</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Reference</mml:mtext></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mn>100</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>Execution time overhead (<xref ref-type="fig" rid="F3">Figure 3A</xref>) remains consistently below 0.5% across all benchmarks, ranging between &#x02212;0.25% and &#x0002B;0.45%, confirming that MERIC imposes no meaningful execution-time penalty. Energy and power overheads in <xref ref-type="fig" rid="F3">Figures 3B</xref>, <xref ref-type="fig" rid="F3">C</xref> fluctuate slightly above and below zero, with values remaining within &#x02212;5% to &#x0002B;3%, reflecting normal run-to-run variability rather than systematic measurement bias.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Overhead analysis of NAS benchmark results with CNTD_MERIC under Analysis mode <bold>(A&#x02013;C)</bold>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0003.tif">
<alt-text content-type="machine-generated">Bar chart (A) titled &#x0201C;Energy Saving (%)&#x0201D; uses blue bars to compare energy savings across eight configurations, with values ranging approximately from thirteen percent to twenty-two percent and a dashed reference line at seventeen percent. Bar chart (B) titled &#x0201C;Power Saving (%)&#x0201D; uses green bars to depict power savings for the same configurations, displaying a similar pattern and the same reference line at seventeen percent.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F2">Figures 2D</xref>&#x02013;<xref ref-type="fig" rid="F2">F</xref> present the Enable Mode results, where COUNTDOWN applies dynamic frequency scaling during communication phases. <xref ref-type="fig" rid="F2">Figure 2D</xref> shows only marginal increases in execution time relative to Analysis mode, while <xref ref-type="fig" rid="F2">Figures 2E</xref>, <xref ref-type="fig" rid="F2">F</xref> demonstrate substantial reductions in energy and average power. When comparing Analysis and Enable modes, and calculating the percent of change in terms of decrease, CNTD_MERIC_X86 achieves energy and power savings of approximately 13%&#x02013;21% across the NAS benchmarks, confirming its ability to preserve COUNTDOWN&#x00027;s power-optimization capabilities. These reductions are further summarized in <xref ref-type="fig" rid="F4">Figure 4</xref>, which highlights consistent improvements in both energy and power efficiency. Overall, <xref ref-type="fig" rid="F2">Figures 2</xref>&#x02013;<xref ref-type="fig" rid="F4">4</xref> collectively demonstrate that CNTD_MERIC_X86 introduces negligible overhead, maintains highly stable and reproducible performance behavior, and preserves COUNTDOWN&#x00027;s significant energy and power-saving benefits.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Energy and power savings achieved under Enable mode with CNTD_MERIC <bold>(A, B)</bold>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0004.tif">
<alt-text content-type="machine-generated">Bar chart with three panels labeled A, B, and C comparing overhead percentages for different benchmarks. Panel A shows time overhead, panel B presents energy overhead, and panel C illustrates power overhead, each with data for seven benchmarks on the x-axis. Panel A has primarily small positive and negative values, panel B shows mostly negative energy overheads, and panel C displays a mix of positive and negative power overheads. All axes are labeled with percentage units.</alt-text>
</graphic>
</fig>
<p>The second experiment employs the CNTD_MERIC runtime to compare the energy, performance, and power efficiency of the A64FX compute nodes against the Sapphire Rapids ones (based on the Intel Xeon Max 9468 processor). <xref ref-type="fig" rid="F5">Figure 5</xref> illustrates a comparative analysis of the CNTD_MERIC framework across six NAS Parallel Benchmark (NPB) applications. Each benchmark is run six times to calculate the average and standard deviation of execution time, energy consumption, and average power. The results show that the A64FX node achieves higher performance in most cases, with significantly reduced execution times compared to Intel&#x00027;s Sapphire Rapids node, particularly in <bold>LU, SP, CG, and IS</bold>. In terms of energy consumption, A64FX consistently consumes less energy across all workloads. Despite this high performance, the average power consumption of A64FX remains significantly lower (110&#x02013;132 W) than that of Intel (400&#x02013;590 W), resulting in better performance-per-watt for the A64FX processor.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Evaluation and comparison of power monitoring of CNTD_MERIC_X86 and CNTD_MERIC_A64FX <bold>(A&#x02013;C)</bold>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0005.tif">
<alt-text content-type="machine-generated">Bar chart visualization comparing two systems: CNTD_MERIC_X86 (blue) and CNTD_MERIC_A64FX (pink) using three metrics across six benchmarks. Panel A shows time in seconds, where CNTD_MERIC_X86 generally records lower times for nas_bt.36. and nas_ep.16. Panel B shows energy in joules, with CNTD_MERIC_X86 consistently consuming more energy. Panel C displays power in watts, where CNTD_MERIC_X86 uses substantially more power across all benchmarks. Each panel includes error bars and x-axis labels for the benchmarks. Legend below identifies system colors. </alt-text>
</graphic>
</fig>
<p>Since COUNTDOWN provides statistics on MPI communication, we leverage this information in the comparative analysis by collecting detailed statistics on application computation time (APP_time %) and MPI communication time (MPI_time %), and by assessing the computation&#x02013;communication balance on both architectures.</p>
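<p>As a minimal sketch of how such a breakdown can be derived, the Python snippet below aggregates per-rank application and MPI times into APP_time % and MPI_time %. The record layout is an assumption made for illustration; COUNTDOWN&#x00027;s actual report format may differ.</p>
<preformat># Hedged sketch: deriving APP_time % and MPI_time % from per-rank timings.
# The record layout below is an assumption, not COUNTDOWN's actual format.
ranks = [
    {"app_time_s": 92.1, "mpi_time_s": 7.4},  # hypothetical rank 0
    {"app_time_s": 90.8, "mpi_time_s": 8.7},  # hypothetical rank 1
]

app = sum(r["app_time_s"] for r in ranks)
mpi = sum(r["mpi_time_s"] for r in ranks)
total = app + mpi
print(f"APP_time: {100 * app / total:.1f} %, MPI_time: {100 * mpi / total:.1f} %")
</preformat>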
<p><xref ref-type="fig" rid="F6">Figure 6</xref> illustrates the comparative analysis of the percentage of computation and communication for six NAS benchmarks, with separate bars for Intel Xeon Max and Fujitsu A64FX. Each benchmark is run six times to calculate the average for MPI time % and APP time %. Both systems ran the same version of Open MPI (v4.1.4). The results are divided into application (APP) and MPI time percentages. On the Intel Xeon system, benchmarks such as SP, CG, and IS spend more than 90% of their runtime in MPI phases, indicating a communication-dominated execution profile even within a single node. While on the A64FX, more than 90% of the runtime is spent on application computation across all benchmarks, depicting reduced MPI overhead and higher compute efficiency. This shift is primarily due to architectural differences: the A64FX employs tightly integrated HBM2 memory and a wide SVE vector unit, which provides high memory bandwidth and efficient computation. In contrast to Intel Xeon, it combines DDR5 and HBM2e, which introduces a higher communication cost for memory-bound workloads. Using the detailed MPI and APP phase statistics collected by CNTD_MERIC, this analysis breaks down the time distribution between the computation and communication phases, showing how fundamental architectural design choices shape the performance and efficiency of the application.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Comparative analysis of MPI communication overheads using CNTD_MERIC.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0006.tif">
<alt-text content-type="machine-generated">Stacked horizontal bar chart compares x86 and A64FX CPU architectures for six benchmarks, showing application and MPI time percentages. Each benchmark features distinct blue, pink, green, and red bars to represent respective percentages. 

Legend explains color coding: blue is x86 app time, pink is x86 MPI time, green is A64FX app time, and red is A64FX MPI time. X-axis shows time percentages from zero to one hundred.</alt-text>
</graphic>
</fig>
<p>Furthermore, we extended the evaluation of the proposed CNTD_MERIC implementation by testing it on three different application domains: two HPC benchmarks and one deep learning application. <xref ref-type="fig" rid="F7">Figure 7</xref> presents the performance and energy-efficiency analysis of the three workloads, <bold>NAS BT</bold>, <bold>ResNet50</bold>, and <bold>STREAM</bold>, executed on the A64FX processor using the CNTD_MERIC framework. The benchmarks were executed with varying process counts (4, 9, 16, and 25), selected to match the square-number requirement of the BT benchmark for proper grid decomposition. The key metrics recorded were execution time, energy consumption, average power, and memory usage.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>CNTD_MERIC benchmark results on A64FX for NAS BT, ResNet50, and STREAM <bold>(A&#x02013;D)</bold>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0007.tif">
<alt-text content-type="machine-generated">Four-panel figure shows performance metrics for NAS BT, ResNet50, and STREAM as processes increase from four to twenty-five. Panel A, a line chart, plots execution time (seconds) decreasing for all applications with more processes. Panel B, a line chart, shows energy consumption (Joules) also decreasing with more processes. Panel C, a line chart, indicates power consumption (Watts) increases as processes increase. Panel D, a line chart, compares memory usage (GBytes), showing significant growth for ResNet50, steady memory for STREAM, and low memory for NAS BT. Each legend indicates application by color and marker shape.</alt-text>
</graphic>
</fig>
<p>In our experiments, we use the Block Tri-diagonal (BT) Class C benchmark from the NAS Parallel Benchmarks (NPB), which evaluates parallel performance by solving a large 3D fluid dynamics problem. For the deep learning application, we used ResNet50, a convolutional neural network (CNN) pre-trained on the ImageNet dataset with approximately 25.5 million parameters. For inference, we employ 1000 images from the ImageNet dataset. The model is implemented in a distributed manner using PyTorch with an MPI backend, where the data is distributed equally across MPI ranks (e.g., 250 samples per rank for 4 processes). Distributed communication is initialized via torch.distributed to ensure efficient load balancing and minimal redundancy. To evaluate the memory bandwidth of the system, we used the STREAM benchmark, which is widely used to measure sustainable memory bandwidth (in GB/s) and computational throughput for simple vector operations. <xref ref-type="fig" rid="F7">Figure 7A</xref> shows that execution time decreases significantly as the number of processes increases for all three benchmarks. ResNet50 and BT show similar scaling behavior, with runtime dropping from approximately 450 s to under 100 s at 25 processes. STREAM, a simple memory-intensive kernel, shows much shorter execution times across all configurations and gains little beyond 9 processes, since each process has limited computation to perform.</p>
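<p>A minimal Python sketch of the distributed inference setup described above is shown below: the MPI backend is initialized via torch.distributed and the 1000 samples are split evenly across ranks. Dataset loading and the inference loop are elided, and the pretrained-weights identifier assumes a recent torchvision version; this is an illustration, not the exact experiment code.</p>
<preformat># Sketch: distributed ResNet50 inference with an MPI backend (launched
# under mpirun); requires a PyTorch build with MPI support.
import torch
import torch.distributed as dist
from torchvision.models import resnet50

dist.init_process_group(backend="mpi")
rank, world = dist.get_rank(), dist.get_world_size()

model = resnet50(weights="IMAGENET1K_V1").eval()  # ~25.5 M parameters

num_samples = 1000
per_rank = num_samples // world           # 250 samples per rank for 4 ranks
start = rank * per_rank                   # this rank's contiguous shard
indices = range(start, start + per_rank)  # indices into the image list
</preformat>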
<p>In <xref ref-type="fig" rid="F7">Figure 7B</xref>, a consistent drop in energy consumption has been observed as the number of processes increases. BT and ResNet50 start with the highest energy consumption at four processes, followed by a significant decrease as the number of processes increases. STREAM remains in much lower energy usage throughout due to its shorter runtime and lower computational intensity.</p>
<p>The power and memory behavior can be seen in <xref ref-type="fig" rid="F7">Figures 7C</xref>, <xref ref-type="fig" rid="F7">D</xref>. The average power increases with process count: BT shows the highest average power (approximately 123 W at 25 processes), followed by STREAM and ResNet50. These results indicate that although parallelization reduces total runtime and energy, it increases power draw because more cores are active. From a memory-usage perspective, STREAM shows a steady increase in memory usage with process count. For ResNet50, memory usage grows with the number of processes, from 5 GB at 4 processes to approximately 25 GB at 25 processes. NAS BT shows the lowest memory use, remaining below 10 GB even at the maximum process count.</p>
<p>Overall, the BT benchmark shows computational intensity with moderate energy draw, while ResNet-50 stresses both memory and compute units. STREAM serves as a memory-bound baseline. Our tool captured these behaviors and provided a unified energy-performance perspective across workloads.</p>
<p>In <xref ref-type="fig" rid="F8">Figure 8</xref>, we evaluated a deep learning workload, an HPC workload, and Quantum ESPRESSO in a multi node setting using the CNTD_MERIC framework to measure execution time, energy consumption, and average power. For the deep learning, we employed Wide ResNet, a CNN pretrained on the ImageNet dataset. It contains 68.9 million parameters and is therefore 2.7 times larger than the ResNet50 model. In the multinode evaluation, we used a dataset that was four times larger than the one adopted for the ResNet50 inference experiments. We scaled the job across 25, 50, 75, and 100 MPI processes, corresponding to 2, 4, 6, and 8 A64FX nodes. For the NPB BT Class D benchmark, we followed the native NPB parallelization constraints, which require the number of MPI processes to be perfect squares; therefore, the experiments were performed with 25, 49, 81, and 100 processes. Quantum ESPRESSO is an open-source software suite used for electronic-structure calculations and materials modeling. For Quantum ESPRESSO, we scaled the workload across 8, 16, 24, and 32 MPI processes, each using 12 OpenMP threads, corresponding to 2, 4, 6, and 8 nodes, respectively.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>CNTD_MERIC multinode benchmark results on A64FX with wide ResNet, BT Class D, and quantum ESPRESSO.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fhpcp-04-1664774-g0008.tif">
<alt-text content-type="machine-generated">Three line graphs compare NAS BT (blue squares), Wide ResNet (red triangles), and Quantum ESPRESSO (orange circles) by number of nodes. From left to right: Time decreases, energy remains mostly stable, and power increases as nodes increase. A legend identifies the datasets.</alt-text>
</graphic>
</fig>
<p>Across all applications, increasing the number of MPI processes/nodes leads to substantial reductions in execution time. In particular, Wide ResNet improves from 723.6 s on two nodes to 252.3 s on eight nodes, while BT Class D (Quantum ESPRESSO) exhibits a similar trend, decreasing from 1559.6 s (606.4 s) to 491.7 s (220.9 s). As expected, average power consumption increases nearly linearly with node count for all applications, reflecting proportional resource utilization across the cluster.</p>
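<p>The three reported metrics are related by energy = average power &#x000D7; time, which allows a quick consistency check. Using the reported two-node Wide ResNet run (723.6 s, 164 kJ at 25 processes), the sketch below recovers an average draw of roughly 113 W per node, consistent with the 110&#x02013;132 W per-node range reported earlier for the A64FX.</p>
<preformat># Consistency check using values reported in the text: the average power
# implied by energy / time for the two-node Wide ResNet run.
time_s = 723.6       # Wide ResNet execution time on two nodes (s)
energy_j = 164000.0  # Wide ResNet energy at 25 processes / two nodes (J)

avg_power_w = energy_j / time_s
print(f"{avg_power_w:.1f} W total, {avg_power_w / 2:.1f} W per node")
</preformat>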
<p>Energy behavior, however, exhibits a non-monotonic trend for Wide ResNet and BT. For Wide ResNet, energy consumption increases from 164 to 199 kJ (25&#x02013;50 processes), then decreases to its minimum of 172 kJ at 75 processes, before rising again at 100 processes due to increased communication and synchronization costs. For BT, energy rises slightly from 364 to 377 kJ (25&#x02013;49 processes), reaches its lowest value of 345 kJ at 81 processes, and then increases again at 100 processes. Quantum ESPRESSO, by contrast, exhibits a monotonic trend: energy consumption increases from 139.7 to 201.8 kJ as the job scales from 8 to 32 processes.</p></sec></sec>
<sec sec-type="conclusions" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>In this work, we present CNTD_MERIC, a unified, multi-architecture (x86 and ARM) framework for profiling MPI-based HPC applications. CNTD_MERIC extends the capabilities of the original COUNTDOWN by integrating it with the MERIC library through a C-compatible abstraction layer, enabling seamless support for heterogeneous energy-monitoring systems such as Intel&#x00027;s RAPL and the power-monitoring interface of the Fujitsu A64FX. We performed a series of experiments and evaluated the framework on Intel Xeon Max 9468 and Fujitsu A64FX compute nodes provided by the IT4Innovations supercomputing center, using a diverse set of workloads including the NAS Parallel Benchmarks, STREAM, deep learning inference, and Quantum ESPRESSO. Our tool successfully captured the distinct energy-performance profiles of each of them. This ability to provide a unified energy-performance perspective across diverse workloads demonstrates the suitability of CNTD_MERIC for large-scale HPC and ML workloads.</p>
<p>The key finding of our results is that CNTD_MERIC preserves the benefits of both libraries and introduces negligible overhead. Both configuration modes (Analysis and Enable) show nearly identical execution times across all benchmarks, with variations between &#x02212;0.25% and &#x0002B;0.45%. Similarly, energy consumption and average power are closely aligned, confirming an overhead within &#x02212;5% to &#x0002B;3% with respect to the original COUNTDOWN configurations. On Intel nodes, the framework also demonstrated its optimization capability, achieving energy and power savings of approximately 13%&#x02013;21% across the NAS benchmarks and proving that the integrated library retains the full energy-efficiency potential of the original COUNTDOWN implementation. This validates that CNTD_MERIC is also capable of delivering energy-efficient execution while preserving application performance. In addition to the single-node validation, we conducted multi-node experiments on the Fujitsu A64FX system with the Wide ResNet workload, the NAS Parallel Benchmark BT Class D, and Quantum ESPRESSO on up to eight nodes. For Quantum ESPRESSO, the workload scaled from 2 to 8 nodes using a hybrid MPI&#x0002B;OpenMP configuration. In all workloads, we observed substantial reductions in execution time. While Wide ResNet and BT exhibited a non-monotonic energy trend, Quantum ESPRESSO demonstrated a monotonic increase in energy consumption, rising moderately from 139.7 to 201.8 kJ. Across all workloads, average power increased almost linearly with the number of nodes, consistent with proportional resource utilization across the cluster. This highlights CNTD_MERIC&#x00027;s capability to provide stable, accurate runtime, energy, and power monitoring on multi-node ARM-based HPC systems.</p>
<p>We acknowledge that the current validation is limited to CPU-based systems, specifically the Intel x86 and ARM A64FX architectures. While we describe CNTD_MERIC as multi-architecture, its portability and applicability to GPUs or other accelerators remain to be explored. In future work, we plan to extend the CNTD_MERIC framework toward co-tuning strategies. Our goal is to coordinate COUNTDOWN&#x00027;s tuning of MPI communication phases with MERIC&#x00027;s tuning of computation-, memory-, and I/O-bound phases so that both components work together at runtime. We also intend to explore integration with emerging system-wide frameworks such as PowerStack, and to assess the possibility of adding GPU power monitoring and control on systems that support it. These steps will allow us to develop a more adaptive and multi-architecture co-tuning approach.</p>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The datasets presented in this article are available upon reasonable request. Requests should be directed to Kashaf Ad Dooja (<email>kashaf.addooja2&#x00040;unibo.it</email>).</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>KA: Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing, Investigation, Methodology, Software. OY: Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing, Investigation, Software. OV: Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing, Supervision. LR: Writing &#x02013; review &#x00026; editing. DC: Writing &#x02013; review &#x00026; editing, Funding acquisition. AB: Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing, Conceptualization, Funding acquisition, Methodology, Supervision.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alappat</surname> <given-names>C.</given-names></name> <name><surname>Meyer</surname> <given-names>N.</given-names></name> <name><surname>Laukemann</surname> <given-names>J.</given-names></name> <name><surname>Gruber</surname> <given-names>T.</given-names></name> <name><surname>Hager</surname> <given-names>G.</given-names></name> <name><surname>Wellein</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Execution-cache-memory modeling and performance tuning of sparse matrix-vector multiplication and lattice quantum chromodynamics on A64FX</article-title>. <source>Concurr. Comput. Pract. Exp</source>. <volume>34</volume>:<fpage>e6512</fpage>. doi: <pub-id pub-id-type="doi">10.1002/cpe.6512</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cesarini</surname> <given-names>D.</given-names></name> <name><surname>Bartolini</surname> <given-names>A.</given-names></name> <name><surname>Bonf&#x000E0;</surname> <given-names>P.</given-names></name> <name><surname>Cavazzoni</surname> <given-names>C.</given-names></name> <name><surname>Benini</surname> <given-names>L.</given-names></name></person-group> (<year>2021</year>). <article-title>Countdown: a run-time library for performance-neutral energy saving in MPI applications</article-title>. <source>IEEE Trans. Comput</source>. <volume>70</volume>, <fpage>682</fpage>&#x02013;<lpage>695</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TC.2020.2995269</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cesarini</surname> <given-names>D.</given-names></name> <name><surname>Bartolini</surname> <given-names>A.</given-names></name> <name><surname>Borghesi</surname> <given-names>A.</given-names></name> <name><surname>Cavazzoni</surname> <given-names>C.</given-names></name> <name><surname>Luisier</surname> <given-names>M.</given-names></name> <name><surname>Benini</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Countdown slack: a run-time library to reduce energy footprint in large-scale MPI applications</article-title>. <source>IEEE Trans. Parallel Distrib. Syst</source>. <volume>31</volume>, <fpage>2696</fpage>&#x02013;<lpage>2709</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPDS.2020.3000418</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chadha</surname> <given-names>M.</given-names></name> <name><surname>Arima</surname> <given-names>E.</given-names></name> <name><surname>Raoofy</surname> <given-names>A.</given-names></name> <name><surname>Gerndt</surname> <given-names>M.</given-names></name> <name><surname>Schulz</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Sustainability in HPC: vision and opportunities,&#x0201D;</article-title> in <source>Proceedings of the SC &#x00027;23 Workshops of the International Conference on High Performance Computing, Network, Storage, and Analysis</source>, 1876&#x02013;1880. doi: <pub-id pub-id-type="doi">10.1145/3624062.3624271</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Corbalan</surname> <given-names>J.</given-names></name> <name><surname>Alonso</surname> <given-names>L.</given-names></name> <name><surname>Aneas</surname> <given-names>J.</given-names></name> <name><surname>Brochard</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Energy optimization and analysis with ear,&#x0201D;</article-title> in <source>2020 IEEE International Conference on Cluster Computing (CLUSTER)</source> (<publisher-loc>Kobe</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>464</fpage>&#x02013;<lpage>472</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CLUSTER49012.2020.00067</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Corbal&#x000E1;n</surname> <given-names>J.</given-names></name> <name><surname>Brochard</surname> <given-names>L.</given-names></name></person-group> (<year>2018</year>). <source>EAR: Energy Management Framework for Supercomputers</source>. <publisher-loc>Barcelona</publisher-loc>: <publisher-name>Barcelona Supercomputing Center (BSC)</publisher-name>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="web"><collab>Fujitsu Global</collab> (<year>2020</year>). <source>Supercomputer Fugaku CPU a64fx Realizing High Performance, High-Density Packaging, and Low Power Consumption</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.fujitsu.com/global/about/resources/publications/technicalreview/2020-03/article03.html">https://www.fujitsu.com/global/about/resources/publications/technicalreview/2020-03/article03.html</ext-link> (Accessed July 11, 2025).</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ge</surname> <given-names>R.</given-names></name> <name><surname>Feng</surname> <given-names>X.</given-names></name> <name><surname>Feng</surname> <given-names>W.-c.</given-names></name> <name><surname>Cameron</surname> <given-names>K. W.</given-names></name></person-group> (<year>2007</year>). <article-title>&#x0201C;CPU miser: a performance-directed, run-time system for power-aware clusters,&#x0201D;</article-title> in <source>2007 International Conference on Parallel Processing (ICPP 2007)</source> (<publisher-loc>Xi&#x00027;an</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>18</fpage>&#x02013;<lpage>18</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICPP.2007.29</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gholkar</surname> <given-names>N.</given-names></name> <name><surname>Mueller</surname> <given-names>F.</given-names></name> <name><surname>Rountree</surname> <given-names>B.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Uncore power scavenger: a runtime for uncore power conservation on hpc systems,&#x0201D;</article-title> in <source>Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC &#x00027;19</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>). doi: <pub-id pub-id-type="doi">10.1145/3295500.3356150</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hackenberg</surname> <given-names>D.</given-names></name> <name><surname>Ilsche</surname> <given-names>T.</given-names></name> <name><surname>Schuchart</surname> <given-names>J.</given-names></name> <name><surname>Sch&#x000F6;ne</surname> <given-names>R.</given-names></name> <name><surname>Nagel</surname> <given-names>W. E.</given-names></name> <name><surname>Simon</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>&#x0201C;HDEEM: high definition energy efficiency monitoring,&#x0201D;</article-title> in <source>2014 Energy Efficient Supercomputing Workshop</source> (<publisher-loc>New Orleans, LA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>10</lpage>. doi: <pub-id pub-id-type="doi">10.1109/E2SC.2014.13</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Deep residual learning for image recognition,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>770</fpage>&#x02013;<lpage>778</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hsu</surname> <given-names>C.-H.</given-names></name> <name><surname>Feng</surname> <given-names>W.-C.</given-names></name></person-group> (<year>2005</year>). <article-title>&#x0201C;A power-aware run-time system for high-performance computing,&#x0201D;</article-title> in <source>SC &#x00027;05: Proceedings of the 2005 ACM/IEEE Conference on Supercomputing</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1109/SC.2005.3</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="web"><collab>Intel Corporation</collab> (<year>2025</year>). <italic>Intel</italic>&#x024C7; <italic>Xeon</italic>&#x024C7; <italic>CPU Max Series</italic> &#x02013; <italic>AI, Deep Learning, and HPC Processors</italic>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html">https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html</ext-link> (Accessed October 27, 2025).</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Jadhav</surname> <given-names>O.</given-names></name> <name><surname>Krishna</surname> <given-names>S. V.</given-names></name> <name><surname>Agrawal</surname> <given-names>S.</given-names></name> <name><surname>Valmiki</surname> <given-names>M.</given-names></name> <name><surname>Kaushal</surname> <given-names>A.</given-names></name> <name><surname>Dinde</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Micro-benchmarks performance analysis of fujitsu arm a64fx and intel cascadelake processor nodes,&#x0201D;</article-title> in <source>2022 3rd International Conference on Issues and Challenges in Intelligent Computing Techniques (ICICT)</source> (<publisher-loc>Ghaziabad</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICICT55121.2022.10064546</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kodama</surname> <given-names>Y.</given-names></name> <name><surname>Kondo</surname> <given-names>M.</given-names></name> <name><surname>Sato</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Evaluation of spec CPU and spec OMP on the a64fx,&#x0201D;</article-title> in <source>2021 IEEE International Conference on Cluster Computing (CLUSTER)</source> (<publisher-loc>Portland, OR</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>553</fpage>&#x02013;<lpage>561</lpage>. doi: <pub-id pub-id-type="doi">10.1109/Cluster48925.2021.00088</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kodama</surname> <given-names>Y.</given-names></name> <name><surname>Kondo</surname> <given-names>M.</given-names></name> <name><surname>Sato</surname> <given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>Evaluation of performance and power consumption on supercomputer Fugaku using spec HPC benchmarks</article-title>. <source>IEICE Trans. Electron</source>. <volume>E106</volume>, <fpage>303</fpage>&#x02013;<lpage>311</lpage>. doi: <pub-id pub-id-type="doi">10.1587/transele.2022LHP0001</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kodama</surname> <given-names>Y.</given-names></name> <name><surname>Odajima</surname> <given-names>T.</given-names></name> <name><surname>Arima</surname> <given-names>E.</given-names></name> <name><surname>Sato</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Evaluation of power management control on the supercomputer Fugaku,&#x0201D;</article-title> in <source>2020 IEEE International Conference on Cluster Computing (CLUSTER)</source> (<publisher-loc>Kobe</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>484</fpage>&#x02013;<lpage>493</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CLUSTER49012.2020.00069</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>B.</given-names></name> <name><surname>Basu Roy</surname> <given-names>R.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Samsi</surname> <given-names>S.</given-names></name> <name><surname>Gadepally</surname> <given-names>V.</given-names></name> <name><surname>Tiwari</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Toward sustainable HPC: carbon footprint estimation and environmental implications of HPC systems,&#x0201D;</article-title> in <source>Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC &#x00027;23</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>). doi: <pub-id pub-id-type="doi">10.1145/3581784.3607035</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Madella</surname> <given-names>G.</given-names></name> <name><surname>Tesser</surname> <given-names>F.</given-names></name> <name><surname>Alonso</surname> <given-names>L.</given-names></name> <name><surname>Corbalan</surname> <given-names>J.</given-names></name> <name><surname>Cesarini</surname> <given-names>D.</given-names></name> <name><surname>Bartolini</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>The regale library: a DDS interoperability layer for the HPC powerstack</article-title>. <source>J. Low Power Electron. Appl</source>. <volume>15</volume>:<fpage>10</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jlpea15010010</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>McCalpin</surname> <given-names>J. D.</given-names></name></person-group> (<year>1991</year>). <source>STREAM: Sustainable Memory Bandwidth in High Performance Computers</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.cs.virginia.edu/stream/">https://www.cs.virginia.edu/stream/</ext-link> (Accessed July 08, 2025).</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="web"><collab>NASA Advanced Supercomputing Division</collab> (<year>2024</year>). <source>NAS Parallel Benchmarks (NPB)</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.nas.nasa.gov/software/npb.html">https://www.nas.nasa.gov/software/npb.html</ext-link> (Accessed June 21, 2024).</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rountree</surname> <given-names>B.</given-names></name> <name><surname>Lownenthal</surname> <given-names>D. K.</given-names></name> <name><surname>de Supinski</surname> <given-names>B. R.</given-names></name> <name><surname>Schulz</surname> <given-names>M.</given-names></name> <name><surname>Freeh</surname> <given-names>V. W.</given-names></name> Bletsch <etal/></person-group>. (<year>2009</year>). <article-title>&#x0201C;Adagio: making DVS practical for complex HPC applications,&#x0201D;</article-title> in <source>Proceedings of the 23rd International Conference on Supercomputing, ICS &#x00027;09</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>460</fpage>&#x02013;<lpage>469</lpage>. doi: <pub-id pub-id-type="doi">10.1145/1542275.1542340</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schuchart</surname> <given-names>J.</given-names></name> <name><surname>Gerndt</surname> <given-names>M.</given-names></name> <name><surname>Kjeldsberg</surname> <given-names>P. G.</given-names></name> <name><surname>Lysaght</surname> <given-names>M.</given-names></name> <name><surname>Horak</surname> <given-names>D.</given-names></name> <name><surname>Riha</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>The readex formalism for automatic tuning for energy efficiency</article-title>. <source>Computing</source> <volume>99</volume>, <fpage>727</fpage>&#x02013;<lpage>745</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00607-016-0532-7</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sreepathi</surname> <given-names>S.</given-names></name> <name><surname>Taylor</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Early evaluation of fugaku a64fx architecture using climate workloads,&#x0201D;</article-title> in <source>2021 IEEE International Conference on Cluster Computing (CLUSTER)</source> (<publisher-loc>Portland, OR</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>719</fpage>&#x02013;<lpage>727</lpage>. doi: <pub-id pub-id-type="doi">10.1109/Cluster48925.2021.00107</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="web"><collab>Tiffany Trader</collab> (<year>2023</year>). <source>Intel Officially Launches Sapphire Rapids and HPC-Optimized Max Series</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.hpcwire.com/2023/01/10/intel-officially-launches-sapphire-rapids-and-max-series/">https://www.hpcwire.com/2023/01/10/intel-officially-launches-sapphire-rapids-and-max-series/</ext-link> (Accessed July 11, 2025).</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Venkatesh</surname> <given-names>A.</given-names></name> <name><surname>Vishnu</surname> <given-names>A.</given-names></name> <name><surname>Hamidouche</surname> <given-names>K.</given-names></name> <name><surname>Tallent</surname> <given-names>N.</given-names></name> <name><surname>Panda</surname> <given-names>D. D.</given-names></name> <name><surname>Kerbyson</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>&#x0201C;A case for application-oblivious energy-efficient MPI runtime,&#x0201D;</article-title> in <source>Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC &#x00027;15</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>). doi: <pub-id pub-id-type="doi">10.1145/2807591.2807658</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vysocky</surname> <given-names>O.</given-names></name> <name><surname>Beseda</surname> <given-names>M.</given-names></name> <name><surname>Riha</surname> <given-names>L.</given-names></name> <name><surname>Zapletal</surname> <given-names>J.</given-names></name> <name><surname>Lysaght</surname> <given-names>M.</given-names></name> <name><surname>Kannan</surname> <given-names>V.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;Meric and radar generator: Tools for energy evaluation and runtime tuning of hpc applications,&#x0201D;</article-title> in <source>High Performance Computing in Science and Engineering</source>, eds. T. Kozubek, M. Cermak, P. Tichy, R. Blaheta, J. Sistek, et al. (Cham: Springer International Publishing), <fpage>144</fpage>&#x02013;<lpage>159</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-97136-0_11</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Walker</surname> <given-names>D. W.</given-names></name></person-group> (<year>1994</year>). <article-title>The design of a standard message passing interface for distributed memory concurrent computers</article-title>. <source>Parallel Comput</source>. <volume>20</volume>, <fpage>657</fpage>&#x02013;<lpage>673</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0167-8191(94)90033-7</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Yoshida</surname> <given-names>Y.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Fujitsu high performance CPU for the post-k computer,&#x0201D;</article-title> in <source>2018 IEEE Hot Chips 30 Symposium (HCS)</source> (<publisher-loc>Cupertino, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2</fpage>.13. Presentation slides.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Yue</surname> <given-names>A.</given-names></name> <name><surname>Yew</surname> <given-names>P.-C.</given-names></name> <name><surname>Mehta</surname> <given-names>S.</given-names></name></person-group> (<year>2025</year>). <article-title>&#x0201C;EVeREST-C: an effective and versatile runtime energy saving tool for CPUS,&#x0201D;</article-title> in <source>Proceedings of the 2025 International Conference on Supercomputing, ICS &#x00027;25</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>14</lpage>.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0003">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2203606/overview">Terry Jones</ext-link>, Oak Ridge National Laboratory (DOE), United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0004">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2279774/overview">Yoonho Park</ext-link>, IBM Research, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3144269/overview">Timothy Newman</ext-link>, The University of Alabama in Huntscille, United States</p>
</fn>
</fn-group>
<fn-group>
<fn id="fn0001"><label>1</label><p><italic>Green500 List</italic>- <italic>November 2019</italic>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://top500.org/lists/green500/2019/11/">https://top500.org/lists/green500/2019/11/</ext-link> (Accessed June 23, 2025).</p></fn>
<fn id="fn0002"><label>2</label><p><italic>Top500</italic>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://top500.org/">https://top500.org/</ext-link> (Accessed June 16, 2025).</p></fn>
</fn-group>
</back>
</article> 