<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="review-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Computer Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-9898</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcomp.2026.1783945</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Mini Review</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>From distributed tracing to proactive SLO management: a mini-review of trace-driven performance prediction for cloud-native microservices</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Yu</surname> <given-names>Miaopeng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3343524"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Liu</surname> <given-names>Haonan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Du</surname> <given-names>Jinran</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Lin</surname> <given-names>Kequan</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Dai</surname> <given-names>Tao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Fu</surname> <given-names>Yanzhe</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yang</surname> <given-names>Chunyan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Electric Power Research Institute, CSG</institution>, <city>Guangzhou</city>, <country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>Guangdong Provincial Key Laboratory of Power System Network Security</institution>, <city>Guangzhou</city>, <country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>China Southern Power Grid</institution>, <city>Guangzhou</city>, <country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Chunyan Yang, <email xlink:href="mailto:yangcy2@csg.cn">yangcy2@csg.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-18">
<day>18</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>8</volume>
<elocation-id>1783945</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>03</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Yu, Liu, Du, Lin, Dai, Fu and Yang.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Yu, Liu, Du, Lin, Dai, Fu and Yang</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Cloud-native microservices improve development velocity and elasticity, but they also create complex and dynamic service dependencies. Resource contention, queue buildup, and downstream slowdowns can propagate through call chains, amplifying end-to-end tail latency (e.g., p95/p99) and increasing Service Level Objective (SLO) violation risks. While many studies focus on <italic>post-hoc</italic> anomaly detection and root-cause analysis, industrial operations increasingly demand proactive capabilities, like predicting performance risks before a request finishes, issuing early warnings from partial trace prefixes, and producing actionable signals for mitigation. This mini-review synthesizes recent progress on trace-driven proactive SLO management. We summarize problem formulations and evaluation protocols for SLO violation and tail-quantile prediction, prefix early warning under precision constraints, and actionable intermediate outputs such as bottleneck candidate ranking and what-if estimation. We then survey modeling approaches spanning feature-based baselines, sequence models, graph neural networks, sequence-graph fusion, and multimodal/causal extensions, highlighting practical issues such as class imbalance, sampling-induced missing spans, and topology drift. Finally, we survey commonly used public benchmarks and traces, and discuss open challenges toward deployable, trustworthy proactive SLO management.</p></abstract>
<kwd-group>
<kwd>causal inference</kwd>
<kwd>distributed tracing</kwd>
<kwd>graph neural networks</kwd>
<kwd>microservices</kwd>
<kwd>multimodal learning</kwd>
<kwd>prefix-based early warning</kwd>
<kwd>SLO violation prediction</kwd>
<kwd>tail latency prediction</kwd>
</kwd-group>
<funding-group>
  <funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="0"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="58"/>
<page-count count="7"/>
<word-count count="5857"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Networks and Communications</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Modern cloud services increasingly adopt microservice architectures to enable independent development, deployment, and elastic scaling (<xref ref-type="bibr" rid="B10">Dragoni et al., 2017</xref>; <xref ref-type="bibr" rid="B34">Newman, 2021</xref>). In mission-critical domains such as power grid dispatching, cloud-native transformations have been actively explored to improve system scalability and operational efficiency (<xref ref-type="bibr" rid="B25">Liang et al., 2016</xref>; <xref ref-type="bibr" rid="B50">Wen et al., 2016</xref>). A single user request typically traverses multiple services and middleware components, forming a dynamic call graph (<xref ref-type="bibr" rid="B13">Gan et al., 2019</xref>). When a service experiences resource shortages, queue buildup, lock contention, garbage collection pauses, or downstream degradation, latency and errors can propagate along dependencies, manifesting as tail-latency spikes and SLO violations (<xref ref-type="bibr" rid="B51">Wu et al., 2020</xref>; <xref ref-type="bibr" rid="B53">Yu et al., 2021</xref>). This is especially relevant to latency-sensitive real-time interactive services such as cloud-hosted mobile online games (<xref ref-type="bibr" rid="B29">Meil&#x000E4;nder et al., 2014</xref>). Traditional alerting based on single-metric thresholds often lags behind these cascading behaviors (<xref ref-type="bibr" rid="B35">Notaro et al., 2021</xref>).</p>
<p>Distributed tracing provides request-centric observability by recording traces and spans that capture the execution path and timing across services (<xref ref-type="bibr" rid="B43">Sigelman et al., 2010</xref>). Tracing has been widely used for diagnosis and root-cause analysis (<xref ref-type="bibr" rid="B51">Wu et al., 2020</xref>; <xref ref-type="bibr" rid="B53">Yu et al., 2021</xref>), but it can also support proactive operations (<xref ref-type="bibr" rid="B15">Grohmann et al., 2021</xref>). In proactive SLO management, the goal is to forecast risk before user impact becomes visible, ideally while a request is still in-flight (trace prefix). Such forecasts are most valuable when they arrive early enough to enable mitigation and are accompanied by actionable signals, such as likely bottleneck services or predicted benefits of interventions. Beyond compute-side mitigation, network-side control (e.g., SDN-enabled QoS enforcement/traffic steering) provides additional actionable knobs that can complement early-warning predictors (<xref ref-type="bibr" rid="B14">Gorlatch et al., 2014</xref>).</p>
<p>Service level objectives (SLOs) are central to site reliability engineering (SRE) as explicit targets for availability and latency (<xref ref-type="bibr" rid="B2">Beyer et al., 2016</xref>). The operational motivation for proactive prediction is closely related to tail behavior: when requests fan out across many microservices, the slowest component often dominates the end-to-end latency distribution (&#x0201C;tail at scale&#x0201D;) (<xref ref-type="bibr" rid="B9">Dean and Barroso, 2013</xref>). Although microservices improve modularity and independent deployment, they also multiply dependencies and complicate performance debugging and capacity planning (<xref ref-type="bibr" rid="B10">Dragoni et al., 2017</xref>).</p>
<p>The remainder of this mini-review is organized as follows. Section 2 introduces tracing primitives, prefix definitions, and multimodal observability. Section 3 formalizes prediction tasks and evaluation protocols. Section 4 surveys modeling approaches from feature-based baselines to graph neural networks and causal extensions. Section 5 surveys public benchmarks and datasets. Section 6 discusses open challenges toward deployable proactive SLO management.</p></sec>
<sec id="s2">
<label>2</label>
<title>Tracing signals, prefixes, and multimodal observability</title>
<p>A trace represents a single end-to-end request, while a span represents a timed operation within a service or component. Spans are linked by parent-child or caller-callee relations (<xref ref-type="bibr" rid="B43">Sigelman et al., 2010</xref>). Traces can be projected into call graphs whose nodes and edges carry attributes such as service identity, latency, retries, and error codes.</p>
<p>Proactive settings introduce an online constraint: at inference time, only a partial trace prefix is observable. This distinguishes proactive SLO management from <italic>post-hoc</italic> diagnosis, where complete traces are available. Prefixes may be defined by elapsed wall-clock time, the number of observed spans, or the depth of topological expansion from the root span. Concurrent branches often complicate the notion of &#x0201C;progress,&#x0201D; since multiple spans may execute in parallel without a natural ordering. Modeling, therefore, requires explicit decisions on prefix ordering, time encoding, and how to represent partially observed subtrees.</p>
<p>Traces alone may miss important context. Metrics often reflect resource and saturation signals (CPU throttling, garbage collection, network congestion), and logs contain semantic clues (exceptions, error messages). Recent work increasingly treats proactive SLO management as a multimodal problem, aligning trace, metric, and log signals at request or service granularity (<xref ref-type="bibr" rid="B56">Zhao et al., 2023</xref>). In practice, data quality issues are common: head-based or tail-based sampling can drop intermediate spans, context propagation bugs can break trace continuity, and duplicated caller/callee instrumentation can bias edge statistics. Robust preprocessing and reporting of data cleaning choices are crucial for reproducibility (<xref ref-type="bibr" rid="B22">Huye et al., 2024</xref>).</p>
<p>Most modern tracing deployments rely on standardized instrumentation and context propagation. OpenTelemetry provides cross-language APIs/SDKs and a vendor-neutral specification that has become a de facto industry standard (<xref ref-type="bibr" rid="B36">OpenTelemetry, 2025</xref>). In production AIOps pipelines, traces are often fused with logs and metrics; classic log-analysis studies highlight both the value and the pitfalls of relying on noisy operational data (<xref ref-type="bibr" rid="B19">He et al., 2016</xref>; <xref ref-type="bibr" rid="B11">Du et al., 2017</xref>).</p></sec>
<sec id="s3">
<label>3</label>
<title>Problem formulations and evaluation</title>
<p>Proactive SLO management can be viewed as a family of prediction and decision-support tasks. Common formulations include: (i) SLO violation prediction (binary classification), (ii) tail latency/quantile prediction (regression), (iii) prefix early warning (when to alert), and (iv) actionable intermediate outputs such as bottleneck ranking and what-if estimation. <xref ref-type="table" rid="T1">Table 1</xref> summarizes these tasks along with recommended evaluation metrics and representative systems.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Typical proactive SLO management tasks and evaluation targets.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Task</bold></th>
<th valign="top" align="left"><bold>Input &#x02192; Output</bold></th>
<th valign="top" align="left"><bold>Recommended metrics</bold></th>
<th valign="top" align="left"><bold>Representative systems</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">SLO violation prediction</td>
<td valign="top" align="left">Trace/metric history (window) &#x02192; P(violation)</td>
<td valign="top" align="left">PR-AUC; calibration error</td>
<td valign="top" align="left">SuanMing (<xref ref-type="bibr" rid="B15">Grohmann et al., 2021</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">Tail/quantile latency prediction</td>
<td valign="top" align="left">Prefix (&#x0002B;config) &#x02192; latency / tail quantile (e.g., p95)</td>
<td valign="top" align="left">pinball loss; relative error</td>
<td valign="top" align="left">PERT-GNN (<xref ref-type="bibr" rid="B44">Tam et al., 2023</xref>); <break/> FastPERT (<xref ref-type="bibr" rid="B45">Tam et al., 2025</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">Prefix early warning</td>
<td valign="top" align="left">Prefix stream &#x02192; alert time</td>
<td valign="top" align="left">EAR&#x00040;Precision&#x02265;<italic>P</italic><sub>0</sub>; <break/> Coverage; FAR</td>
<td valign="top" align="left">(limited dedicated work)</td>
</tr>
<tr>
<td valign="top" align="left">Actionable outputs</td>
<td valign="top" align="left">Trace/call graph &#x02192; Top-k candidates / what-if</td>
<td valign="top" align="left">Top-k hit; NDCG; intervention error</td>
<td valign="top" align="left">MicroRCA (<xref ref-type="bibr" rid="B51">Wu et al., 2020</xref>); <break/> Sage (<xref ref-type="bibr" rid="B12">Gan et al., 2021</xref>)</td>
</tr></tbody>
</table>
</table-wrap>
<p><bold>SLO violation and latency prediction:</bold> SLO violation labels are often unavailable in public traces. Therefore, many studies define proxy violation labels using latency-threshold rules (e.g., based on p95/p99) computed from historical data. This proxy-vs-real-SLO gap should be made explicit because real SLOs may depend on API endpoint, customer tier, time window, and composite indicators. For binary classification, PR-AUC is preferred over ROC-AUC due to class imbalance. Calibration error (<xref ref-type="bibr" rid="B16">Guo et al., 2017</xref>) should also be reported since early warning systems require well-calibrated probabilities. For quantile/latency regression, pinball loss directly measures quantile accuracy, while relative error metrics capture prediction quality across different latency scales.</p>
<p><bold>Prefix early warning:</bold> A key operational requirement for early warning is to control false alarms while maximizing detection earliness. One practical evaluation protocol adapted from early time-series classification literature (<xref ref-type="bibr" rid="B52">Xing et al., 2012</xref>; <xref ref-type="bibr" rid="B42">Sch&#x000E4;fer and Leser, 2020</xref>; <xref ref-type="bibr" rid="B3">Bilski and Jastrz&#x00119;bska, 2023</xref>) is to select an alert threshold on a validation set such that Precision&#x02265;<italic>P</italic><sub>0</sub> (e.g., 0.90 or 0.95), then report metrics on a held-out test set. These include: (a) Earliest Alarm Ratio (EAR), the average minimal prefix fraction at which a true-violation request triggers an alert; (b) Coverage, the fraction of violating requests that are successfully alerted; and (c) False Alarm Rate (FAR), the fraction of non-violating requests that trigger an alert. This protocol is consistent with cost-sensitive thresholding and captures the &#x0201C;earliness-accuracy&#x0201D; trade-off central to proactive prediction. While SLO violation prediction and bottleneck ranking have received substantial attention, work on prefix-based early warning with explicit earliness evaluation remains relatively limited in the microservices literature.</p>
<p><bold>Bottleneck localization and root-cause analysis:</bold> Beyond forecasting SLO violation risk, trace data are widely used to localize bottlenecks and root causes. Systems such as MicroRCA (<xref ref-type="bibr" rid="B51">Wu et al., 2020</xref>) and MicroRank (<xref ref-type="bibr" rid="B53">Yu et al., 2021</xref>) analyze propagation patterns across microservices to rank likely culprits, while critical-path analysis frameworks like CRISP (<xref ref-type="bibr" rid="B55">Zhang et al., 2022</xref>) summarize large volumes of traces into actionable performance explanations. These diagnosis-oriented tasks are complementary to early warning, since a practical proactive controller often needs both &#x0201C;will we violate&#x0201D; and &#x0201C;where should we intervene&#x0201D; signals.</p>
<p><bold>What-if estimation:</bold> What-if estimation extends bottleneck ranking by quantifying the expected improvement from specific interventions. Given a predicted SLO violation, what-if models answer questions such as: &#x0201C;If we scale Service A from 2 to 4 replicas, by how much will end-to-end latency decrease?&#x0201D; or &#x0201C;If we reroute 20% of traffic away from Region B, what is the probability of meeting the SLO?&#x0201D; Sage (<xref ref-type="bibr" rid="B12">Gan et al., 2021</xref>) solves this through counterfactual generation: the system generates hypothetical scenarios where a candidate service&#x00027;s resource utilization is set to &#x0201C;healthy&#x0201D; values, then predicts the resulting end-to-end latency using a learned generative model. If the counterfactual latency meets the SLO, the intervention is deemed effective. Evaluation of what-if estimation typically uses metrics such as intervention error (the difference between predicted and actual post-intervention latency) or decision accuracy (whether the recommended intervention actually resolves the SLO violation when applied). Because ground-truth intervention outcomes are expensive to obtain, requiring either production experiments or high-fidelity simulation, evaluation datasets for what-if estimation remain scarce, and most published results rely on controlled testbeds or synthetic fault injection.</p></sec>
<sec id="s4">
<label>4</label>
<title>Modeling approaches: from baselines to structured and multimodal predictors</title>
<p><bold>Feature-based baselines:</bold> Gradient-boosted trees such as XGBoost, LightGBM, and CatBoost remain strong in practice because they are easy to train, efficient at inference, and amenable to probability calibration (<xref ref-type="bibr" rid="B6">Chen and Guestrin, 2016</xref>; <xref ref-type="bibr" rid="B23">Ke et al., 2017</xref>; <xref ref-type="bibr" rid="B39">Prokhorenkova et al., 2018</xref>). However, handcrafted aggregates may miss fine-grained dependency structure and temporal evolution.</p>
<p><bold>Sequence models:</bold> Sequence models treat spans as event tokens and are naturally aligned with prefix prediction. Recurrent units such as LSTM and GRU provide efficient prefix encoders (<xref ref-type="bibr" rid="B21">Hochreiter and Schmidhuber, 1997</xref>; <xref ref-type="bibr" rid="B7">Cho et al., 2014</xref>), while Transformer-style attention offers flexible modeling of long-range dependencies and irregular events (<xref ref-type="bibr" rid="B47">Vaswani et al., 2017</xref>). Temporal convolutional networks provide a convolutional alternative for generic sequence modeling (<xref ref-type="bibr" rid="B1">Bai et al., 2018</xref>), while long-horizon Transformer variants target long sequence time-series forecasting (<xref ref-type="bibr" rid="B57">Zhou et al., 2021a</xref>). However, all sequence approaches must address long traces, concurrent branches that violate strict sequential assumptions, and appropriate position/time encoding.</p>
<p><bold>Graph neural networks:</bold> Graph neural networks explicitly model service dependencies and can support explanation via node/edge importance. Building on general GNN primitives such as graph convolutions, neighborhood aggregation, and graph attention (<xref ref-type="bibr" rid="B24">Kipf and Welling, 2017</xref>; <xref ref-type="bibr" rid="B18">Hamilton et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Veli&#x0010D;kovi&#x00107; et al., 2017</xref>), they are well-suited for bottleneck ranking and resource decision support, including proactive auto-scaling (<xref ref-type="bibr" rid="B37">Park et al., 2021</xref>, <xref ref-type="bibr" rid="B38">2024</xref>; <xref ref-type="bibr" rid="B31">Meng et al., 2023</xref>). Challenges include over-smoothing on deep call chains, missing edges due to sampling, and topology drift caused by deployments and elastic scaling.</p>
<p><bold>Hybrid sequence-graph models:</bold> Hybrid models aim to combine temporal evolution and topology. A prominent design pattern is the dual-encoder architecture: one encoder (typically GNN-based) captures call-graph structure, while another (LSTM, GRU, or Transformer) models temporal dynamics. The two representations are then fused via gating, attention, or concatenation. TraceGra exemplifies this approach by using a GNN to extract topological features from a unified trace-and-metric graph representation and an LSTM to capture temporal patterns, combining them through an encoder&#x02013;decoder framework for anomaly detection (<xref ref-type="bibr" rid="B5">Chen et al., 2022</xref>). TopoMAD modifies LSTM cells with additional GNN gates to jointly model spatial dependencies among microservices and temporal evolution of metrics (<xref ref-type="bibr" rid="B20">He et al., 2023</xref>). More recently, USRFNet introduced a dual-stream architecture that explicitly separates traffic-side features (modeled by GNN) from resource-side features (modeled by gated MLP), then fuses them via cross-diffusion attention and low-rank fusion to predict window-level p95 latency (<xref ref-type="bibr" rid="B40">Qian et al., 2025</xref>). These hybrid designs address the limitations of single-representation models: pure sequence models ignore graph structure, while pure GNN models may homogenize heterogeneous feature types through uniform message passing.</p>
<p><bold>Structural latency predictors and multimodal extensions:</bold> Recent latency predictors introduce structural inductive bias over PERT-like decompositions to improve both accuracy and interpretability (<xref ref-type="bibr" rid="B44">Tam et al., 2023</xref>, <xref ref-type="bibr" rid="B45">2025</xref>). Multimodal extensions further align trace graphs with metrics and logs (<xref ref-type="bibr" rid="B56">Zhao et al., 2023</xref>), improving robustness when any single modality is incomplete.</p>
<p><bold>Causal and counterfactual approaches:</bold> Causal methods are increasingly explored for actionable prediction. Rather than only estimating SLO violation risk, the goal is to estimate the effect of hypothetical interventions, such as scaling a specific service or rerouting traffic, on end-to-end latency outcomes. Sage introduces a causal Bayesian network combined with a graphical variational auto-encoder to generate counterfactual latency scenarios (<xref ref-type="bibr" rid="B12">Gan et al., 2021</xref>). Given an observed SLO violation, Sage hypothetically &#x0201C;fixes&#x0201D; the utilization of candidate services to normal values and predicts whether the SLO would be met (restored), thereby identifying root causes through counterfactual reasoning rather than correlation. <xref ref-type="bibr" rid="B26">Lohse et al. (2025)</xref> proposed a causal discovery framework that reconstructs the latency DAG of microservice architectures using domain knowledge and causal discovery algorithms, enabling identification of actionable intervention targets that causally influence high latency. However, causal estimation faces significant challenges: shared resources (CPU, network bandwidth, memory) introduce confounding, opportunities for randomized experiments are limited in production, and observational data alone cannot distinguish correlation from causation without strong structural assumptions. Current practical systems often combine structured causal models with small-scale online validation (A/B tests, canary deployments) to verify predicted intervention effects before broad rollout.</p></sec>
<sec id="s5">
<label>5</label>
<title>Public datasets and benchmarks</title>
<p>Reproducible evaluation requires both controllable benchmarks and representative traces. DeathStarBench provides open-source microservice applications with configurable workloads and fault injection via external tools, enabling controlled studies of tail latency and cascading slowdowns (<xref ref-type="bibr" rid="B13">Gan et al., 2019</xref>). Other benchmarks, such as Train-Ticket (<xref ref-type="bibr" rid="B58">Zhou et al., 2021b</xref>) and SockShop (<xref ref-type="bibr" rid="B49">Weaveworks, 2023</xref>), are also used in the literature, though DeathStarBench remains the most widely adopted for SLO-focused studies. Large-scale public traces, such as Alibaba microservice call graphs, enable learning from realistic production dynamics (<xref ref-type="bibr" rid="B27">Luo et al., 2021</xref>, <xref ref-type="bibr" rid="B28">2022</xref>), but often require significant preprocessing, including deduplication, missing-span handling, and topology repair. <xref ref-type="bibr" rid="B22">Huye et al. (2024)</xref> systematically document topological inconsistencies in the Alibaba traces and introduce Casper, a method that recovers significantly more valid traces by leveraging dataset redundancies. Because SLO violations are rare events, most datasets exhibit extreme class imbalance, and reported results can be sensitive to sampling strategies, time splits, and proxy label definitions. Reporting dataset statistics such as trace volume, span depth distribution, violation rate, and sampling rate, and releasing the preprocessing code, are critical for meaningful comparisons.</p>
<p>Beyond class imbalance, two data-quality issues recur across public traces. First, sampling-induced missing spans: production tracing systems typically employ head-based or tail-based sampling to reduce overhead, which can drop intermediate spans and break parent-child relationships. Models trained on complete traces may not generalize to sampled data; evaluation protocols should therefore report sampling rates and assess robustness to incomplete call graphs. Second, topology drift: microservice deployments evolve through canary releases, auto-scaling, and service mesh reconfigurations, so traces collected months apart may reflect different call-graph structures. Models assuming static topology risk silent degradation; benchmarks ideally should include temporal metadata to enable drift-aware evaluation.</p>
<p>Outside microservice-specific benchmarks, large-scale cluster traces are often used to evaluate workload prediction and resource management components underpinning SLO control. Google has released Borg cluster-usage traces and corresponding analyses (<xref ref-type="bibr" rid="B41">Reiss et al., 2012</xref>; <xref ref-type="bibr" rid="B46">Tirmazi et al., 2020</xref>). Microsoft has released Azure workload traces and telemetry through the AzurePublicDataset and Resource Central projects (<xref ref-type="bibr" rid="B8">Cortez et al., 2017</xref>). These datasets complement microservice benchmarks by providing realistic resource-demand dynamics, although they generally lack request-level call graphs. A summary is shown in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Commonly used microservice benchmarks and traces for trace-driven performance modeling.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset/benchmark</bold></th>
<th valign="top" align="left"><bold>Type</bold></th>
<th valign="top" align="left"><bold>Scale (approx.)</bold></th>
<th valign="top" align="left"><bold>Key features for proactive SLO modeling</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">DeathStarBench (<xref ref-type="bibr" rid="B13">Gan et al., 2019</xref>)</td>
<td valign="top" align="left">Benchmark apps</td>
<td valign="top" align="left">5 apps, configurable</td>
<td valign="top" align="left">Tail-latency experiments; <break/> fault injection via external tools</td>
</tr>
<tr>
<td valign="top" align="left">Train-Ticket (<xref ref-type="bibr" rid="B58">Zhou et al., 2021b</xref>)</td>
<td valign="top" align="left">Benchmark app</td>
<td valign="top" align="left">41 services (current)</td>
<td valign="top" align="left">Realistic ticket-booking workflow; <break/> fault injection supported</td>
</tr>
<tr>
<td valign="top" align="left">Alibaba Cluster Trace (v2021/v2022) (<xref ref-type="bibr" rid="B27">Luo et al., 2021</xref>, <xref ref-type="bibr" rid="B28">2022</xref>)</td>
<td valign="top" align="left">Production traces</td>
<td valign="top" align="left">&#x0007E;20M traces (0.5% sample), <break/> 20K&#x0002B; microservices</td>
<td valign="top" align="left">Public call-graph traces; <break/> requires preprocessing; <break/> corrected version available (<xref ref-type="bibr" rid="B22">Huye et al., 2024</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">Google Borg Traces (<xref ref-type="bibr" rid="B41">Reiss et al., 2012</xref>; <xref ref-type="bibr" rid="B46">Tirmazi et al., 2020</xref>)</td>
<td valign="top" align="left">Cluster workload</td>
<td valign="top" align="left">&#x0007E;12K machines, 29 days</td>
<td valign="top" align="left">Task/resource usage and scheduling; <break/> useful for capacity planning; <break/> no request-level call graphs</td>
</tr>
<tr>
<td valign="top" align="left">AzurePublicDataset (V1/V2) (<xref ref-type="bibr" rid="B8">Cortez et al., 2017</xref>; <xref ref-type="bibr" rid="B32">Microsoft, 2019a</xref>, <xref ref-type="bibr" rid="B33">b</xref>)</td>
<td valign="top" align="left">Cloud workload</td>
<td valign="top" align="left">VM-level, 30 days</td>
<td valign="top" align="left">Azure VM traces for resource demand analysis; <break/> complements microservice benchmarks</td>
</tr>
<tr>
<td valign="top" align="left">Resource Central (<xref ref-type="bibr" rid="B8">Cortez et al., 2017</xref>)</td>
<td valign="top" align="left">Cloud telemetry</td>
<td valign="top" align="left">VM-level</td>
<td valign="top" align="left">Azure VM telemetry with forecasting tasks; <break/> includes released dataset</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec sec-type="discussion" id="s6">
<label>6</label>
<title>Discussion</title>
<p>Several gaps remain between research prototypes and deployable proactive SLO management. We organize the discussion around three themes: (A) evaluation gaps and label semantics, (B&#x02013;C) operational integration and scalability/efficiency, and (D) the path from correlation to action. Then, (E) concludes with an outlook.</p>
<sec>
<label>6.1</label>
<title>Evaluation gaps and label semantics</title>
<p>A recurring issue is the disconnect between proxy labels and real SLO semantics. Most studies define violation labels using latency quantiles computed from training data, but production SLOs are often more nuanced: they may vary by API endpoint, customer tier, or time window, and may incorporate composite indicators beyond latency. This proxy-vs-real gap makes it unclear how offline accuracy translates to operational value. Future work should explore evaluation protocols that incorporate SLO heterogeneity. Calibration and uncertainty quantification represent another under-addressed dimension. Early warning systems must balance false alarms against missed violations, yet most published models report only discrimination metrics without assessing probability calibration. Techniques such as Platt scaling or conformal prediction could improve reliability, but their application to trace-based predictors remains limited. Moreover, prefix-based early warning, where predictions must be issued from incomplete traces, introduces additional evaluation complexity. The trade-off between earliness and accuracy lacks standardized benchmarks, and most existing systems do not report metrics such as EAR or coverage that capture this trade-off.</p>
</sec>
<sec>
<label>6.2</label>
<title>Operational integration</title>
<p>To bridge prototypes to production, inference latency is a critical component: a proactive predictor must return results fast enough to enable mitigation before the request completes. Lightweight models or hierarchical prediction strategies may be necessary. Integration with existing observability stacks, such as commercial APM tools or open-source platforms designed for <italic>post-hoc</italic> diagnosis, requires streaming trace ingestion, low-latency feature extraction, and seamless alert routing. Production environments also drift continuously through canary releases, traffic shifts, and instrumentation changes, causing models trained on historical data to silently degrade.</p>
</sec>
<sec>
<label>6.3</label>
<title>Scalability and efficiency</title>
<p>Large-scale deployments generate millions of traces per minute. Complementary to trace-driven predictors, scalability modeling for real-time interactive applications on clouds helps characterize scaling behavior and cost&#x02013;benefit trade-offs of mitigation actions under latency constraints (<xref ref-type="bibr" rid="B30">Meil&#x000E4;nder and Gorlatch, 2018</xref>). Training and inference at this scale demand strategies such as sampling-based training, online learning, and model compression. Memory efficiency is equally important: graph-based models must maintain dynamic adjacency structures, while sequence models over long traces incur quadratic attention costs unless efficient variants are employed. Co-designing models with streaming infrastructure offers a promising direction but requires tight coupling between ML and systems engineering.</p>
</sec>
<sec>
<label>6.4</label>
<title>From correlation to action</title>
<p>The hardest challenge is moving from correlation to actionable decisions. Predicting that a request is at risk is not equivalent to knowing which intervention will help. Such interventions may include compute-side actions (e.g., autoscaling and configuration changes) as well as network-side control, such as SDN-enabled QoS enforcement/traffic steering (<xref ref-type="bibr" rid="B14">Gorlatch et al., 2014</xref>). Complementary to trace-driven prediction, a line of research closes the loop by directly optimizing resource allocation and autoscaling to mitigate tail latency and SLO/SLA violations. Sinan (<xref ref-type="bibr" rid="B54">Zhang et al., 2021</xref>) uses ML models to estimate the performance impact of inter-service dependencies and allocates resources per tier to preserve end-to-end tail-latency targets. POBO (<xref ref-type="bibr" rid="B17">Guo et al., 2023</xref>) applies safe Bayesian optimization to search for resource configurations under system-wide and tail-latency constraints. DeepScaler (<xref ref-type="bibr" rid="B31">Meng et al., 2023</xref>) leverages spatiotemporal GNNs with adaptive graph learning for holistic autoscaling across microservices, and QueueFlower (<xref ref-type="bibr" rid="B4">Cao et al., 2024</xref>) performs dynamic queue balancing using real-time latency feedback without offline dependency profiling. These systems are highly complementary to the actionable outputs surveyed in this review and further motivate integrating trace-prefix warning, bottleneck localization, and what-if estimation with safe controllers in a closed-loop pipeline. Progress likely requires combining structured models that expose interpretable decomposition, targeted online validation, and causal reasoning that accounts for shared-resource confounding. Observational trace data alone cannot distinguish whether two services are slow due to causal dependency or shared congestion.</p>
</sec>
<sec>
<label>6.5</label>
<title>Outlook</title>
<p>Despite these challenges, trace-driven proactive SLO management is a rapidly growing area. The adoption of OpenTelemetry, the availability of public benchmarks, and industry interest in AIOps indicate continued progress. Key opportunities include standardized evaluation protocols reflecting real SLO semantics, integration of causal reasoning into predictive pipelines, and end-to-end systems that close the loop from prediction to automated mitigation.</p></sec>
</sec>
</body>
<back>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>MY: Conceptualization, Investigation, Project administration, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. HL: Investigation, Writing &#x02013; review &#x00026; editing. JD: Investigation, Writing &#x02013; review &#x00026; editing. KL: Investigation, Writing &#x02013; review &#x00026; editing. TD: Visualization, Writing &#x02013; review &#x00026; editing. YF: Visualization, Writing &#x02013; review &#x00026; editing. CY: Project administration, Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>MY, HL, JD, TD, YF, and CY were employed by Electric Power Research Institute, CSG. KL was employed by China Southern Power Grid.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. The author(s) used ChatGPT (OpenAI) solely for language editing and grammar improvement. ChatGPT was not used to generate any technical content, results, figures or tables, or references. All edits were reviewed and approved by the author(s), who take full responsibility for the final manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bai</surname> <given-names>S.</given-names></name> <name><surname>Kolter</surname> <given-names>J. Z.</given-names></name> <name><surname>Koltun</surname> <given-names>V.</given-names></name></person-group> (<year>2018</year>). <article-title>An empirical evaluation of generic convolutional and recurrent networks for sequence modeling</article-title>. <source>arXiv preprint arXiv:1803.01271</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1803.01271</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Beyer</surname> <given-names>B.</given-names></name> <name><surname>Jones</surname> <given-names>C.</given-names></name> <name><surname>Petoff</surname> <given-names>J.</given-names></name> <name><surname>Murphy</surname> <given-names>N. R.</given-names></name></person-group> (<year>2016</year>). <source>Site Reliability Engineering: How Google Runs Production Systems</source>. <publisher-loc>Sebastopol, CA</publisher-loc>: <publisher-name>O&#x00027;Reilly Media, Inc</publisher-name>.</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bilski</surname> <given-names>J. M.</given-names></name> <name><surname>Jastrz&#x00119;bska</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Calimera: a new early time series classification method</article-title>. <source>Inform. Process. Manag</source>. <volume>60</volume>:<fpage>103465</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ipm.2023.103465</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Guo</surname> <given-names>H.</given-names></name> <name><surname>He</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Queueflower: orchestrating microservice workflows via dynamic queue balancing,&#x0201D;</article-title> in <source>2024 IEEE International Conference on Web Services (ICWS)</source> (<publisher-loc>Shenzhen</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1293</fpage>&#x02013;<lpage>1299</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICWS62655.2024.00155</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>F.</given-names></name> <name><surname>Jiang</surname> <given-names>J.</given-names></name> <name><surname>Zhong</surname> <given-names>G.</given-names></name> <name><surname>Xu</surname> <given-names>D.</given-names></name> <name><surname>Tan</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>TraceGra: a trace-based anomaly detection for microservice using graph deep learning</article-title>. <source>Comput. Commun.</source> <volume>204</volume>, <fpage>109</fpage>&#x02013;<lpage>117</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.comcom.2023.03.028</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Guestrin</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Xgboost: a scalable tree boosting system,&#x0201D;</article-title> in <source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD&#x00027;16</source> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>785</fpage>&#x02013;<lpage>794</lpage>. doi: <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cho</surname> <given-names>K.</given-names></name> <name><surname>van Merrienboer</surname> <given-names>B.</given-names></name> <name><surname>Gulcehre</surname> <given-names>C.</given-names></name> <name><surname>Bahdanau</surname> <given-names>D.</given-names></name> <name><surname>Bougares</surname> <given-names>F.</given-names></name> <name><surname>Schwenk</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>&#x0201C;Learning phrase representations using rnn encoder&#x02013;decoder for statistical machine translation,&#x0201D;</article-title> in <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source> (<publisher-loc>Doha</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>1724</fpage>&#x02013;<lpage>1734</lpage>. doi: <pub-id pub-id-type="doi">10.3115/v1/D14-1179</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cortez</surname> <given-names>E.</given-names></name> <name><surname>Bonde</surname> <given-names>A.</given-names></name> <name><surname>Muzio</surname> <given-names>A.</given-names></name> <name><surname>Russinovich</surname> <given-names>M.</given-names></name> <name><surname>Fontoura</surname> <given-names>M.</given-names></name> <name><surname>Bianchini</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Resource central: understanding and predicting workloads for improved resource management in large cloud platforms,&#x0201D;</article-title> in <source>Proceedings of the 26th Symposium on Operating Systems Principles, SOSP&#x00027;17</source> (<publisher-loc>Shanghai</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>153</fpage>&#x02013;<lpage>167</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3132747.3132772</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dean</surname> <given-names>J.</given-names></name> <name><surname>Barroso</surname> <given-names>L. A.</given-names></name></person-group> (<year>2013</year>). <article-title>The tail at scale</article-title>. <source>Commun. ACM</source> <volume>56</volume>, <fpage>74</fpage>&#x02013;<lpage>80</lpage>. doi: <pub-id pub-id-type="doi">10.1145/2408776.2408794</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Dragoni</surname> <given-names>N.</given-names></name> <name><surname>Giallorenzo</surname> <given-names>S.</given-names></name> <name><surname>Lafuente</surname> <given-names>A. L.</given-names></name> <name><surname>Mazzara</surname> <given-names>M.</given-names></name> <name><surname>Montesi</surname> <given-names>F.</given-names></name> <name><surname>Mustafin</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2017</year>). <source>Microservices: Yesterday, Today, and Tomorrow</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>, <fpage>195</fpage>&#x02013;<lpage>216</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-67425-4_12</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Du</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Zheng</surname> <given-names>G.</given-names></name> <name><surname>Srikumar</surname> <given-names>V.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Deeplog: anomaly detection and diagnosis from system logs through deep learning,&#x0201D;</article-title> in <source>Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, CCS&#x00027;17</source> (<publisher-loc>Dallas, TX</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1285</fpage>&#x02013;<lpage>1298</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3133956.3134015</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gan</surname> <given-names>Y.</given-names></name> <name><surname>Liang</surname> <given-names>M.</given-names></name> <name><surname>Dev</surname> <given-names>S.</given-names></name> <name><surname>Lo</surname> <given-names>D.</given-names></name> <name><surname>Delimitrou</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Sage: practical and scalable ml-driven performance debugging in microservices,&#x0201D;</article-title> in <source>Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS&#x00027;21</source> (<publisher-loc>Virtual</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>135</fpage>&#x02013;<lpage>151</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3445814.3446700</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gan</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Cheng</surname> <given-names>D.</given-names></name> <name><surname>Shetty</surname> <given-names>A.</given-names></name> <name><surname>Rathi</surname> <given-names>P.</given-names></name> <name><surname>Katarki</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;An open-source benchmark suite for microservices and their hardware-software implications for cloud &#x00026; edge systems,&#x0201D;</article-title> in <source>Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS&#x00027;19</source> (<publisher-loc>Providence, RI</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>3</fpage>&#x02013;<lpage>18</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3297858.3304013</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gorlatch</surname> <given-names>S.</given-names></name> <name><surname>Humernbrum</surname> <given-names>T.</given-names></name> <name><surname>Glinka</surname> <given-names>F.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Improving qos in real-time internet applications: from best-effort to software-defined networks,&#x0201D;</article-title> in <source>2014 International Conference on Computing, Networking and Communications (ICNC)</source> (<publisher-loc>Honolulu, HI</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>189</fpage>&#x02013;<lpage>193</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCNC.2014.6785329</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Grohmann</surname> <given-names>J.</given-names></name> <name><surname>Straesser</surname> <given-names>M.</given-names></name> <name><surname>Chalbani</surname> <given-names>A.</given-names></name> <name><surname>Eismann</surname> <given-names>S.</given-names></name> <name><surname>Arian</surname> <given-names>Y.</given-names></name> <name><surname>Herbst</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Suanming: explainable prediction of performance degradations in microservice applications,&#x0201D;</article-title> in <source>Proceedings of the ACM/SPEC International Conference on Performance Engineering, ICPE&#x00027;21</source> (<publisher-loc>Virtual</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>165</fpage>&#x02013;<lpage>176</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3427921.3450248</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>C.</given-names></name> <name><surname>Pleiss</surname> <given-names>G.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name> <name><surname>Weinberger</surname> <given-names>K. Q.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;On calibration of modern neural networks,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>Sydney</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1321</fpage>&#x02013;<lpage>1330</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>H.</given-names></name> <name><surname>Cao</surname> <given-names>H.</given-names></name> <name><surname>He</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Shi</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>Pobo: safe and optimal resource management for cloud microservices</article-title>. <source>Perform. Eval</source>. <volume>162</volume>:<fpage>102376</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.peva.2023.102376</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hamilton</surname> <given-names>W. L.</given-names></name> <name><surname>Ying</surname> <given-names>R.</given-names></name> <name><surname>Leskovec</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Inductive representation learning on large graphs,&#x0201D;</article-title> in <source>Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS&#x00027;17</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>), <fpage>1025</fpage>&#x02013;<lpage>1035</lpage>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>S.</given-names></name> <name><surname>Zhu</surname> <given-names>J.</given-names></name> <name><surname>He</surname> <given-names>P.</given-names></name> <name><surname>Lyu</surname> <given-names>M. R.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Experience report: system log analysis for anomaly detection,&#x0201D;</article-title> in <source>2016 IEEE 27th International Symposium on Software Reliability Engineering (ISSRE)</source> (<publisher-loc>Ottawa, ON</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>207</fpage>&#x02013;<lpage>218</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ISSRE.2016.21</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>G.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>A spatiotemporal deep learning approach for unsupervised anomaly detection in cloud systems</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>34</volume>, <fpage>1705</fpage>&#x02013;<lpage>1719</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2020.3027736</pub-id><pub-id pub-id-type="pmid">33064657</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hochreiter</surname> <given-names>S.</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>1997</year>). <article-title>Long short-term memory</article-title>. <source>Neural Comput</source>. <volume>9</volume>, <fpage>1735</fpage>&#x02013;<lpage>1780</lpage>. doi: <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Huye</surname> <given-names>D.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Sambasivan</surname> <given-names>R. R.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Systemizing and mitigating topological inconsistencies in alibaba&#x00027;s microservice call-graph datasets,&#x0201D;</article-title> in <source>Proceedings of the 15th ACM/SPEC International Conference on Performance Engineering, ICPE&#x00027;24</source> (<publisher-loc>London</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>276</fpage>&#x02013;<lpage>285</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3629526.3645043</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ke</surname> <given-names>G.</given-names></name> <name><surname>Meng</surname> <given-names>Q.</given-names></name> <name><surname>Finley</surname> <given-names>T.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <name><surname>Chen</surname> <given-names>W.</given-names></name> <name><surname>Ma</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Lightgbm: a highly efficient gradient boosting decision tree,&#x0201D;</article-title> in <source>Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS&#x00027;17</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>), <fpage>3149</fpage>&#x02013;<lpage>3157</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kipf</surname> <given-names>T. N.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Semi-supervised classification with graph convolutional networks,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname> <given-names>S.</given-names></name> <name><surname>Hu</surname> <given-names>R.</given-names></name> <name><surname>Zhou</surname> <given-names>H.</given-names></name> <name><surname>He</surname> <given-names>C.</given-names></name> <name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Zhao</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>A new generation of power dispatching automation system based on cloud computing architecture</article-title>. <source>South. Power Syst. Technol</source>. <volume>10</volume>, <fpage>8</fpage>&#x02013;<lpage>14</lpage>. Chinese. doi: <pub-id pub-id-type="doi">10.13648/j.cnki.issn1674-0629.2016.06.002</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Lohse</surname> <given-names>C.</given-names></name> <name><surname>Tsutsumi</surname> <given-names>D.</given-names></name> <name><surname>Ba</surname> <given-names>A.</given-names></name> <name><surname>Harsha</surname> <given-names>P.</given-names></name> <name><surname>Subramanian</surname> <given-names>C.</given-names></name> <name><surname>Straesser</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>&#x0201C;Causal latency modelling for cloud microservices,&#x0201D;</article-title> in <source>2025 IEEE 18th International Conference on Cloud Computing (CLOUD)</source> (<publisher-loc>Helsinki</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>143</fpage>&#x02013;<lpage>151</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CLOUD67622.2025.00024</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>H.</given-names></name> <name><surname>Lu</surname> <given-names>C.</given-names></name> <name><surname>Ye</surname> <given-names>K.</given-names></name> <name><surname>Xu</surname> <given-names>G.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Characterizing microservice dependency and performance: Alibaba trace analysis,&#x0201D;</article-title> in <source>Proceedings of the ACM Symposium on Cloud Computing, SoCC&#x00027;21</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>412</fpage>&#x02013;<lpage>426</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3472883.3487003</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>H.</given-names></name> <name><surname>Ye</surname> <given-names>K.</given-names></name> <name><surname>Xu</surname> <given-names>G.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Yang</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;The power of prediction: microservice auto scaling via workload learning,&#x0201D;</article-title> in <source>Proceedings of the 13th Symposium on Cloud Computing, SoCC&#x00027;22</source> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>355</fpage>&#x02013;<lpage>369</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3542929.3563477</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Meil&#x000E4;nder</surname> <given-names>D.</given-names></name> <name><surname>Glinka</surname> <given-names>F.</given-names></name> <name><surname>Gorlatch</surname> <given-names>S.</given-names></name> <name><surname>Lin</surname> <given-names>L.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Liao</surname> <given-names>X.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Bringing mobile online games to clouds,&#x0201D;</article-title> in <source>2014 IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS)</source> (<publisher-loc>Toronto, ON</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>340</fpage>&#x02013;<lpage>345</lpage>. doi: <pub-id pub-id-type="doi">10.1109/INFCOMW.2014.6849255</pub-id></mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meil&#x000E4;nder</surname> <given-names>D.</given-names></name> <name><surname>Gorlatch</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Modeling the scalability of real-time online interactive applications on clouds</article-title>. <source>Fut. Gen. Comput. Syst</source>. <volume>86</volume>, <fpage>1019</fpage>&#x02013;<lpage>1031</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.future.2017.07.041</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Meng</surname> <given-names>C.</given-names></name> <name><surname>Song</surname> <given-names>S.</given-names></name> <name><surname>Tong</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>M.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;DeepScaler: holistic autoscaling for microservices based on spatiotemporal GNN with adaptive graph learning,&#x0201D;</article-title> in <source>2023 38th IEEE/ACM International Conference on Automated Software Engineering (ASE)</source> (<publisher-loc>Luxembourg</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>53</fpage>&#x02013;<lpage>65</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ASE56229.2023.00038</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><collab>Microsoft</collab> (<year>2019a</year>). <article-title>AzurePublicDatasetV1</article-title>. <source>GitHub Repository</source>. <publisher-loc>Redmond, WA</publisher-loc>: <publisher-name>Microsoft</publisher-name> (Accessed January 07, 2026).</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><collab>Microsoft</collab> (<year>2019b</year>). <article-title>AzurePublicDatasetV2</article-title>. <source>GitHub Repository</source>. <publisher-loc>Redmond, WA</publisher-loc>: <publisher-name>Microsoft</publisher-name> (Accessed January 07, 2026).</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Newman</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <source>Building Microservices: Designing Fine-Grained Systems</source>. <publisher-loc>Sebastopol, CA</publisher-loc>: <publisher-name>O&#x00027;Reilly Media, Inc</publisher-name>.</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Notaro</surname> <given-names>P.</given-names></name> <name><surname>Cardoso</surname> <given-names>J.</given-names></name> <name><surname>Gerndt</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>A survey of AIOps methods for failure management</article-title>. <source>ACM Trans. Intell. Syst. Technol</source>. <volume>12</volume>, <fpage>1</fpage>&#x02013;<lpage>45</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3483424</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><collab>OpenTelemetry</collab> (<year>2025</year>). <source>OpenTelemetry Specification 1.52.0</source>. <publisher-name>OpenTelemetry</publisher-name> (Accessed January 07, 2026).</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>J.</given-names></name> <name><surname>Choi</surname> <given-names>B.</given-names></name> <name><surname>Lee</surname> <given-names>C.</given-names></name> <name><surname>Han</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;GRAF: a graph neural network based proactive resource allocation framework for SLO-oriented microservices,&#x0201D;</article-title> in <source>Proceedings of the 17th International Conference on emerging Networking EXperiments and Technologies, CoNEXT&#x00027;21</source> (<publisher-loc>Virtual</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>154</fpage>&#x02013;<lpage>167</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3485983.3494866</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>J.</given-names></name> <name><surname>Choi</surname> <given-names>B.</given-names></name> <name><surname>Lee</surname> <given-names>C.</given-names></name> <name><surname>Han</surname> <given-names>D.</given-names></name></person-group> (<year>2024</year>). <article-title>Graph neural network-based SLO-aware proactive resource autoscaling framework for microservices</article-title>. <source>IEEE/ACM Trans. Netw</source>. <volume>32</volume>, <fpage>3331</fpage>&#x02013;<lpage>3346</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNET.2024.3393427</pub-id></mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Prokhorenkova</surname> <given-names>L.</given-names></name> <name><surname>Gusev</surname> <given-names>G.</given-names></name> <name><surname>Vorobev</surname> <given-names>A.</given-names></name> <name><surname>Dorogush</surname> <given-names>A. V.</given-names></name> <name><surname>Gulin</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;CatBoost: unbiased boosting with categorical features,&#x0201D;</article-title> in <source>Proceedings of the 32nd International Conference on Neural Information Processing Systems, NIPS&#x00027;18</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>), <fpage>6639</fpage>&#x02013;<lpage>6649</lpage>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Qian</surname> <given-names>W.</given-names></name> <name><surname>Zhao</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Chow</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Learning unified system representations for microservice tail latency prediction</article-title>. <source>arXiv preprint</source> arXiv:2508.01635. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2508.01635</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Reiss</surname> <given-names>C.</given-names></name> <name><surname>Tumanov</surname> <given-names>A.</given-names></name> <name><surname>Ganger</surname> <given-names>G. R.</given-names></name> <name><surname>Katz</surname> <given-names>R. H.</given-names></name> <name><surname>Kozuch</surname> <given-names>M. A.</given-names></name></person-group> (<year>2012</year>). <article-title>&#x0201C;Heterogeneity and dynamicity of clouds at scale: Google trace analysis,&#x0201D;</article-title> in <source>Proceedings of the Third ACM Symposium on Cloud Computing, SOCC&#x00027;12</source> (<publisher-loc>San Jose, CA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1145/2391229.2391236</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sch&#x000E4;fer</surname> <given-names>P.</given-names></name> <name><surname>Leser</surname> <given-names>U.</given-names></name></person-group> (<year>2020</year>). <article-title>Teaser: early and accurate time series classification</article-title>. <source>Data Min. Knowl. Discov</source>. <volume>34</volume>, <fpage>1336</fpage>&#x02013;<lpage>1362</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10618-020-00690-z</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sigelman</surname> <given-names>B. H.</given-names></name> <name><surname>Barroso</surname> <given-names>L. A.</given-names></name> <name><surname>Burrows</surname> <given-names>M.</given-names></name> <name><surname>Stephenson</surname> <given-names>P.</given-names></name> <name><surname>Plakal</surname> <given-names>M.</given-names></name> <name><surname>Beaver</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2010</year>). <source>Dapper, a Large-Scale Distributed Systems Tracing Infrastructure</source>. Technical report. <publisher-loc>Mountain View, CA</publisher-loc>: <publisher-name>Google, Inc</publisher-name>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tam</surname> <given-names>D. S. H.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>H.</given-names></name> <name><surname>Xie</surname> <given-names>S.</given-names></name> <name><surname>Lau</surname> <given-names>W. C.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;PERT-GNN: latency prediction for microservice-based cloud-native applications via graph neural networks,&#x0201D;</article-title> in <source>Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD&#x00027;23</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>2155</fpage>&#x02013;<lpage>2165</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3580305.3599465</pub-id></mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tam</surname> <given-names>D. S. H.</given-names></name> <name><surname>Xu</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Xie</surname> <given-names>S.</given-names></name> <name><surname>Lau</surname> <given-names>W. C.</given-names></name></person-group> (<year>2025</year>). <article-title>FastPERT: towards fast microservice application latency prediction via structural inductive bias over PERT networks</article-title>. <source>Proc. AAAI Conf. Artif. Intell</source>. <volume>39</volume>, <fpage>20787</fpage>&#x02013;<lpage>20795</lpage>. doi: <pub-id pub-id-type="doi">10.1609/aaai.v39i19.34291</pub-id></mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tirmazi</surname> <given-names>M.</given-names></name> <name><surname>Barker</surname> <given-names>A.</given-names></name> <name><surname>Deng</surname> <given-names>N.</given-names></name> <name><surname>Haque</surname> <given-names>M. E.</given-names></name> <name><surname>Qin</surname> <given-names>Z. G.</given-names></name> <name><surname>Hand</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Borg: the next generation,&#x0201D;</article-title> in <source>Proceedings of the Fifteenth European Conference on Computer Systems, EuroSys&#x00027;20</source> (<publisher-loc>Heraklion</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>14</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3342195.3387517</pub-id></mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Attention is all you need,&#x0201D;</article-title> in <source>Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS&#x00027;17</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>), <fpage>6000</fpage>&#x02013;<lpage>6010</lpage>.</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Veli&#x0010D;kovi&#x00107;</surname> <given-names>P.</given-names></name> <name><surname>Cucurull</surname> <given-names>G.</given-names></name> <name><surname>Casanova</surname> <given-names>A.</given-names></name> <name><surname>Romero</surname> <given-names>A.</given-names></name> <name><surname>Li&#x000F2;</surname> <given-names>P.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name></person-group> (<year>2017</year>). <article-title>Graph attention networks</article-title>. <source>arXiv preprint</source> arXiv:1710.10903. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1710.10903</pub-id></mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal"><collab>Weaveworks</collab> (<year>2023</year>). <source>Sock Shop: A Microservice Demo Application</source>. <publisher-name>Weaveworks</publisher-name> (Archived Dec 29, 2023; Accessed January 07, 2026).</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wen</surname> <given-names>B.</given-names></name> <name><surname>Su</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Gu</surname> <given-names>Q.</given-names></name> <name><surname>Sun</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). <article-title>Architecture design of an intelligent dispatching supporting platform based on integration of core business</article-title>. <source>South. Power Syst. Technol</source>. <volume>10</volume>, <fpage>15</fpage>&#x02013;<lpage>19</lpage>. Chinese. doi: <pub-id pub-id-type="doi">10.13648/j.cnki.issn1674-0629.2016.06.003</pub-id></mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>L.</given-names></name> <name><surname>Tordsson</surname> <given-names>J.</given-names></name> <name><surname>Elmroth</surname> <given-names>E.</given-names></name> <name><surname>Kao</surname> <given-names>O.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;MicroRCA: root cause localization of performance issues in microservices,&#x0201D;</article-title> in <source>NOMS 2020 - 2020 IEEE/IFIP Network Operations and Management Symposium</source> (<publisher-loc>Budapest</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1109/NOMS47738.2020.9110353</pub-id></mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xing</surname> <given-names>Z.</given-names></name> <name><surname>Pei</surname> <given-names>J.</given-names></name> <name><surname>Yu</surname> <given-names>P. S.</given-names></name></person-group> (<year>2012</year>). <article-title>Early classification on time series</article-title>. <source>Knowl. Inform. Syst</source>. <volume>31</volume>, <fpage>105</fpage>&#x02013;<lpage>127</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10115-011-0400-x</pub-id></mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>G.</given-names></name> <name><surname>Chen</surname> <given-names>P.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Guan</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Jing</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;MicroRank: end-to-end latency issue localization with extended spectrum analysis in microservice environments,&#x0201D;</article-title> in <source>Proceedings of the Web Conference 2021, WWW&#x00027;21</source> (<publisher-loc>Ljubljana</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>3087</fpage>&#x02013;<lpage>3098</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3442381.3449905</pub-id></mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Hua</surname> <given-names>W.</given-names></name> <name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Suh</surname> <given-names>G. E.</given-names></name> <name><surname>Delimitrou</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Sinan: ML-based and QoS-aware resource management for cloud microservices,&#x0201D;</article-title> in <source>Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS&#x00027;21</source> (<publisher-name>ACM</publisher-name>), <fpage>167</fpage>&#x02013;<lpage>181</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3445814.3446693</pub-id></mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Ramanathan</surname> <given-names>M. K.</given-names></name> <name><surname>Raj</surname> <given-names>P.</given-names></name> <name><surname>Parwal</surname> <given-names>A.</given-names></name> <name><surname>Sherwood</surname> <given-names>T.</given-names></name> <name><surname>Chabbi</surname> <given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;CRISP: critical path analysis of large-scale microservice architectures,&#x0201D;</article-title> in <source>2022 USENIX Annual Technical Conference (USENIX ATC 22)</source> (<publisher-loc>Carlsbad, CA</publisher-loc>: <publisher-name>USENIX Association</publisher-name>), <fpage>655</fpage>&#x02013;<lpage>672</lpage>.</mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>C.</given-names></name> <name><surname>Ma</surname> <given-names>M.</given-names></name> <name><surname>Zhong</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Tan</surname> <given-names>Z.</given-names></name> <name><surname>Xiong</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Robust multimodal failure detection for microservice systems,&#x0201D;</article-title> in <source>Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD&#x00027;23</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>5639</fpage>&#x02013;<lpage>5649</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3580305.3599902</pub-id></mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>H.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Peng</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Xiong</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2021a</year>). <article-title>Informer: beyond efficient transformer for long sequence time-series forecasting</article-title>. <source>Proc. AAAI Conf. Artif. Intell</source>. <volume>35</volume>, <fpage>11106</fpage>&#x02013;<lpage>11115</lpage>. doi: <pub-id pub-id-type="doi">10.1609/aaai.v35i12.17325</pub-id></mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>X.</given-names></name> <name><surname>Peng</surname> <given-names>X.</given-names></name> <name><surname>Xie</surname> <given-names>T.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name> <name><surname>Ji</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2021b</year>). <article-title>Fault analysis and debugging of microservice systems: industrial survey, benchmark system, and empirical study</article-title>. <source>IEEE Trans. Softw. Eng</source>. <volume>47</volume>, <fpage>243</fpage>&#x02013;<lpage>260</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TSE.2018.2887384</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3279948/overview">Xiaojie Wang</ext-link>, Chongqing University of Posts and Telecommunications, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3349470/overview">Sergei Gorlatch</ext-link>, University of M&#x000FC;nster, Germany</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3361351/overview">Hongchen Cao</ext-link>, ShanghaiTech University, China</p>
</fn>
</fn-group>
</back>
</article>