<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="systematic-review" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1749956</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Systematic Review</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Formal methods for safety-critical machine learning: a systematic literature review</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Newcomb</surname>
<given-names>Alexandra</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3284034"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ochoa</surname>
<given-names>Omar</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3327275"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>Department of Electrical Engineering and Computer Science, Embry-Riddle Aeronautical University</institution>, <city>Daytona Beach</city>, <state>FL</state>, <country country="US">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Alexandra Newcomb, <email xlink:href="mailto:davidofa@my.erau.edu">davidofa@my.erau.edu</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-18">
<day>18</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1749956</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>22</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Newcomb and Ochoa.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Newcomb and Ochoa</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The integration of Machine Learning (ML) systems into safety-critical domains heightens the need for rigorous safety guarantees. Traditional testing-based verification techniques are insufficient for fully capturing the complex, data-driven, and non-deterministic behaviors of modern ML models. Therefore, applying formal methods&#x2014;which provide rigorous mathematical guarantees of a system&#x2019;s adherence to specified properties&#x2014;to ML systems has been of particular interest in recent years.</p>
</sec>
<sec>
<title>Methods</title>
<p>This work presents a comprehensive Systematic Literature Review of peer-reviewed research from 2020 to mid-2025 on the use of formal methods to enhance ML safety, specifically for safety-critical applications. Articles selected present empirical research applying formal methods to modern machine learning approaches. Application domains as well as gaps, limitations, and challenges in this research area are compiled and presented.</p>
</sec>
<sec>
<title>Results</title>
<p>Following a structured protocol, 46 studies were identified across four major digital libraries and classified into eight categories: Reachability and Over-Approximation Techniques, SMT-based Verification and Abstraction/Refinement, MILP/ILP Approaches, Model Checking Approaches, Runtime Verification Approaches, Shielding Techniques, Control Barrier Function Methods, and Risk Verification Methods. The review synthesizes methodological advances, application areas, and comparative strengths over traditional verification, while also presenting bibliometric trends in the literature.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Analysis reveals persistent challenges and gaps, including scalability to large and complex models, integration with training processes, and limited real-world validation. Future research opportunities include developing integrated training-verification loops, scalable verification frameworks, hybrid formal methods, and novel techniques for emerging ML paradigms such as Large Language Models. This work serves both as a state-of-the-art reference and as a roadmap for advancing the safe deployment of ML systems.</p>
</sec>
</abstract>
<kwd-group>
<kwd>formal methods</kwd>
<kwd>machine learning</kwd>
<kwd>safe autonomy</kwd>
<kwd>safety-critical systems</kwd>
<kwd>software verification</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the National Science Foundation Graduate Research Fellowship Program under Grant No. 2445056.</funding-statement>
</funding-group>
<counts>
<fig-count count="7"/>
<table-count count="4"/>
<equation-count count="0"/>
<ref-count count="64"/>
<page-count count="16"/>
<word-count count="12340"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>The increasing deployment of Machine Learning (ML) systems in critical sectors such as transportation, healthcare, and industrial automation introduces significant risks due to the inherent unpredictability and black-box nature of these complex models (<xref ref-type="bibr" rid="ref57">Xiao et al., 2023</xref>; <xref ref-type="bibr" rid="ref9">Bengio et al., 2025</xref>; <xref ref-type="bibr" rid="ref21">Guendouzi et al., 2025</xref>; <xref ref-type="bibr" rid="ref1">Adadi and Berrada, 2018</xref>). Failures in these systems can lead to severe consequences, including loss of human life, economic damage, and erosion of public trust. Furthermore, the rise of agentic AI warrants rigorous safety guarantees to ensure these systems work in the best interest of the public and are not susceptible to security breaches (<xref ref-type="bibr" rid="ref9">Bengio et al., 2025</xref>). Therefore, safe ML has been identified as a critical area of future ML research. Notably, the first International Artificial Intelligence (AI) Safety Report, written as a collaboration by 96 AI experts following the first International AI Safety Summit in 2023, discusses the need for safety nets within ML systems (<xref ref-type="bibr" rid="ref9">Bengio et al., 2025</xref>).</p>
<p>Traditional verification approaches, i.e., testing, struggle to provide comprehensive safety guarantees, especially given the data-driven and adaptive nature of ML algorithms (<xref ref-type="bibr" rid="ref47">Samadi et al., 2024</xref>; <xref ref-type="bibr" rid="ref59">Zhang et al., 2022</xref>; <xref ref-type="bibr" rid="ref33">Meyer, 2023</xref>). Formal methods, encompassing rigorous mathematical techniques like Model Checking, Theorem Proving, and Runtime Verification, offer potential solutions to these challenges by enabling explicit reasoning about system correctness, safety properties, and robustness (<xref ref-type="bibr" rid="ref45">Paul et al., 2023</xref>). This review defines formal verification as any rigorous mathematical method that reasons about a system&#x2019;s behavior logically and provides theoretical guarantees against specified formal properties (<xref ref-type="bibr" rid="ref45">Paul et al., 2023</xref>; <xref ref-type="bibr" rid="ref56">Wang and Tepfenhart, 2020</xref>). Despite the promise of formal methods, the integration of formal methods into ML safety and quality assurance is challenging due to the non-deterministic and often unexplainable nature of ML models. Research within this area remains fragmented and underexplored, motivating the need for a systematic synthesis of existing research to clearly delineate the current state, capabilities, and limitations of formal methods for safe ML.</p>
<p>This work presents a Systematic Literature Review (SLR) of current research in the area of ML safety and quality assurance through the usage of formal methods. The SLR categorizes current work into eight categories: Reachability and Over-Approximation Techniques, SMT-based Verification and Abstraction/Refinement, MILP/ILP Approaches, Model Checking Approaches, Runtime Verification Approaches, Shielding Techniques, Control Barrier Function Methods, and Risk Verification Methods. Current literature belonging to each area is then detailed. Note that this SLR focuses specifically on ML safety through formal verification, and both explainability and LLM hallucination are therefore outside the scope of this review.</p>
<p>Section 2 details the research method of the SLR followed by both bibliometric and technical results presented in Sections 3 and 4, respectively. Section 5 presents the analysis and discussion of the extracted data. Section 6 summarizes the quality assessment results. Lastly, Section 7 presents related work, and Section 8 briefly discusses strengths and limitations of the methods.</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Research method</title>
<p>This section details the protocol used to conduct the SLR. This protocol is based on the methodologies presented by <xref ref-type="bibr" rid="ref10">Carrera-Rivera et al. (2022)</xref>, <xref ref-type="bibr" rid="ref64">Zhou et al. (2015)</xref>, and <xref ref-type="bibr" rid="ref28">Kitchenham and Charters (2007)</xref>.</p>
<sec id="sec3">
<label>2.1</label>
<title>PICOC and synonyms</title>
<p>When framing the research questions and search terms, the Population, Intervention, Comparison, Outcome, and Context (PICOC) method was used (<xref ref-type="bibr" rid="ref10">Carrera-Rivera et al., 2022</xref>; <xref ref-type="bibr" rid="ref28">Kitchenham and Charters, 2007</xref>). <xref ref-type="table" rid="tab1">Table 1</xref> presents the PICOC terms for this research.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>PICOC terms.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">PICOC criteria</th>
<th align="left" valign="top">Term</th>
<th align="left" valign="top">Synonyms</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Population</td>
<td align="left" valign="top">Machine learning</td>
<td align="left" valign="top">Artificial intelligence, neural network, reinforcement learning, large language model, transformer</td>
</tr>
<tr>
<td align="left" valign="top">Intervention</td>
<td align="left" valign="top">Formal methods</td>
<td align="left" valign="top">Formal verification, temporal logic, model checking, theorem proving, runtime verification, reactive synthesis, static analysis, automated reasoning</td>
</tr>
<tr>
<td align="left" valign="top">Comparison</td>
<td align="left" valign="top">Testing</td>
<td align="left" valign="top">Traditional verification</td>
</tr>
<tr>
<td align="left" valign="top">Outcome</td>
<td align="left" valign="top">Safety</td>
<td align="left" valign="top">Quality, safe</td>
</tr>
<tr>
<td align="left" valign="top">Context</td>
<td align="left" valign="top">Safety-critical</td>
<td align="left" valign="top">Critical system, life-critical, mission-critical, high-reliability</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Research questions</title>
<p>This research addresses the following three Research Questions (RQs), formulated from the PICOC terms.</p>
<list list-type="order">
<list-item>
<p>What formal methods are currently used to ensure the safety of ML systems and how do they improve upon traditional verification approaches?</p>
</list-item>
<list-item>
<p>In which domains or applications have these formal methods been successfully employed for safe ML?</p>
</list-item>
<list-item>
<p>What gaps, limitations, and challenges exist within current work combining formal methods and ML safety, and what future research directions can be identified from these gaps?</p>
</list-item>
</list>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Digital library sources</title>
<p>Articles for the SLR were selected from IEEE Xplore, ACM Digital Library, Science Direct by Elsevier, and Springer. These digital libraries were chosen for their relevance to the RQs and their reputability for publishing peer reviewed articles.</p>
</sec>
<sec id="sec6">
<label>2.4</label>
<title>Search strings</title>
<p>Using the PICOC criteria, the following search string was synthesized: (&#x201C;Machine Learning&#x201D; OR &#x201C;Artificial Intelligence&#x201D; OR &#x201C;Neural Network&#x201D; OR &#x201C;Reinforcement Learning&#x201D; OR &#x201C;Large Language Model&#x201D; OR &#x201C;Transformer&#x201D;) AND (&#x201C;Formal Methods&#x201D; OR &#x201C;Formal Verification&#x201D; OR &#x201C;Temporal Logic&#x201D; OR &#x201C;Model Checking&#x201D; OR &#x201C;Theorem Proving&#x201D; OR &#x201C;Runtime Verification&#x201D; OR &#x201C;Reactive Synthesis&#x201D; OR &#x201C;Automated Reasoning&#x201D;) AND (&#x201C;Testing&#x201D; OR &#x201C;Traditional Verification&#x201D;) AND (&#x201C;Safety&#x201D; OR &#x201C;Quality&#x201D; OR &#x201C;Safe&#x201D;) AND (&#x201C;Safety-Critical&#x201D; OR &#x201C;Critical System&#x201D; OR &#x201C;Life-Critical&#x201D; OR &#x201C;Mission-Critical&#x201D; OR &#x201C;High-Reliability&#x201D;).</p>
<p>An additional title search term was used for Springer to narrow down results further: &#x201C;Formal&#x201D; OR &#x201C;Temporal Logic&#x201D; OR &#x201C;Verification&#x201D; OR &#x201C;Model Checking&#x201D; OR &#x201C;Runtime.&#x201D; Because Science Direct only allows eight Boolean connectors per query, the search string was split into the following two search strings, for Science Direct only:</p>
<list list-type="order">
<list-item>
<p>(&#x201C;Machine Learning&#x201D; OR &#x201C;Artificial Intelligence&#x201D; OR &#x201C;Neural Network&#x201D;) AND (&#x201C;Formal Methods&#x201D; OR &#x201C;Formal Verification&#x201D;) AND (&#x201C;Testing&#x201D;) AND (&#x201C;Safety&#x201D;) AND (&#x201C;Safety-Critical&#x201D;)</p>
</list-item>
<list-item>
<p>(&#x201C;Reinforcement Learning&#x201D; OR &#x201C;Large Language Model&#x201D; OR &#x201C;Transformer&#x201D;) AND (&#x201C;Model Checking&#x201D; OR &#x201C;Runtime Verification&#x201D;) AND (&#x201C;Testing&#x201D;) AND (&#x201C;Quality&#x201D; OR &#x201C;Safe&#x201D;) AND (&#x201C;Safety-Critical&#x201D;).</p>
</list-item>
</list>
</sec>
<sec id="sec7">
<label>2.5</label>
<title>Inclusion and exclusion criteria</title>
<p>The inclusion and exclusion criteria are presented in <xref ref-type="table" rid="tab2">Table 2</xref>. Criteria are adapted from <xref ref-type="bibr" rid="ref10">Carrera-Rivera et al. (2022)</xref> with several additions. Due to the significant development in ML in the last five years, articles were chosen to be no earlier than 2020. Despite their lower quality compared to conference and journal articles, workshop articles were included, as these articles frequently present novel ideas for future exploration.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Inclusion and exclusion criteria.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Criteria</th>
<th align="left" valign="top">Inclusion</th>
<th align="left" valign="top">Exclusion</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Time Period</td>
<td align="left" valign="top">2020 to June 2025 (5.5&#x202F;years)</td>
<td align="left" valign="top">Articles published before 2020 and after June 2025.</td>
</tr>
<tr>
<td align="left" valign="top">Language</td>
<td align="left" valign="top">Articles written in English.</td>
<td align="left" valign="top">All articles not written in English.</td>
</tr>
<tr>
<td align="left" valign="top">Type of literature/source</td>
<td align="left" valign="top">Conference, workshop, and journal articles presenting original empirical research.</td>
<td align="left" valign="top">All other articles, including grey literature. Articles that do not present empirical research.</td>
</tr>
<tr>
<td align="left" valign="top">Relevance to Research Questions</td>
<td align="left" valign="top">Relevant articles to at least one research question.</td>
<td align="left" valign="top">Articles irrelevant to all research questions</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In addition to the criteria presented in <xref ref-type="table" rid="tab2">Table 2</xref>, studies that do not directly apply formal methods to ML systems were excluded. This exclusion applies to studies that formally verify controllers or code synthesized by an ML model, or to studies focusing on autonomous systems broadly. Articles that do not mention &#x201C;formal verification&#x201D; or &#x201C;formal methods&#x201D; explicitly were removed. Articles related to cybersecurity were also excluded, as this SLR focuses on safe ML as opposed to secure ML.</p>
</sec>
<sec id="sec8">
<label>2.6</label>
<title>Quality assessment checklist</title>
<p>After the initial articles were selected using the inclusion and exclusion criteria on the full list, the quality assessment checklist, shown in <xref ref-type="table" rid="tab3">Table 3</xref>, was used primarily to characterize the quality of the articles and help determine future research directions. Articles were not excluded based on the quality assessment criteria, as this work is primarily interested in the ideas and research directions within safe ML through formal methods. Additionally, due to the limited nature of work in this area, it was counterproductive to further exclude articles outside of the inclusion and exclusion criteria. The questions are adapted from Zhou et al. and utilize the Reporting, Rigor, Credibility, and Relevance criteria (<xref ref-type="bibr" rid="ref64">Zhou et al., 2015</xref>). For each article, each question received a score of either 0 &#x2013; &#x201C;No,&#x201D; 0.5 &#x2013; &#x201C;Partially,&#x201D; or 1 &#x2013; &#x201C;Yes.&#x201D;</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Quality assessment checklist.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">ID</th>
<th align="center" valign="top">Quality assessment question</th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td align="left" valign="top"><italic>Reporting</italic></td>
</tr>
<tr>
<td align="left" valign="top">1</td>
<td align="left" valign="top">Do the authors clearly state the aims (goals, purpose, problems, motivations, objectives, questions) of the research?</td>
</tr>
<tr>
<td align="left" valign="top">2</td>
<td align="left" valign="top">Does the study clearly answer the research question(s) or clearly present the results?</td>
</tr>
<tr>
<td/>
<td align="left" valign="top"><italic>Rigor</italic></td>
</tr>
<tr>
<td align="left" valign="top">3</td>
<td align="left" valign="top">Are the methods used in the research clearly stated and fully defined?</td>
</tr>
<tr>
<td align="left" valign="top">4</td>
<td align="left" valign="top">Was the data collection method sufficiently rigorous and clearly described?</td>
</tr>
<tr>
<td align="left" valign="top">5</td>
<td align="left" valign="top">Does the research design address the aims of the research?</td>
</tr>
<tr>
<td/>
<td align="left" valign="top"><italic>Credibility</italic></td>
</tr>
<tr>
<td align="left" valign="top">6</td>
<td align="left" valign="top">Does the article discuss limitations, challenges, or threats to validity?</td>
</tr>
<tr>
<td align="left" valign="top">7</td>
<td align="left" valign="top">Is the research replicable?</td>
</tr>
<tr>
<td align="left" valign="top">8</td>
<td align="left" valign="top">Are the findings credible (free from bias, reliable, trustworthy)?</td>
</tr>
<tr>
<td/>
<td align="left" valign="top"><italic>Relevance</italic></td>
</tr>
<tr>
<td align="left" valign="top">9</td>
<td align="left" valign="top">Is the study relevant or of value to the research community?</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec9">
<label>2.7</label>
<title>Data extraction form</title>
<p>After obtaining the final set of research articles, data was systematically extracted for analysis using the data extraction form presented in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>. The data extraction form was generated using guidelines from <xref ref-type="bibr" rid="ref28">Kitchenham and Charters (2007)</xref>.</p>
<p>Note that fields related to experimental design and results were not directly used to answer the RQs. Extracting information about each experiment was used to validate the significance of the authors&#x2019; findings and therefore the relevance of the proposed method. As a result, these fields primarily assisted with the quality assessment of each study. Experimental data was also used to help determine the application area or domain when answering RQ2.</p>
</sec>
<sec id="sec10">
<label>2.8</label>
<title>Data analysis and synthesis</title>
<p>After extracting data using the data extraction form, the data was analyzed and synthesized through two methods. First, a bibliometric analysis was performed on the data to determine bibliometric information such as which conferences or journals produce the most research in this area, the top three countries where research in this area is being conducted, and the citation counts for the different articles.</p>
<p>Next, the technical content from the articles was analyzed from the extracted data to answer the RQs. This was accomplished by comparing the responses to each data item across articles to determine trends, application areas, challenges, and future research directions. The findings are then synthesized to answer the RQs in Sections 4 and 5. Data is presented both quantitatively and qualitatively. Separately, an analysis of the quality of the articles is presented after executing the quality assessment checklist on each article.</p>
</sec>
</sec>
<sec id="sec11">
<label>3</label>
<title>Bibliometric results</title>
<p>When querying the respective databases with the search string, the number of papers listed were 215 for IEEE Xplore, 300 for ACM Digital Library, 266 for Science Direct, and 135 for Springer. This resulted in a total of 916 papers. The addition of the title search term for Springer narrowed results down considerably from 705 initial articles from Springer alone. The advanced search feature was used in each digital library to filter by year and for research articles. After the initial selection of papers, abstracts and titles were read to determine relevance to the RQs. As a result, the number of articles ultimately selected for the SLR were 11 from IEEE Xplore, 8 from ACM Digital Library, 5 from Science Direct, and 22 from Springer. The distribution of papers by digital library source is shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Distribution of selected articles from digital library sources.</p>
</caption>
<graphic xlink:href="frai-09-1749956-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Pie chart showing distribution of sources: Springer represents forty-eight percent, IEEE Xplore twenty-four percent, ACM Digital Library seventeen percent, and Science Direct eleven percent. Each section is distinctly colored.</alt-text>
</graphic>
</fig>
<p>The distribution of selected articles that were published as conference papers, journal articles, or workshop papers is shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>. Most reviewed articles were conference papers belonging to 26 distinct international conferences. The largest number of conference papers came from Bridging the Gap between AI and Reality, published on Springer. Journal articles similarly came from a variety of journals, with no single journal producing multiple articles used in this SLR. Only two workshop articles were selected for review, and both were published as part of the International Workshop on Verification and Monitoring at Runtime Execution (VORTEX) proceedings, published on ACM.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Distribution of articles selected as conference, journal, or workshop articles.</p>
</caption>
<graphic xlink:href="frai-09-1749956-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Pie chart comparing publication types, showing seventy-eight percent in blue for conference, eighteen percent in orange for journal, and four percent in gray for workshop.</alt-text>
</graphic>
</fig>
<p>The countries of origin of the selected articles are shown in <xref ref-type="fig" rid="fig3">Figure 3</xref>. The United States of America and United Kingdom are shortened to USA and UK, respectively. The top three countries from which the largest number of articles originated are the USA, Germany, and China. Furthermore, <xref ref-type="fig" rid="fig4">Figure 4</xref> displays the distribution of articles by year. Interestingly, the number of articles on the subject of safe ML through formal methods has remained steady over the last five years, with the largest number of articles published in 2023 and the lowest number published in 2020. It is noteworthy that a significant number of the articles were published in 2025, as only the first six months of 2025 were included in the search. Lastly, the distribution of citation counts for the selected articles is shown in <xref ref-type="fig" rid="fig5">Figure 5</xref>. Because many of the articles are new, i.e., published within the last two years, there are many articles with zero citations.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Distribution of selected articles by country.</p>
</caption>
<graphic xlink:href="frai-09-1749956-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart showing the count per country, with the USA having the highest count at twelve, followed by Germany at eight, China at seven, and other countries decreasing to one each.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Distribution of selected articles by year.</p>
</caption>
<graphic xlink:href="frai-09-1749956-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart showing number of articles by year from 2020 to 2025. There were two articles from 2020, eight articles from 2021, seven articles from 2022, sixteen articles from 2023, five articles from 2024, and eight articles from 2025.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Distribution of citation counts for the selected articles.</p>
</caption>
<graphic xlink:href="frai-09-1749956-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart showing the count of occurrences for various numbers of citations, with the majority having zero or one citation, and counts decreasing as the number of citations increases, except for minor peaks at nineteen and sixty-six citations.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec12">
<label>4</label>
<title>Technical results</title>
<p><xref ref-type="supplementary-material" rid="SM1">Supplementary Table 2</xref> summarizes the articles discussed throughout the rest of this work. Articles are referred to by their Identifications (IDs), where &#x201C;S&#x201D; stands for &#x201C;Study.&#x201D; After data extraction, findings from the studies were grouped into the following eight categories, which are used to answer RQ1 in the rest of this section. While some articles fit under multiple categories, each article was classified under the most applicable category. The eight categories of formal methods identified to ensure the safety of ML systems are:</p>
<list list-type="order">
<list-item>
<p>Reachability and Over-Approximation Techniques</p>
</list-item>
<list-item>
<p>SMT-based Verification and Abstraction/Refinement</p>
</list-item>
<list-item>
<p>Mixed Integer Linear Programming (MILP)/Integer Linear Programming (ILP) Approaches</p>
</list-item>
<list-item>
<p>Model Checking Approaches</p>
</list-item>
<list-item>
<p>Runtime Verification Approaches</p>
</list-item>
<list-item>
<p>Shielding Techniques</p>
</list-item>
<list-item>
<p>Control Barrier Function Methods</p>
</list-item>
<list-item>
<p>Risk Verification Methods</p>
</list-item>
</list>
<sec id="sec13">
<label>4.1</label>
<title>Reachability and over-approximation techniques</title>
<p>Reachability analysis for neural networks computes the set of all outputs the network can produce when its inputs&#x2014;and, if modeled, its parameters (weights and biases)&#x2014;vary within specified bounds. This technique propagates sets (e.g., represented by intervals or star sets) through layers or time steps; some solutions compute a sound over-approximation that provably contains every true output, while other architectures admit exact set propagation (<xref ref-type="bibr" rid="ref33">Meyer, 2023</xref>; <xref ref-type="bibr" rid="ref12">Choi et al., 2025</xref>).</p>
<p>S22 extends the star-set reachability framework to RNNs to aid in formally verifying the robustness of RNNs (<xref ref-type="bibr" rid="ref12">Choi et al., 2025</xref>). The authors develop both exact and over-approximate reachability algorithms that incrementally unroll recurrent layers, encode dependencies between current and past hidden states via Minkowski sum of star sets, and offer soundness and completeness guarantees. Unrolling refers to the technique of expanding the recurrent network across a chosen number of time steps by creating a separate copy of the recurrent layer for each time step. This transformation converts the RNN&#x2019;s cyclic, time-dependent structure into a deep, acyclic Feed-Forward Neural Network (FFNN) at the cost of producing a much larger network (<xref ref-type="bibr" rid="ref12">Choi et al., 2025</xref>).</p>
<p>S23 proposes a mixed-monotonicity-based reachability method for computing interval over-approximations of a NN&#x2019;s output when both its inputs and its parameters lie within known bounds (<xref ref-type="bibr" rid="ref33">Meyer, 2023</xref>). In this approach, the network&#x2019;s partial networks are treated as static functions whose derivatives are known to lie within a certain bound. For each output neuron, a worst-case input corner is chosen based on whether each slope is positive or negative. The input corners are then used to compute tight upper and lower output bounds. This approach handles uncertainties by treating both input values and all weights as uncertain within given intervals and uses interval arithmetic on the network&#x2019;s Jacobian to propagate these uncertainties layer by layer. Rather than completing this step once, the method is applied to every contiguous block of layers (i.e., all partial networks). These interval bounds are then intersected to result in a tighter overall output range. This approach is applicable to any Lipschitz-continuous activation function (<xref ref-type="bibr" rid="ref33">Meyer, 2023</xref>).</p>
<p>On the other hand, S28 presents a high-parallelization framework for formally verifying feed-forward DNNs under bounded input uncertainty by computing interval over-approximations of layer outputs (<xref ref-type="bibr" rid="ref23">Hafaiedh et al., 2025</xref>). Interval over-approximation is a static analysis technique that computes, for each neuron in the network, a conservative lower-upper bound on its activation by propagating input uncertainty through the network. The framework checks if, for all inputs satisfying some precondition, the DNN&#x2019;s outputs satisfy a specified post-condition. The method runs several incomplete verifiers in parallel, and for each neuron takes the intersection of their interval bounds to refine the layer&#x2019;s over-approximation before feeding it forward (<xref ref-type="bibr" rid="ref23">Hafaiedh et al., 2025</xref>).</p>
<p>S12, S29, S31, and S34 apply reachability and over-approximation techniques to NNs for image-based applications (<xref ref-type="bibr" rid="ref43">Parameshwaran and Wang, 2025</xref>; <xref ref-type="bibr" rid="ref62">Zhong et al., 2023</xref>; <xref ref-type="bibr" rid="ref50">Tang et al., 2023</xref>; <xref ref-type="bibr" rid="ref5">Ashok et al., 2020</xref>). S12 aims to improve the scalability and efficiency of the verification problem for image-based NNs by performing formal verification, specifically linear bound propagation using &#x03B1;,&#x03B2;-CROWN, on structured input spaces within the latent space. This technique thereby reduces input dimensionality and computational complexity. Specifically, the authors propose the Scalable and Interpretable Verification of Image-Based Neural Network Controllers (SEVIN). The method learns a structured latent representation of the controller&#x2019;s input space, thereby decreasing the computational complexity of the verification task and making formal verification more scalable (<xref ref-type="bibr" rid="ref43">Parameshwaran and Wang, 2025</xref>).</p>
<p>Similarly, S29 introduces the Abstract Refinement Enhancer for Neural network verificAtion (ARENA), which constructs a linear constraint encoding of a NN&#x2019;s behavior using abstract interpretation bounds, then iteratively refines that encoding. ARENA targets potential violations of robustness and proves whether the regions are spurious (<xref ref-type="bibr" rid="ref62">Zhong et al., 2023</xref>). S31 introduces the tool Fast Grouping for Multi-neuron Relaxation (FaGMR), a verifier that encodes the network and an input-perturbation region into linear constraints. The authors also address verifying robustness properties. Nonlinear activations are over-approximated by convex constraints, then a Linear Programming (LP) solver checks whether the predicted class can change within the perturbation region (<xref ref-type="bibr" rid="ref50">Tang et al., 2023</xref>). S34 proposes DeepAbstract, an abstraction-based method that clusters neurons within each hidden layer based on empirical behavior (<xref ref-type="bibr" rid="ref5">Ashok et al., 2020</xref>). Each neuron is represented by its vector of activation values over a chosen input set, then k-means groups neurons with similar activation vectors. Neurons in each cluster are merged, yielding a smaller abstract network. Verification is then run on the abstract network.</p>
<p>On the other hand, S30 and S43 focus specifically on Semantic Segmentation NNs (<xref ref-type="bibr" rid="ref41">Pal et al., 2023</xref>; <xref ref-type="bibr" rid="ref53">Tran et al., 2021</xref>). S30 performs set-based reachability analysis for neural networks, specifically star-set reachability using the &#x201C;approx-star&#x201D; method (<xref ref-type="bibr" rid="ref41">Pal et al., 2023</xref>). The work presents a standardized benchmark intended to support fair and repeatable comparisons of verification approaches for semantic segmentation. S43 performs reachability analysis on Semantic Segmentation NNs, using ImageStar set representations and LP optimization to compute bounds needed for over-approximation (<xref ref-type="bibr" rid="ref53">Tran et al., 2021</xref>). The authors introduce a relaxed reachability variant for Rectified Linear Unit (ReLU) layers and pixel-classification reasoning, which is controlled by a relaxation factor that reduces how many LP problems are solved.</p>
<p>S37 focuses specifically on reachability analysis and abstract interpretation for CNNs (<xref ref-type="bibr" rid="ref27">Kirov et al., 2023</xref>). The CNN&#x2019;s safety-relevant requirements are formalized as mathematical properties over bounded input perturbation sets and bounded trajectory modification sets. For each property, the verifier over-approximates the CNN&#x2019;s reachable output set over the constrained input set and checks whether the output set violates the output bound. When over-approximation yields &#x201C;unknown,&#x201D; randomized simulation searches for an explicit violating example.</p>
<p>S41 instead explores safety assurance when quantizing DNNs for use on resource-constrained devices (<xref ref-type="bibr" rid="ref58">Zhang et al., 2023</xref>). The authors use both Differential Reachability Analysis (DRA) and MILP. Given a DNN, its corresponding fully quantized QNN, an input region, and an allowed error bound, the method verifies whether the maximum output deviation between the DNN and QNN is always less than the bound for all inputs in the region. DRA attempts to prove the bound quickly; if it cannot, the MILP encoding is used to decide the property exactly.</p>
<p>Lastly, S42 applies star-based reachability analysis to DNNs used for time-series regression tasks (<xref ref-type="bibr" rid="ref42">Pal et al., 2023</xref>). The framework encodes bounded sensor noise as a set of possible inputs and then propagates that set through the network layer-by-layer to obtain an output reachable set. The reachable output bounds are then compared to permissible bounds to determine robustness at each time step and across a sequence.</p>
</sec>
<sec id="sec14">
<label>4.2</label>
<title>SMT-based verification and abstraction/refinement</title>
<p>SMT-based verification for NNs encodes both the neural network&#x2019;s computation and the safety or robustness property as logical formulas in rich theories (e.g., linear real arithmetic or bit-vectors) and then utilizes optimized SMT solvers to check for violations or to prove the absence of counterexamples (<xref ref-type="bibr" rid="ref15">Das et al., 2025</xref>; <xref ref-type="bibr" rid="ref6">Bachiri et al., 2025</xref>). Abstraction/Refinement (AR) verifies a DNN by first constructing a smaller over-approximating model whose correctness implies correctness of the original. If the abstract query is UNSAT, the original is UNSAT; if it is SAT, the produced counterexample is checked on the original network and refinement is applied only for spurious counterexamples or otherwise inconclusive abstract results. By iteratively refining the abstraction, AR trades precision for solver performance and can be substantially faster than verifying the full network directly (<xref ref-type="bibr" rid="ref17">Elboher et al., 2024</xref>).</p>
<p>S3, S4, S27, S32, and S35 specifically apply SMT-based verification to DNNs (<xref ref-type="bibr" rid="ref40">Nuhu et al., 2022</xref>; <xref ref-type="bibr" rid="ref47">Samadi et al., 2024</xref>; <xref ref-type="bibr" rid="ref17">Elboher et al., 2024</xref>; <xref ref-type="bibr" rid="ref60">Zhao et al., 2022</xref>; <xref ref-type="bibr" rid="ref44">Paterson et al., 2021</xref>). After determining candidate unsafe input subregions (i.e., candidate unsafe sub-requirements), S3 utilizes the Marabou framework to validate the unsafe input space on a DNN (<xref ref-type="bibr" rid="ref40">Nuhu et al., 2022</xref>). Marabou, an SMT-based framework for verifying DNNs, validates whether the input subregion violates the safety property. The Negative Selection Algorithm, a meta-heuristic algorithm, is used to search for candidate unsafe input subregions. Next, the subregions are fed into Marabou to formally verify that the corresponding safety property is violated, thereby validating the unsafe subregion and unsafe sub-requirement.</p>
<p>Similarly, S4 uses Marabou to systematically insert perturbations into the DNN under test and determine which safety properties are violated as a result (<xref ref-type="bibr" rid="ref47">Samadi et al., 2024</xref>). The authors propose utilizing Marabou to analyze the resilience of a DNN to input perturbations. Specifically, the study aims to determine the threshold weights that result in property violations and focuses on Single Event Upsets (SEUs) and Multi-Bit Upsets (MBUs). An SEU is a single bit flip within the memory&#x2019;s stored data, which impacts the weights of the DNN parameters. SEUs are performed by randomly selecting a weight within the DNN and flipping a bit in its binary representation. An MBU occurs when multiple bits are flipped in the system memory or processor of the DNN due to multiple SEUs in the network parameters. The proposed framework simulates both SEUs and MBUs within DNN network parameters. The authors then analyze the resulting effect on DNN performance and adherence to properties. The output of the DNN is compared with the anticipated output from the training data.</p>
<p>S27 focuses on AR for SMT-solving by introducing residual reasoning, a scheme that captures and reuses information (e.g., already-proven safe regions of the search tree) across successive AR iterations, thereby pruning redundant work and accelerating verification of DNNs (<xref ref-type="bibr" rid="ref17">Elboher et al., 2024</xref>). The authors contribute a formal, sound, and complete residual-reasoning framework for DNN verification as well as a detailed design for extending the Marabou verifier to support residual reasoning. <xref ref-type="fig" rid="fig6">Figure 6</xref> displays the verification loop described by S27, where &#x0393; is a context formula encoding scenarios already shown safe.</p>
<p>S32 applies Counterexample-Guided Abstraction Refinement (CEGAR) to DNN verification to determine whether a DNN satisfies specified input/output safety properties formulated as verification queries (<xref ref-type="bibr" rid="ref60">Zhao et al., 2022</xref>). The framework involves building an over-approximated abstract network of the DNN and iterating AR using counterexamples. Backend formal verification is accomplished using Marabou. S35 introduces DeepCert, which performs constraint-based formal verification of DNN robustness using Marabou (<xref ref-type="bibr" rid="ref44">Paterson et al., 2021</xref>). DeepCert encodes a perturbation to an image as constraints linking original pixels, perturbed inputs, and a perturbation bound, then determines whether a misclassification is possible within that bound.</p>
<p>On the other hand, S19 focuses on verifying the robustness of SDNNs in safety-critical applications against adversarial attacks using formal verification (<xref ref-type="bibr" rid="ref15">Das et al., 2025</xref>). The authors specifically explore modeling and verification using SMT constraints. The framework receives two inputs: the SDNN and a property for verification. The formal verification framework then rigorously analyzes the SDNN using SMT-solving and either declares property adherence or presents a counterexample. The authors also present an interval bound derivation to improve the performance of the SMT solution further.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Verification loop as described in S27 (<xref ref-type="bibr" rid="ref17">Elboher et al., 2024</xref>).</p>
</caption>
<graphic xlink:href="frai-09-1749956-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating an iterative network verification process: abstract the original network, verify the abstract network and skip checked subtrees, then either finish if UNSAT or SAT, or refine and update if there is a spurious SAT, repeating as needed.</alt-text>
</graphic>
</fig>
<p>S20 presents an SMT-based verification of QNNs by approximating QNN values with rational-number versions (<xref ref-type="bibr" rid="ref6">Bachiri et al., 2025</xref>). The authors combine rational approximations using set theory with SMT-based verification over rational arithmetic programs. The authors&#x2019; approach consists of four steps:</p>
<list list-type="order">
<list-item>
<p>Convert natural language descriptions of how an autonomous vehicle controller should behave into formal SMT predicates. This is done by first converting textual requirements into abstract scenarios that describe the intended autonomous driving behaviors. Then, these abstract scenarios are translated into logical scenarios from which formal decision properties can be specified.</p>
</list-item>
<list-item>
<p>Approximate the fixed-point QNN with a rational-number version by replacing every weight, bias, and operation with its exact rational-number counterpart. This supports SMT solving, as fixed-point mathematics is computationally infeasible for SMT solving at scale.</p>
</list-item>
<list-item>
<p>Verify the safety property over the rational network using SMT-LIB. If the solver returns UNSAT, no counterexample exists. If the solver returns SAT, there is an input violating the property, and the check stops.</p>
</list-item>
<list-item>
<p>Generalize the results back to the original QNN by accounting for quantization error that the fixed-point QNN experiences during arithmetic operations (<xref ref-type="bibr" rid="ref6">Bachiri et al., 2025</xref>).</p>
</list-item>
</list>
<p>On a separate note, S33 models a NN controller as a transition predicate and utilizes an inductive invariant method for proving time-unbounded safety (<xref ref-type="bibr" rid="ref63">Zhou and Tripakis, 2024</xref>). The framework decomposes the verification step so that the NN-related implication is handled using an NN verification engine and the environment-related implication is handled using an SMT solver.</p>
<p>S39 formally verifies an RL-trained controller for vertical collision avoidance (<xref ref-type="bibr" rid="ref19">Genin et al., 2021</xref>). The authors compute conservative safe command ranges for states in a critical region, discretize the state space into cubes and over-approximate the safe bounds per cube, then use an SMT solver to prove whether the NN&#x2019;s command acceleration is always within safe bounds for each critical cube. The result is a classification of cubes as verified safe versus potentially unsafe.</p>
<p>Focusing on a different area of ML research, S45 applies SMT solving to Spiking NNs, which are NNs that process information via discrete &#x201C;spikes&#x201D; over time (similar to human brains) (<xref ref-type="bibr" rid="ref7">Banerjee et al., 2023</xref>). The framework translates the step-by-step execution of a Spiking NN into SMT constraints, where Boolean variables represent whether each neuron spikes at each timestep, and real variables represent each neuron&#x2019;s instant potential and stored potential. Properties are then verified by checking satisfiability of a combined constraint system representing the Spiking NN behavior, input constraints, and violation of the desired output condition. If satisfiable, the solver provides a counterexample input spike train. If unsatisfiable, the property holds for all allowed inputs.</p>
<p>Lastly, S46 focuses specifically on SMT solving for Long Short-Term Memory (LSTM) networks, which are a type of RNN designed for sequential and time-series data (<xref ref-type="bibr" rid="ref36">Moradkhani et al., 2023</xref>). The authors train an LSTM model, extract learned parameters, and encode the LSTM&#x2019;s step-to-step update equations as an SMT constraint system. They then use an SMT solver with Bounded Model Checking capabilities to determine whether the LSTM can violate the intended safety specification.</p>
</sec>
<sec id="sec15">
<label>4.3</label>
<title>MILP/ILP approaches</title>
<p>MILP/ILP verification for NNs encodes a NN and a safety specification over a bounded input set into a set of linear and integer constraints/inequalities. Integer (binary) variables capture discrete choices introduced by activations and quantization; the solver asks a feasibility question: &#x201C;Is there an input in this set that violates the property?&#x201D; If the solver finds one, that assignment is a concrete counterexample; if none exists, the property is certified for that region. MILP mixes continuous and integer variables, whereas ILP uses only integers (<xref ref-type="bibr" rid="ref59">Zhang et al., 2022</xref>; <xref ref-type="bibr" rid="ref35">Mistry et al., 2022</xref>).</p>
<p>S5 presents an approach in which fixed-point primitives are encoded as operations in MILP to allow the authors to apply MILP solving to QNNs (<xref ref-type="bibr" rid="ref35">Mistry et al., 2022</xref>). Otherwise, MILP cannot be used directly on QNNs. The verification problem is converted from a validity problem to a satisfiability problem, which allows the MILP solver to determine satisfiability (with a produced counterexample) or unsatisfiability (indicating a successful verification or valid input formula). Rather than being used for optimization, the MILP solver is used to check the feasibility of the equations. Similarly, S14 presents an encoding method for QNNs that reduces the verification problem to ILP (<xref ref-type="bibr" rid="ref59">Zhang et al., 2022</xref>). The authors introduce piecewise constant functions for the encoding of QNN activation functions, which are then further encoded as integer linear constraints using additional Boolean variables. Both S5 and S14 use Gurobi as the backend MILP/ILP solver.</p>
<p>Separately, S36 applies MILP to ReLU NNs to verify monotonicity properties for a safety-critical avionics system (<xref ref-type="bibr" rid="ref55">Vidot et al., 2022</xref>). A model is monotone if an input change in a specified direction results in an output moving in only one direction. The authors encode the ReLU NN and the monotonicity requirement into MILP constraints. They then check monotonicity over many paired input sub-spaces. By running MILP feasibility checks on each sub-space, they classify regions where monotonicity holds, fails everywhere, or partially fails.</p>
<p>Lastly, S6 combines polynomial inclusion computation with barrier certificate generation to formally verify synthesized DNNs from RL using linear programming solvers (<xref ref-type="bibr" rid="ref61">Zhao et al., 2023</xref>). The authors abstract a DNN controller as a polynomial using Bernstein polynomials to facilitate verification by polynomial inclusion. Safety adherence is determined by the existence of barrier certificates under the abstract controller, and Sum-of-Squares optimization is used to search for these barrier certificates. If a barrier certificate is found, adherence to the corresponding safety property is determined for all executions of the original DNN controller.</p>
</sec>
<sec id="sec16">
<label>4.4</label>
<title>Model checking approaches</title>
<p>Within formal methods broadly, Model Checking provides an automated framework to verify that system designs meet desired specifications. An abstract model <italic>M</italic> of the system under verification is constructed using finite state automata. Programs are modeled as transition systems, represented using nodes, variables, and transitions. A set of correctness and safety formulas <italic>F</italic> is defined using a logic formalism. The formulas describe the required correctness properties of the system. Next, the state space of <italic>M</italic> is systematically explored to check if <italic>F</italic> is consistently satisfied. If a property is violated, the model checker provides a counterexample demonstrating where the violation occurs (<xref ref-type="bibr" rid="ref45">Paul et al., 2023</xref>; <xref ref-type="bibr" rid="ref25">Jhala and Majumdar, 2009</xref>).</p>
<p>S2 applies Symbolic Model Checking to NNs by formally modeling a trained NN and verifying the model to estimate robustness to noise (<xref ref-type="bibr" rid="ref37">Naseer et al., 2020</xref>). The study introduces the Formal Analysis of Neural Network (FANNet), which is composed of three main procedures. The first procedure is behavior extraction, in which the formal model of the NN is built using the weights and activations of the network on known test samples. The temporal properties are translated into the logical language of the model checker. The second procedure is noise tolerance analysis, in which the specific noise tolerance for the NN is algorithmically determined. The last procedure is the adversarial noise vector extraction, in which a unique array of noise patterns that the NN is sensitive to is built (<xref ref-type="bibr" rid="ref37">Naseer et al., 2020</xref>). S9 applies Bounded Model Checking to a simplified version of a DNN modeling adaptive cruise control behavior (<xref ref-type="bibr" rid="ref38">Nenchev, 2025</xref>). However, the model checked may not describe all behaviors of the actual system, therefore introducing potential gaps in the evaluation. The proposed approach should therefore be used alongside other verification techniques.</p>
<p>Both S1 and S44 apply Model Checking to RL-based systems (<xref ref-type="bibr" rid="ref51">Tao et al., 2025</xref>; <xref ref-type="bibr" rid="ref2">Adelt et al., 2023</xref>). S1 proposes a framework for formally specifying requirements, constructing an abstract model, and model checking an RL-based system (<xref ref-type="bibr" rid="ref51">Tao et al., 2025</xref>). The study provides specification templates for formally specifying requirements of RL-based systems. A model construction process for generating the abstract model is then presented. Finally, the authors present Reinforcement Learning Verification-as-a-Service (ReLVaaS), a framework allowing users to specify formal requirements, construct the abstract models, and perform model checking using PRISM and Storm. On the other hand, S44 combines shielding with Statistical Model Checking-based learning for an RL agent (<xref ref-type="bibr" rid="ref2">Adelt et al., 2023</xref>). The RL agent is constrained by a verified shield, and the authors prove, in Differential Dynamic Logic (dL) and KeYmaera X, that actions satisfying the agent&#x2019;s contracts preserve safety and resilience. The framework then enforces, at runtime, that the agent can only choose those safe actions. Statistical Model Checking is used both during training and for statistical evaluation to compute confidence intervals of property satisfaction.</p>
<p>S26 introduces Deep Statistical Model Checking (DSMC), a framework that treats a trained neural network as an oracle to resolve nondeterminism in a Markov Decision Process (MDP), yielding a Markov chain whose behavior can be assessed via Statistical Model Checking (<xref ref-type="bibr" rid="ref20">Gros et al., 2022</xref>). DSMC treats a trained neural network as a black-box oracle that resolves nondeterministic choices in a formally specified MDP. Whenever the MDP reaches a decision point, it queries the NN, using the current state, for the next action. The result is a fully probabilistic Markov Chain that can then be used for analysis.</p>
<p>On a separate note, S25 presents a unified Model Checking framework for ReLU RNNs that leverages a polyhedron abstraction domain and bidirectional propagation to symbolically verify both qualitative and quantitative temporal and robustness properties (<xref ref-type="bibr" rid="ref31">Liang et al., 2024</xref>). The contribution is a systematic verification framework that combines polyhedron forward propagation, dimension-preserving abstraction, and Monte Carlo sampling. Polyhedron forward propagation is used to track sets of possible RNN states. Dimension-preserving abstraction is used to curb combinatorial explosion of polyhedral vertices. Lastly, Monte Carlo sampling and backward propagation are used for quantitative estimation of satisfaction probabilities and to refine approximations.</p>
<p>Model Checking of MARL frameworks is explored by S15, which applies Model Checking to an abstract representation of a Markov game and joint policies (<xref ref-type="bibr" rid="ref16">El Mqirmi et al., 2021</xref>). The study presents Assured Multi-Agent Reinforcement Learning (AMARL), a formal verification framework for MARL systems. AMARL formally guarantees the safety of agents acting in an unknown environment. The authors introduce the Abstract Markov Game (AMG) and present a procedure for automatically generating AMGs to conduct verification over.</p>
<p>S21 applies Model Checking to a framework combining Federated Learning, Genetic Algorithms (GA), and Meta-learning into a multi-layer Industrial Cyber Physical Systems (ICPS) framework named FedGA-Meta (<xref ref-type="bibr" rid="ref21">Guendouzi et al., 2025</xref>). The authors first model the ICPS. Key elements of the ICPS, such as sensors, actuators, edge/fog/cloud servers, the network links, the data flows, and the Federated Learning process itself, are formalized as a Labeled Transition System (LTS). The behavioral models are then represented as timed automata in UPPAAL, and each property is posed as a query to the UPPAAL model checker. UPPAAL then exhaustively explores every possible execution within specified timing bounds to verify that each property holds, producing a counterexample otherwise. This provides a machine-checked guarantee that the FedGA-Meta system satisfies the stated properties (<xref ref-type="bibr" rid="ref21">Guendouzi et al., 2025</xref>).</p>
<p>Lastly, S40 first utilizes active automata learning using Angluin&#x2019;s L&#x002A; algorithm to learn a minimal Deterministic Finite Automata (DFA) abstracting an RNN&#x2019;s behavior (<xref ref-type="bibr" rid="ref26">Khmelnitsky et al., 2021</xref>; <xref ref-type="bibr" rid="ref4">Angluin, 1987</xref>). Statistical Model Checking is then used to determine whether the DFA abstraction violates a specification. When violations are found, the method checks whether the RNN truly violates the property or whether the DFA abstraction inaccurately represents the RNN, refining the learned DFA accordingly (<xref ref-type="bibr" rid="ref26">Khmelnitsky et al., 2021</xref>).</p>
</sec>
<sec id="sec17">
<label>4.5</label>
<title>Runtime verification approaches</title>
<p>Broadly, Runtime Verification is a lightweight formal method applied at runtime to provide rigorous guarantees of property adherence for a system. Runtime Verification provides precise information on the runtime behavior of the monitored system at the cost of limited execution coverage. Within this method, safety properties are specified in a formal language, a monitor is generated from the formal specification, and the monitor is instrumented to extract information from the system to verify property adherence against the system trace (<xref ref-type="bibr" rid="ref8">Bartocci et al., 2018</xref>).</p>
<p>S8 combines runtime assurance with Theorem Proving and SMT-solving to formally verify an airborne collision avoidance system that utilizes an NN for decision making (<xref ref-type="bibr" rid="ref14">Cofer et al., 2022</xref>). The runtime monitor evaluates the flight plan generated by the collision avoidance system and contains a safe backup planner. The results of the evaluation are then fed into a decision logic component which selects a backup flight plan that ensures safe flight. The decision component decides on a safe flight plan based on a tabular specification of safety rules. The decision logic code is synthesized from a formal specification, and formal proofs of correctness are produced after each step during synthesis. Additionally, the runtime monitor is modeled using the Architecture Analysis and Design Language (AADL) and the study formally analyzes the monitor, using SMT-solving, against safety properties. The Assume Guarantee Reasoning Environment (AGREE) is used to analyze the runtime assurance architecture (<xref ref-type="bibr" rid="ref14">Cofer et al., 2022</xref>).</p>
<p>Similarly, S16 utilizes both Theorem Proving and runtime monitoring within a modified formally constrained RL framework (<xref ref-type="bibr" rid="ref24">Hunt et al., 2021</xref>). The study introduces Verifiably Safe Reinforcement Learning (VSRL). Rather than assuming a perfect simulator state, VSRL first trains a lightweight object detector, using only a handful of labeled examples, to extract the positions of safety-critical objects from raw Red Green Blue frames. Instead of embedding constraints directly into the RL algorithm, VSRL wraps the original environment with a safety guard. Whenever the agent proposes an unsafe action, that action is replaced at runtime by a uniformly sampled safe one. The authors show that this method produces a refined MDP in which all actions are safe and whose optimal policies exactly correspond to the best safe policies in the original problem. The authors also use dL to model the environment and controller as hybrid programs. The safety properties are then specified and proven in the dL proof calculus, thus providing a formal certificate of safety from the starting safe state (<xref ref-type="bibr" rid="ref24">Hunt et al., 2021</xref>).</p>
<p>S38 also focuses on RL-based systems and formally verifies an RL agent implemented via MATLAB Simulink&#x2019;s RL toolbox (<xref ref-type="bibr" rid="ref3">Adelt et al., 2021</xref>). The RL component&#x2019;s acceptable behavior is specified as a hybrid contract in dL, constraining which actions are permitted in which observed states. The Simulink model is transformed into a dL hybrid program where the RL agent is represented as a monitored nondeterministic choice over safe actions consistent with the contract. Safety properties are proven deductively for the entire closed-loop system, and runtime monitors ensure the trained agent&#x2019;s executed actions satisfy the contract during simulation and training.</p>
<p>S10 performs runtime monitoring of multiple parallel ML models for Over-The-Air (OTA) updates in CPSs (<xref ref-type="bibr" rid="ref22">Guissouma et al., 2023</xref>). An initial ML model, such as an Artificial Neural Network (ANN), is first deployed after training and the first validation phase. While operating, new data may be added to the original training base, requiring an updated ML model to be sent OTA to the vehicle. As a result, a new version of the ML model is retrained. Further, multiple models may be retrained with different configurations under the new training data. These newly trained versions are then deployed to the vehicle as Shadow Versions (SVs), where each model runs in parallel to the Active Version (AV). While all versions are running, online monitoring to ensure safety is simultaneously performed on all models, and safety properties are expressed in STL. The data during monitoring is collected and analyzed, and when one SV is found to be more robust or safe than the AV, the AV is replaced with that SV. The AV serves as the baseline for future updates (<xref ref-type="bibr" rid="ref22">Guissouma et al., 2023</xref>).</p>
<p>On the other hand, S11 incorporates formal symbolic reasoning into the construction of runtime monitors for DNNs (<xref ref-type="bibr" rid="ref11">Cheng, 2021</xref>). The monitors are based on building an abstraction out of neuron activation patterns from the training data. In order to accomplish this, a set of neurons to be monitored is selected. Then, feature vectors are formed by taking the values of all monitored neurons for each input into the DNN. The monitor is constructed algorithmically by building a compact set representation that contains all feature vectors. The result is a sound guarantee, in that a warning over an input means that there is no close input within the training dataset. However, one issue with this approach is that in real-world deployment, the method can yield many false alarms. As a result, the study proposes a different monitor construction algorithm with robustness guarantees (<xref ref-type="bibr" rid="ref11">Cheng, 2021</xref>).</p>
<p>On a separate note, S13 applies Runtime Verification to verify ML-driven chatbots (<xref ref-type="bibr" rid="ref18">Ferrando et al., 2023</xref>). A runtime monitor is placed outside of a chatbot environment to ensure that the interaction between the user and chatbot follows a specified interaction protocol. The study presents Runtime Verification for Rasa (RV4Rasa), a framework for the runtime verification of chatbots developed in Rasa. These chatbots can be run locally on a computer rather than solely on the cloud. However, the framework can be extended to other chatbot environments. RV4Rasa checks, based on structured data generated during the Natural Language Understanding (NLU) step, whether the user and chatbot follow a specified interaction protocol (<xref ref-type="bibr" rid="ref18">Ferrando et al., 2023</xref>).</p>
<p>Lastly, S18 develops runtime monitors analyzing the input and output channels of black-box neural networks, especially CNNs and ANNs (<xref ref-type="bibr" rid="ref54">Tripuramallu et al., 2024</xref>). Safety properties are formalized as Valued Discrete Timed Automata (VDTA), and the runtime monitors are synthesized from the VDTAs. The runtime monitor operates over the inputs into the CNN and the outputs of the CNN.</p>
</sec>
<sec id="sec18">
<label>4.6</label>
<title>Shielding techniques</title>
<p>A shield is a reactive system implemented alongside the learning agent that enforces safety properties specified in a formal language. The learning agent observes the environment and selects an appropriate action, which is then checked by the shield and corrected if the chosen action is deemed unsafe (<xref ref-type="bibr" rid="ref39">Newcomb et al., 2024</xref>). Note that shielding is also utilized in S15 and S44; however, shielding was not the primary formal technique presented in these studies and they are therefore both discussed in Section 4.4.</p>
<p>S17 presents a dynamic shielding mechanism for MARL where shields dynamically split and merge depending on agent behavior, promoting collaboration between agents to ensure safety (<xref ref-type="bibr" rid="ref57">Xiao et al., 2023</xref>). When agents are at risk of conservative behavior, such as when their shields block them from taking action due to lack of coordination, the independent shields can merge into one shield that maintains the safe behavior of the agents. When the agents move apart from each other, the shields split apart into multiple shields.</p>
<p>The authors present an algorithm for shield synthesis called K-Step Look Ahead Shields, which is a variant of traditional shield synthesis. The result is potentially improved computational efficiency when shields dynamically centralize and decentralize, as well as improved coordination among agents. The authors use Linear Temporal Logic to formalize safety specifications. They also theoretically prove that their shield synthesis technique guarantees safety (<xref ref-type="bibr" rid="ref57">Xiao et al., 2023</xref>).</p>
</sec>
<sec id="sec19">
<label>4.7</label>
<title>Control barrier function methods</title>
<p>Control Barrier Function (CBF) methods assign a scalar safety score to each state. The safe set is then the super-level set containing all states with scores above a safety threshold, e.g., zero. A function is a CBF if, for every safe state, there exists at least one action that guarantees the next state remains in the safe set. Controllers can then use this condition as a safety filter to choose actions that keep trajectories inside the safe set. S7 reduces a learned Value Function (VF) used within RL into a CBF and verifies that the CBF is valid. If the CBF is valid, the CBF may be used as a formal safety certificate for the RL policy. Experiments validate that learned CBFs as safety filters or certificates are feasible through RL (<xref ref-type="bibr" rid="ref49">Tan et al., 2024</xref>).</p>
</sec>
<sec id="sec20">
<label>4.8</label>
<title>Risk verification methods</title>
<p>Risk verification is a data-driven formal verification approach that, from system execution traces, computes high-confidence bounds on statistical risk metrics over the distribution of robustness values for a given formal specification. The method therefore quantifies the tail risk&#x2014;the risk from the extreme end of the robustness distribution&#x2014;and the severity of potential violations (<xref ref-type="bibr" rid="ref13">Cleaveland et al., 2022</xref>).</p>
<p>S24 applies risk estimation to NN controllers (<xref ref-type="bibr" rid="ref13">Cleaveland et al., 2022</xref>). Safety requirements are formalized as either simple state constraints or STL formulas. The study estimates, from trajectory data, the risk that an NN-controlled stochastic system will violate a formal specification. Then, the study characterizes how that risk changes when the system is perturbed (e.g., due to environmental changes or modeling errors), by deriving bounds based on measures of system closeness. The steps of the verification framework are as follows:</p>
<list list-type="order">
<list-item>
<p>Define a robustness score for each run. For any simulated trajectory of the system under the NN controller, compute the distance from violating a safety rule at every time step. Next, take the minimum of those distances over time as the robustness of that run.</p>
</list-item>
<list-item>
<p>Treat robustness as a random variable. Because the system is stochastic, a distribution of robustness values is obtained by running many simulations.</p>
</list-item>
<list-item>
<p>Choose a risk metric. Rather than caring only about average performance, pick a measure such as Value-at-Risk (VaR) or Conditional VaR to focus on the worst-case tail of the distribution. Estimate an upper bound on that risk metric from the finite sample, with high confidence, using concentration inequalities.</p>
</list-item>
<list-item>
<p>Bound the risk under model mismatch. Real systems or richer simulators may differ from the model. The authors show how to upper-bound the increase in risk when moving from the model to the real system by quantifying how much two trajectories can diverge. This step yields a risk-verification gap, which is an explicit amount that must be added to the model&#x2019;s nominal risk bound to stay safe on the real system (<xref ref-type="bibr" rid="ref13">Cleaveland et al., 2022</xref>).</p>
</list-item>
</list>
</sec>
<sec id="sec21">
<label>4.9</label>
<title>Comparison with traditional verification approaches</title>
<p>This section answers the second half of RQ1 by detailing how the described formal methods improve upon traditional verification approaches. S1-2, S4, S8-10, S12, S14, S16, S23, S26, S28, S31-32, S36-37, S41 and S43 briefly discuss limitations to traditional verification that warrant the investigation of formal methods for providing correctness guarantees on ML systems. The evidence from these studies supports deeper studies into formal methods as opposed to traditional methods for enforcing safe ML.</p>
<p>S1 briefly discusses how traditional verification techniques on RL-based systems only provide statistical guarantees at most, and more rigorous techniques, such as formal methods, are therefore necessary to ensure the trustworthiness of RL-based systems (<xref ref-type="bibr" rid="ref51">Tao et al., 2025</xref>). Also related to RL-based systems, S16 mentions that providing a purely statistical guarantee that an RL agent remains within safe states requires an infeasible amount of training data (<xref ref-type="bibr" rid="ref24">Hunt et al., 2021</xref>).</p>
<p>Regarding NNs and DNNs, S23 also discusses statistical testing and explains how statistical testing of autonomous systems utilizing NNs is insufficient when verifying the safety of these systems (<xref ref-type="bibr" rid="ref33">Meyer, 2023</xref>). S2 explains that testing an NN on a complete input set is impossible as the input set is often infinite. Therefore, testing is insufficient when verifying an NN (<xref ref-type="bibr" rid="ref37">Naseer et al., 2020</xref>). Similarly, S4 mentions that traditional verification techniques on DNNs, such as simulations or random noise analysis, are insufficient for providing formal guarantees against errors. Therefore, traditional methods cannot capture the complete set of potential faults, which is necessary for safety critical applications (<xref ref-type="bibr" rid="ref47">Samadi et al., 2024</xref>). S8 explains that because the behavior of NNs is largely attributed to their training data, it is generally impossible to determine the correctness of an NN through white-box methods or design analysis (<xref ref-type="bibr" rid="ref14">Cofer et al., 2022</xref>). S9 describes that traditional verification, such as simulation testing, is non-exhaustive by nature. Therefore, incorporating formal methods into the verification process will help obtain additional completeness guarantees of autonomous software (<xref ref-type="bibr" rid="ref38">Nenchev, 2025</xref>).</p>
<p>On a similar note, S10 mentions that simulation-based verification methods are insufficient for verifying real-time systems, and as a result, some software errors only become apparent after deployment (<xref ref-type="bibr" rid="ref22">Guissouma et al., 2023</xref>). S14, S28, and S41 explain that traditional verification techniques, i.e., testing, for DNNs are useful for finding input samples that lead to incorrect behavior. However, these techniques cannot prove the absence of input samples leading to incorrect behavior (<xref ref-type="bibr" rid="ref59">Zhang et al., 2022</xref>; <xref ref-type="bibr" rid="ref23">Hafaiedh et al., 2025</xref>; <xref ref-type="bibr" rid="ref58">Zhang et al., 2023</xref>). S26 discusses how human inspection is not a viable verification technique to use for NNs as it is for traditional programming. The complex function representation of a NN makes it challenging to verify through mechanical analysis of important properties (<xref ref-type="bibr" rid="ref20">Gros et al., 2022</xref>).</p>
<p>Lastly, S12 argues that existing methods for verifying image-based NNs are limited by the high dimensionality and complexity of image inputs. Therefore, these methods face scalability challenges and computational inefficiencies. Additionally, traditional verification techniques often treat NNs as black-boxes, which makes it challenging to understand the decision making within each controller (<xref ref-type="bibr" rid="ref43">Parameshwaran and Wang, 2025</xref>).</p>
</sec>
</sec>
<sec id="sec22">
<label>5</label>
<title>Analysis and discussion</title>
<p>This section answers RQ2 and RQ3 in respective subsections.</p>
<sec id="sec23">
<label>5.1</label>
<title>Research question 2</title>
<p>RQ2 asks, &#x201C;in which domains or applications have these formal methods been successfully employed for safe ML?&#x201D; <xref ref-type="table" rid="tab4">Table 4</xref> presents the application areas or domains identified, the number of articles that correspond to that application/domain, and the source IDs. The largest number of articles discuss the formal verification of NN and DNN controllers, followed by RL policies and NNs/DNNs for perception.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Applications and domains of selected articles.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Application/domain</th>
<th align="center" valign="top">Number of articles</th>
<th align="center" valign="top">Sources</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">RL for 6G networks</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S1</td>
</tr>
<tr>
<td align="left" valign="top">NN on noisy inputs</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">S2, S29</td>
</tr>
<tr>
<td align="left" valign="top">Black box system verification</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S3</td>
</tr>
<tr>
<td align="left" valign="top">DNN resilience</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S4</td>
</tr>
<tr>
<td align="left" valign="top">QNNs</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">S5, S14, S20, S41</td>
</tr>
<tr>
<td align="left" valign="top">DNN controllers</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">S6, S9, S27, S28, S32</td>
</tr>
<tr>
<td align="left" valign="top">RL policies</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">S7, S16, S38, S39, S44</td>
</tr>
<tr>
<td align="left" valign="top">NN controllers</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">S8, S10, S23, S24, S26, S33</td>
</tr>
<tr>
<td align="left" valign="top">NNs/DNNs for perception</td>
<td align="center" valign="top">5</td>
<td align="center" valign="top">S11, S12, S31, S34, S35</td>
</tr>
<tr>
<td align="left" valign="top">ML-Driven Chatbots</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S13</td>
</tr>
<tr>
<td align="left" valign="top">MARL</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">S15, S17</td>
</tr>
<tr>
<td align="left" valign="top">CNNs</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">S18, S37</td>
</tr>
<tr>
<td align="left" valign="top">SDNNs</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S19</td>
</tr>
<tr>
<td align="left" valign="top">Federated-learning enabled systems</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S21</td>
</tr>
<tr>
<td align="left" valign="top">LSTMs/RNNs</td>
<td align="center" valign="top">4</td>
<td align="center" valign="top">S22, S25, S40, S46</td>
</tr>
<tr>
<td align="left" valign="top">Semantic segmentation NNs</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">S30, S43</td>
</tr>
<tr>
<td align="left" valign="top">NNs/DNNs for regression</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">S36, S42</td>
</tr>
<tr>
<td align="left" valign="top">Spiking NNs</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">S45</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec24">
<label>5.2</label>
<title>Research question 3</title>
<p>RQ3 asks, &#x201C;What gaps, limitations, and challenges exist within current work combining formal methods and ML safety, and what future research directions can be identified from these gaps?&#x201D;</p>
<sec id="sec25">
<label>5.2.1</label>
<title>Gaps, limitations, and challenges</title>
<p>Scalability and computational efficiency remain significant issues, even though many of the articles consider these issues in their designs. Specifically, the application of formal methods to complex and large NNs is a particular gap in the literature, as the articles focus on small or medium sized networks. Additionally, time complexity is often left unquantified, and there is a need to perform formal analysis on the time complexity of the solutions. Further, many evaluations are confined to small, domain-specific benchmarks and have not been validated in real-world scenarios, limiting the generalizability from the experiments to the real world.</p>
<p>Especially related to Model Checking, the abstraction of NN/DNN behavior may lead to missed behavior from the real system, resulting in potential gaps during evaluation. Therefore, accurate abstraction methods that retain DNN behavior are a notable challenge. Nearly all approaches apply formal methods after model training, with the exceptions of S15 and S16, with little integration back into the training loop. S1 suggests using Model Checking results to refine the RL model as future work (<xref ref-type="bibr" rid="ref51">Tao et al., 2025</xref>).</p>
<p>On a separate note, S13, focusing on formally verifying ML-driven chatbots, assumes that the NLU step produces structured information about the user&#x2019;s request after a query, which may not apply to all chatbots (<xref ref-type="bibr" rid="ref18">Ferrando et al., 2023</xref>). There is therefore a gap in formally verifying Large Language Models (LLMs) or other ML-driven chatbots that lack explainability. Lastly, the complexity of combining advanced ML methods with formal techniques demands steep technical expertise, potentially hindering adoption. Along this same note, there is little discussion of integration with mainstream ML frameworks or deployment pipelines, which may delay uptake by practitioners beyond specialized research settings.</p>
</sec>
<sec id="sec26">
<label>5.2.2</label>
<title>Future research directions</title>
<p>Each article provides possible future work to pursue for its respective solution, and readers are encouraged to explore these for specific future directions on frameworks of interest. This section instead focuses on high level future research directions that can be gleaned from the overall body of literature.</p>
<p>One area of future work is developing integrated training-verification loops. Specifically, extending beyond post-hoc verification to embed formal checks during model training, which allows for corrective adjustments in real time. Additionally, investigating scalability is a significant area of future work. The formal techniques discussed in this SLR can be extended to larger, more complex networks, real-world networks, and diverse datasets to assess practical viability. The performance of the various formal verification frameworks can also be compared when applied to large-scale networks to further identify scalability gaps in the literature.</p>
<p>Algorithms can be developed to automatically select parameters that balance precision versus performance. Several of the current solutions require human intervention to select parameters during the formal verification process, such as S3, which requires expertise and is error-prone (<xref ref-type="bibr" rid="ref40">Nuhu et al., 2022</xref>). Further, standardized metrics and benchmarks are an area of future work. Establishing common datasets, network architectures, and evaluation metrics would enable fair comparison across tools.</p>
<p>Hybrid methods, possibly combining multiple techniques or adding an additional formal refinement loop, are a potentially significant area of future exploration. Complementary techniques may be combined to leverage the unique benefits of each technique. For example, Statistical Model Checking can be combined with existing formal approaches to tighten confidence in safety guarantees. Alternately, solutions that blend off-line formal guarantees with on-line data-driven monitors or shields are an interesting area of future work.</p>
<p>Researchers can explore cloud-based architectures to offload heavy verification tasks, support large-scale monitoring, and reduce on-board overhead. Additionally, the formal methods and ML communities would benefit from well documented, modular toolkits and reference implementations to lower the barrier for adoption and facilitate reproducible experiments. Ideally, these toolkits can plug into common ML workflows, e.g., TensorFlow or PyTorch, with minimal instrumentation (<xref ref-type="bibr" rid="ref52">TensorFlow, 2025</xref>; <xref ref-type="bibr" rid="ref46">PyTorch, 2025</xref>).</p>
<p>Lastly, with the explosion of LLMs, exploring additional techniques for the formal verification of ML-driven chatbots is of interest. This is especially relevant for techniques that either aid in explainability or are effective without relying on structured information from the NLU step.</p>
</sec>
</sec>
</sec>
<sec id="sec27">
<label>6</label>
<title>Quality assessment</title>
<p>A total score for each paper was obtained by summing the scores of either 0, 0.5, or 1 for each field of the quality assessment checklist. The total score is therefore out of nine points as there are nine total fields. <xref ref-type="fig" rid="fig7">Figure 7</xref> presents the distribution of total scores among the selected 46 articles. Notably, of the 46 articles, only seven explicitly discussed limitations, challenges, and threats to validity (field 6). Six articles partially discussed limitations.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Quality assessment score distribution of the selected articles.</p>
</caption>
<graphic xlink:href="frai-09-1749956-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart showing the distribution of quality assessment scores from 3.5 to 9, with most counts at 7.5 and 8, peaking at 15 for score 8. Data illustrates score frequency.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec28">
<label>7</label>
<title>Related work</title>
<p>Several related surveys and literature reviews have been conducted, and this section discusses the most recent (within the last five years) related work. Krichen et al. survey formal methods approaches for the validation and verification of ML systems (<xref ref-type="bibr" rid="ref29">Krichen et al., 2022</xref>). When discussing validation, the survey focuses on literature utilizing formal methods for validating data preparation and training phases. On the verification of ML systems, the formal methods for NNs, Decision Tree Ensembles, and Support Vector Machines are surveyed (<xref ref-type="bibr" rid="ref29">Krichen et al., 2022</xref>). Similarly, <xref ref-type="bibr" rid="ref30">Larsen et al. (2022)</xref> focus both on applying formal methods to ML systems to aid in their adoption within safety-critical domains and applying ML to formal methods to increase the scalability of formal methods to larger problems. Regarding the former, the authors survey formal methods for improving the explainability of ML systems and the verification of ML systems (<xref ref-type="bibr" rid="ref30">Larsen et al., 2022</xref>).</p>
<p><xref ref-type="bibr" rid="ref48">Tambon et al. (2022)</xref> along with <xref ref-type="bibr" rid="ref34">Meyer and Oosthuizen (2023)</xref>, both contribute SLRs in topics tangential to this work. <xref ref-type="bibr" rid="ref48">Tambon et al. (2022)</xref> discuss the challenges related to the certification of ML-based safety-critical systems and conduct an SLR on literature between 2015 and 2020. The authors identify the following as the main pillars of ML certification: Robustness, Uncertainty, Explainability, Verification, Safe Reinforcement Learning, and Direct Certification. Comprehensive analyses are provided for each of these areas. When discussing verification, the authors focus on formal methods and empirically guided testing. Similarly to this work, the authors recognize the need to bridge academic research with real-world adoption (<xref ref-type="bibr" rid="ref48">Tambon et al., 2022</xref>). On the other hand, Meyer and Oosthuizen provide an SLR on the verification and validation of AI-enabled CPS. The review classifies traditional verification methods into multiple categories and mentions that the self-adaptive learning nature of AI requires new verification approaches. The authors briefly discuss formal verification approaches (<xref ref-type="bibr" rid="ref34">Meyer and Oosthuizen, 2023</xref>).</p>
<p>Separately, Meng et al. specifically survey techniques to improve the adversarial robustness of DNNs from a formal verification perspective (<xref ref-type="bibr" rid="ref32">Meng et al., 2022</xref>). Adversarial robustness refers to the reliability of ML models to malicious input perturbations. The survey details formal techniques such as SMT-solving, Linear Programming/MILP, interval arithmetic, Reachability Analysis, and more.</p>
<p>Although previous work either systematically reviews or surveys formal verification applied to ML-enabled systems, current literature on the subject is relatively dated, with most of these studies published in 2022 and the most recent study published in 2023. None of these studies review literature produced after 2022. Additionally, only two of the studies conducted systematic reviews, and even then, the SLRs are on tangential topics in which formal verification is not the primary focus. There is therefore a gap in the literature for SLRs reviewing state-of-the-art empirical research in safe ML through formal verification, which this work fills.</p>
</sec>
<sec id="sec29">
<label>8</label>
<title>Strengths and limitations</title>
<p>This study contributes a current and detailed review on literature in formal methods for safety-critical ML, including thorough discussions on gaps, limitations, challenges, and future work. Additionally, the authors conduct a thorough quality assessment of all articles and present the quality assessment score distribution to readers for reference. Limitations to this study include that the SLR was focused specifically on four digital library sources. Work contributed to other digital libraries was therefore not reviewed. Similarly, non-peer reviewed work, such as those published on ArXiv or ResearchGate, were not reviewed. Additionally, including testing and traditional verification in the PICOC terms limited the scope of the review and excluded articles that do not explicitly include these terms.</p>
</sec>
<sec sec-type="conclusions" id="sec30">
<label>9</label>
<title>Conclusion</title>
<p>This SLR presents a comprehensive and detailed overview of the state-of-the-art in safe ML through the application of formal methods. By surveying 46 peer-reviewed studies across eight distinct categories, this work synthesizes both the theory and practical implementations that have emerged between 2020 and mid-2025. As a result, this work highlights both the diversity of formal verification techniques currently in use and the increasing scholarly interest and maturity of this research area. Other similar reviews do not capture work produced after 2022 and contribute mostly unsystematic surveys.</p>
<p>The analysis of gaps, limitations, and challenges reveals several areas for investigation that can help guide future research. By outlining opportunity areas for future work, this review serves not only as a record of existing contributions but also as a roadmap for researchers seeking to advance the safety of ML systems. Future researchers may use this work to both familiarize themselves with the current state of the research and determine where in the field to contribute new work.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec31">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="sec32">
<title>Author contributions</title>
<p>AN: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Writing &#x2013; original draft. OO: Conceptualization, Supervision, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec33">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec34">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec35">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="disclaimer" id="sec36">
<title>Author disclaimer</title>
<p>Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the National Science Foundation.</p>
</sec>
<sec sec-type="supplementary-material" id="sec37">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2026.1749956/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/frai.2026.1749956/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_2.docx" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Adadi</surname><given-names>A.</given-names></name> <name><surname>Berrada</surname><given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Peeking inside the black-box: a survey on explainable artificial intelligence (XAI)</article-title>. <source>IEEE Access</source> <volume>6</volume>, <fpage>52138</fpage>&#x2013;<lpage>52160</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2018.2870052</pub-id></mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Adelt</surname><given-names>J.</given-names></name> <name><surname>Bruch</surname><given-names>S.</given-names></name> <name><surname>Herber</surname><given-names>P.</given-names></name> <name><surname>Niehage</surname><given-names>M.</given-names></name> <name><surname>Remke</surname><given-names>A.</given-names></name></person-group> (<year>2023</year>). <source>Shielded learning for resilience and performance based on statistical model checking in Simulink</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Adelt</surname><given-names>J.</given-names></name> <name><surname>Liebrenz</surname><given-names>T.</given-names></name> <name><surname>Herber</surname><given-names>P.</given-names></name></person-group> (<year>2021</year>). <source>Formal verification of intelligent hybrid systems that are modeled with Simulink and the reinforcement learning toolbox</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Angluin</surname><given-names>D.</given-names></name></person-group> (<year>1987</year>). <article-title>Learning regular sets from queries and counterexamples</article-title>. <source>Inf. Comput.</source> <volume>75</volume>, <fpage>87</fpage>&#x2013;<lpage>106</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0890-5401(87)90052-6</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ashok</surname><given-names>P.</given-names></name> <name><surname>Hashemi</surname><given-names>V.</given-names></name> <name><surname>K&#x0159;et&#x00ED;nsk&#x00FD;</surname><given-names>J.</given-names></name> <name><surname>Mohr</surname><given-names>S.</given-names></name></person-group> (<year>2020</year>). <source>DeepAbstract: Neural network abstraction for accelerating verification</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bachiri</surname><given-names>W.</given-names></name> <name><surname>Seladji</surname><given-names>Y.</given-names></name> <name><surname>Garoche</surname><given-names>P.-L.</given-names></name></person-group> (<year>2025</year>). <article-title>Formal specification and SMT verification of quantized neural network for autonomous vehicles</article-title>. <source>Sci. Comput. Program.</source> <volume>245</volume>. doi: <pub-id pub-id-type="doi">10.1016/j.scico.2025.103316</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Banerjee</surname><given-names>S.</given-names></name> <name><surname>Ghosh</surname><given-names>S.</given-names></name> <name><surname>Banerjee</surname><given-names>A.</given-names></name> <name><surname>Mohalik</surname><given-names>S. K.</given-names></name></person-group> (<year>2023</year>). <source>SMT-based modeling and verification of spiking neural networks: A case study</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bartocci</surname><given-names>E.</given-names></name> <name><surname>Falcone</surname><given-names>Y.</given-names></name> <name><surname>Francalanza</surname><given-names>A.</given-names></name> <name><surname>Reger</surname><given-names>G.</given-names></name></person-group> (<year>2018</year>). &#x201C;<article-title>Introduction to runtime verification</article-title>&#x201D; in <source>Lectures on runtime verification</source> (<publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>).</mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bengio</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>International AI safety report</article-title>. <source>SI</source> <volume>2</volume>:<fpage>4755</fpage>. doi: <pub-id pub-id-type="doi">10.70777/si.v2i2.14755</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Carrera-Rivera</surname><given-names>A.</given-names></name> <name><surname>Ochoa</surname><given-names>W.</given-names></name> <name><surname>Larrinaga</surname><given-names>F.</given-names></name> <name><surname>Lasa</surname><given-names>G.</given-names></name></person-group> (<year>2022</year>). <article-title>How-to conduct a systematic literature review: a quick guide for computer science research</article-title>. <source>MethodsX</source> <volume>9</volume>. doi: <pub-id pub-id-type="doi">10.1016/j.mex.2022.101895</pub-id>, <pub-id pub-id-type="pmid">36405369</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cheng</surname><given-names>C.-H.</given-names></name></person-group> (<year>2021</year>). &#x201C;<article-title>Provably-robust runtime monitoring of neuron activation patterns</article-title>&#x201D; in <source>2021 Design, Automation &#x0026; Test in Europe Conference &#x0026; Exhibition (DATE)</source> (<publisher-loc>Grenoble, France</publisher-loc>).</mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Choi</surname><given-names>S. W.</given-names></name> <name><surname>Li</surname><given-names>Y.</given-names></name> <name><surname>Yang</surname><given-names>X.</given-names></name> <name><surname>Yamaguchi</surname><given-names>T.</given-names></name> <name><surname>Hoxha</surname><given-names>B.</given-names></name> <name><surname>Fainekos</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Reachability analysis of recurrent neural networks</article-title>. <source>Nonlinear Anal. Hybrid Syst.</source> <volume>56</volume>. doi: <pub-id pub-id-type="doi">10.1016/j.nahs.2025.101581</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cleaveland</surname><given-names>M.</given-names></name> <name><surname>Lindemann</surname><given-names>L.</given-names></name> <name><surname>Ivanov</surname><given-names>R.</given-names></name> <name><surname>Pappas</surname><given-names>G. J.</given-names></name></person-group> (<year>2022</year>). <article-title>Risk verification of stochastic systems with neural network controllers</article-title>. <source>Artif. Intell.</source> <volume>313</volume>. doi: <pub-id pub-id-type="doi">10.1016/j.artint.2022.103782</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Cofer</surname><given-names>D.</given-names></name> <etal/></person-group>., <year>2022</year>. <article-title>Flight test of a collision avoidance neural network with run-time assurance</article-title>. <conf-name>Portsmouth, VA, USA, 2022 IEEE/AIAA 41st Digital Avionics Systems Conference (DASC)</conf-name>.</mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Das</surname><given-names>S.</given-names></name> <name><surname>Banerjee</surname><given-names>A.</given-names></name> <name><surname>Mohalik</surname><given-names>S. K.</given-names></name></person-group> (<year>2025</year>). <source>Modeling and verification of Sigma Delta neural networks using satisfiability modulo theory</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>El Mqirmi</surname><given-names>P.</given-names></name> <name><surname>Belardinelli</surname><given-names>F.</given-names></name> <name><surname>Le&#x00F3;n</surname><given-names>B. G.</given-names></name></person-group> (<year>2021</year>). <source>An abstraction-based method to check multi-agent deep reinforcement-learning behaviors</source>. <publisher-loc>Richland, SC</publisher-loc>: <publisher-name>International Foundation for Autonomous Agents and Multiagent Systems</publisher-name>.</mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Elboher</surname><given-names>Y. Y.</given-names></name> <name><surname>Cohen</surname><given-names>E.</given-names></name> <name><surname>Katz</surname><given-names>G.</given-names></name></person-group> (<year>2024</year>). <article-title>On applying residual reasoning within neural network verification</article-title>. <source>Softw. Syst. Model.</source> <volume>23</volume>, <fpage>721</fpage>&#x2013;<lpage>736</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10270-023-01138-w</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ferrando</surname><given-names>A.</given-names></name> <name><surname>Gatti</surname><given-names>A.</given-names></name> <name><surname>Mascardi</surname><given-names>V.</given-names></name></person-group> (<year>2023</year>). <source>RV4Rasa: A formalism-agnostic runtime verification framework for verifying ChatBots in rasa</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Genin</surname><given-names>D.</given-names></name> <name><surname>Papusha</surname><given-names>I.</given-names></name> <name><surname>Brul&#x00E9;</surname><given-names>J.</given-names></name> <name><surname>Young</surname><given-names>T.</given-names></name> <name><surname>Mullins</surname><given-names>G.</given-names></name> <name><surname>Kouskoulas</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2021</year>). <source>Formal verification of neural network controllers for collision-free flight</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gros</surname><given-names>T. P.</given-names></name> <name><surname>Hermanns</surname><given-names>H.</given-names></name> <name><surname>Hoffmann</surname><given-names>J.</given-names></name> <name><surname>Klauck</surname><given-names>M.</given-names></name> <name><surname>Steinmetz</surname><given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>Analyzing neural network behavior through deep statistical model checking</article-title>. <source>Int. J. Softw. Tools Technol. Transfer</source> <volume>25</volume>, <fpage>407</fpage>&#x2013;<lpage>426</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10009-022-00685-9</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guendouzi</surname><given-names>B. S.</given-names></name> <name><surname>Ouchani</surname><given-names>S.</given-names></name> <name><surname>Al Assaad</surname><given-names>H.</given-names></name> <name><surname>El Zaher</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <article-title>Ensuring the federation correctness: formal verification of federated learning in industrial cyber-physical systems</article-title>. <source>Futur. Gener. Comput. Syst.</source> <volume>166</volume>. doi: <pub-id pub-id-type="doi">10.1016/j.future.2024.107675</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Guissouma</surname><given-names>H.</given-names></name> <name><surname>Zink</surname><given-names>M.</given-names></name> <name><surname>Sax</surname><given-names>E.</given-names></name></person-group>, <year>2023</year>. <article-title>Continuous safety assessment of updated supervised learning models in shadow mode</article-title>. <conf-name>L'Aquila, Italy, 2023 IEEE 20th International Conference on Software Architecture Companion (ICSA-C)</conf-name>.</mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hafaiedh</surname><given-names>I. B.</given-names></name> <name><surname>Chouchane</surname><given-names>A.</given-names></name> <name><surname>Elaoud</surname><given-names>A.</given-names></name> <name><surname>Lamouchi</surname><given-names>L.</given-names></name> <name><surname>Ghazel</surname><given-names>M.</given-names></name></person-group> (<year>2025</year>). <source>A high parallelization method for automated formal verification of deep neural networks</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hunt</surname><given-names>N.</given-names></name> <etal/></person-group>. (<year>2021</year>). <source>Verifiably safe exploration for end-to-end reinforcement learning</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jhala</surname><given-names>R.</given-names></name> <name><surname>Majumdar</surname><given-names>R.</given-names></name></person-group> (<year>2009</year>). <article-title>Software model checking</article-title>. <source>ACM Comput. Surv.</source> <volume>41</volume>, <fpage>1</fpage>&#x2013;<lpage>54</lpage>. doi: <pub-id pub-id-type="doi">10.1145/1592434.1592438</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Khmelnitsky</surname><given-names>I.</given-names></name> <name><surname>Neider</surname><given-names>D.</given-names></name> <name><surname>Roy</surname><given-names>R.</given-names></name> <name><surname>Xie</surname><given-names>X.</given-names></name> <name><surname>Barbot</surname><given-names>B.</given-names></name> <name><surname>Bollig</surname><given-names>B.</given-names></name> <etal/></person-group>. (<year>2021</year>). <source>Property-directed verification and robustness certification of recurrent neural networks</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kirov</surname><given-names>D.</given-names></name> <name><surname>Rollini</surname><given-names>S. F.</given-names></name> <name><surname>Guglielmo</surname><given-names>L. D.</given-names></name> <name><surname>Cofer</surname><given-names>D.</given-names></name></person-group> (<year>2023</year>). <source>Formal verification of a neural network based prognostics system for aircraft equipment</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kitchenham</surname><given-names>B.</given-names></name> <name><surname>Charters</surname><given-names>S.</given-names></name></person-group> (<year>2007</year>). <source>Guidelines for performing systematic literature reviews in software engineering</source>. <publisher-loc>Keele, UK</publisher-loc>: <publisher-name>EBSE Technical Report EBSE-2007-01</publisher-name>.</mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Krichen</surname><given-names>M.</given-names></name> <etal/></person-group>., <year>2022</year>. <article-title>Are formal methods applicable to machine learning and artificial intelligence?</article-title> <conf-name>Riyadh, Saudi Arabia, 2022 2nd International Conference of Smart Systems and Emerging Technologies (SMARTTECH)</conf-name>.</mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Larsen</surname><given-names>K.</given-names></name> <name><surname>Legay</surname><given-names>A.</given-names></name> <name><surname>Nolte</surname><given-names>G.</given-names></name> <name><surname>Schl&#x00FC;ter</surname><given-names>M.</given-names></name> <name><surname>Stoelinga</surname><given-names>M.</given-names></name> <name><surname>Steffen</surname><given-names>B.</given-names></name></person-group> (<year>2022</year>). <source>Formal methods meet machine learning (F3ML)</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname><given-names>Z.</given-names></name> <name><surname>Liu</surname><given-names>W.-W.</given-names></name> <name><surname>Song</surname><given-names>F.</given-names></name> <name><surname>Xue</surname><given-names>B.</given-names></name> <name><surname>Yang</surname><given-names>W.-J.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Qualitative and quantitative model checking against recurrent neural networks</article-title>. <source>J. Comput. Sci. Technol.</source> <volume>39</volume>, <fpage>1292</fpage>&#x2013;<lpage>1311</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11390-023-2703-2</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meng</surname><given-names>M. H.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Adversarial robustness of deep neural networks: a survey from a formal verification perspective</article-title>. <source>IEEE Trans. Dependable Secur. Comput.</source></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meyer</surname><given-names>P.-J.</given-names></name></person-group> (<year>2023</year>). <article-title>Reachability analysis of neural networks with uncertain parameters</article-title>. <source>IFAC-PapersOnLine</source> <volume>56</volume>, <fpage>4822</fpage>&#x2013;<lpage>4827</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ifacol.2023.10.1249</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Meyer</surname><given-names>W.</given-names></name> <name><surname>Oosthuizen</surname><given-names>R.</given-names></name></person-group>, <year>2023</year>. <source>Verification &#x0026; Validation Methods for complex AI-enabled cyber-physical learning-based systems: A systematic literature review</source>. <publisher-loc>Edinburgh, UK</publisher-loc>, <conf-name>2023 IEEE International Conference on Engineering, Technology and Innovation (ICE/ITMC)</conf-name>.</mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mistry</surname><given-names>S.</given-names></name> <name><surname>Saha</surname><given-names>I.</given-names></name> <name><surname>Biswas</surname><given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>An MILP encoding for efficient verification of quantized deep neural networks</article-title>. <source>IEEE Trans. Comput. Aided Des. Integr. Circuits Syst.</source> <volume>41</volume>, <fpage>4445</fpage>&#x2013;<lpage>4456</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TCAD.2022.3197697</pub-id></mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Moradkhani</surname><given-names>F.</given-names></name> <name><surname>Fibich</surname><given-names>C.</given-names></name> <name><surname>Fr&#x00E4;nzle</surname><given-names>M.</given-names></name></person-group> (<year>2023</year>). <source>Verification of LSTM neural networks with non-linear activation functions</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Naseer</surname><given-names>M.</given-names></name> <etal/></person-group>., <year>2020</year>. <article-title>FANNet: formal analysis of noise tolerance, training Bias and input sensitivity in neural networks</article-title>. <conf-name>Grenoble, France, 2020 Design, Automation &#x0026; Test in Europe Conference &#x0026; Exhibition (DATE)</conf-name>.</mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Nenchev</surname><given-names>V.</given-names></name></person-group> (<year>2025</year>). &#x201C;<article-title>One stack, diverse vehicles: checking safe portability of automated driving software</article-title>&#x201D; in <source>2025 IEEE/SICE international symposium on system integration (SII)</source> (<publisher-loc>Munich, Germany</publisher-loc>).</mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Newcomb</surname><given-names>A.</given-names></name> <name><surname>Couder</surname><given-names>J.</given-names></name> <name><surname>Ochoa</surname><given-names>O.</given-names></name></person-group>, <year>2024</year>. <article-title>Supporting formal methods for machine learning verification in urban air mobility</article-title>. <conf-name>Laguna Hills, CA, USA, 2024 Conference on AI, Science, Engineering, and Technology (AIxSET)</conf-name>.</mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Nuhu</surname><given-names>A.-R.</given-names></name> <etal/></person-group>. (<year>2022</year>). &#x201C;<article-title>Negative selection approach to support formal verification and validation of BlackBox models' input constraints</article-title>&#x201D; in <source>2022 IEEE symposium series on computational intelligence (SSCI)</source> (<publisher-loc>Singapore, Singapore</publisher-loc>).</mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Pal</surname><given-names>N.</given-names></name> <name><surname>Lee</surname><given-names>S.</given-names></name> <name><surname>Johnson</surname><given-names>T. T.</given-names></name></person-group> (<year>2023</year>). <source>Benchmark: Formal verification of semantic segmentation neural networks</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Pal</surname><given-names>N.</given-names></name> <name><surname>Lopez</surname><given-names>D. M.</given-names></name> <name><surname>Johnson</surname><given-names>T. T.</given-names></name></person-group> (<year>2023</year>). <source>Robustness verification of deep neural networks using star-based reachability analysis with variable-length time series input</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Parameshwaran</surname><given-names>A.</given-names></name> <name><surname>Wang</surname><given-names>Y.</given-names></name></person-group> (<year>2025</year>). <source>Scalable and interpretable verification of image-based neural network controllers for autonomous vehicles</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Paterson</surname><given-names>C.</given-names></name> <name><surname>Wu</surname><given-names>H.</given-names></name> <name><surname>Grese</surname><given-names>J.</given-names></name> <name><surname>Calinescu</surname><given-names>R.</given-names></name> <name><surname>P&#x0103;s&#x0103;reanu</surname><given-names>C. S.</given-names></name> <name><surname>Barrett</surname><given-names>C.</given-names></name></person-group> (<year>2021</year>). <source>DeepCert: Verification of contextually relevant robustness for neural network image classifiers</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Paul</surname><given-names>S.</given-names></name> <name><surname>Cruz</surname><given-names>E.</given-names></name> <name><surname>Dutta</surname><given-names>A.</given-names></name> <name><surname>Bhaumik</surname><given-names>A.</given-names></name> <name><surname>Blasch</surname><given-names>E.</given-names></name> <name><surname>Agha</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Formal verification of safety-critical aerospace systems</article-title>. <source>IEEE Aerosp. Electron. Syst. Mag.</source> <volume>38</volume>, <fpage>72</fpage>&#x2013;<lpage>88</lpage>. doi: <pub-id pub-id-type="doi">10.1109/MAES.2023.3238378</pub-id></mixed-citation></ref>
<ref id="ref46"><mixed-citation publication-type="other"><collab id="coll1">PyTorch</collab>. (<year>2025</year>). PyTorch. Available online at: <ext-link xlink:href="https://pytorch.org/" ext-link-type="uri">https://pytorch.org/</ext-link> (Accessed 8 August 2025).</mixed-citation></ref>
<ref id="ref47"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Samadi</surname><given-names>A.</given-names></name> <name><surname>Harous</surname><given-names>A.</given-names></name> <name><surname>Mohamed</surname><given-names>O. A.</given-names></name> <name><surname>Boukadoum</surname><given-names>M.</given-names></name></person-group> (<year>2024</year>). &#x201C;<article-title>Advanced SEU and MBU vulnerability assessment of deep neural networks in air-to-air collision avoidance systems via SAT-based techniques</article-title>&#x201D; in <source>2024 IEEE 67th international Midwest symposium on circuits and systems (MWSCAS)</source> (<publisher-loc>Springfield, MA, USA</publisher-loc>).</mixed-citation></ref>
<ref id="ref48"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tambon</surname><given-names>F.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>How to certify machine learning based safety-critical systems? A systematic literature review</article-title>. <source>Autom. Softw. Eng.</source> <volume>29</volume>.</mixed-citation></ref>
<ref id="ref49"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Tan</surname><given-names>D. C.</given-names></name> <etal/></person-group>., <year>2024</year>. <article-title>Safe value functions: learned critics as hard safety constraints</article-title>. <conf-name>Bari, Italy, 20th International Conference on Automation Science and Engineering (CASE)</conf-name>.</mixed-citation></ref>
<ref id="ref50"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tang</surname><given-names>X.</given-names></name> <name><surname>Zheng</surname><given-names>Y.</given-names></name> <name><surname>Liu</surname><given-names>J.</given-names></name></person-group> (<year>2023</year>). <source>Boosting multi-neuron convex relaxation for neural network verification</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref51"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Tao</surname><given-names>X.</given-names></name> <etal/></person-group>., <year>2025</year>. <article-title>ReLVaaS: verification-as-a-service to analyze trustworthiness of RL-based solutions in 6G networks</article-title>. <conf-name>Bengaluru, India, 2025 17th International Conference on COMmunication Systems and NETworks (COMSNETS)</conf-name>.</mixed-citation></ref>
<ref id="ref52"><mixed-citation publication-type="other"><collab id="coll2">TensorFlow</collab>. (<year>2025</year>). <italic>An End-to-End Platform for Machine Learning</italic>. Available online at: <ext-link xlink:href="https://www.tensorflow.org/" ext-link-type="uri">https://www.tensorflow.org/</ext-link> (Accessed 8 August 2025).</mixed-citation></ref>
<ref id="ref53"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tran</surname><given-names>H.-D.</given-names></name> <name><surname>Pal</surname><given-names>N.</given-names></name> <name><surname>Musau</surname><given-names>P.</given-names></name> <name><surname>Lopez</surname><given-names>D. M.</given-names></name> <name><surname>Hamilton</surname><given-names>N.</given-names></name> <name><surname>Yang</surname><given-names>X.</given-names></name> <etal/></person-group>. (<year>2021</year>). <source>Robustness verification of semantic segmentation neural networks using relaxed reachability</source>. <publisher-loc>Switzerland, Cham</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref54"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tripuramallu</surname><given-names>D.</given-names></name> <etal/></person-group>. (<year>2024</year>). <source>Runtime verified neural networks for cyber-physical systems</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
<ref id="ref55"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Vidot</surname><given-names>G.</given-names></name> <name><surname>Ducoffe</surname><given-names>M.</given-names></name> <name><surname>Gabreau</surname><given-names>C.</given-names></name> <name><surname>Ober</surname><given-names>I.</given-names></name> <name><surname>Ober</surname><given-names>I.</given-names></name></person-group> (<year>2022</year>). <source>Formal monotony analysis of neural networks with mixed inputs: An asset for certification</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref56"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Tepfenhart</surname><given-names>W.</given-names></name></person-group> (<year>2020</year>). &#x201C;<article-title>Preface</article-title>&#x201D; in <source>Formal methods in computer science</source> (<publisher-loc>Boca Raton, Florida</publisher-loc>: <publisher-name>CRC Press Taylor &#x0026; Francis Group</publisher-name>), <fpage>xi</fpage>.</mixed-citation></ref>
<ref id="ref57"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Xiao</surname><given-names>W.</given-names></name> <name><surname>Lyu</surname><given-names>Y.</given-names></name> <name><surname>Dolan</surname><given-names>J.</given-names></name></person-group> (<year>2023</year>). <source>Model-based dynamic shielding for safe and efficient multi-agent reinforcement learning</source>. <publisher-loc>Richland, SC</publisher-loc>: <publisher-name>International Foundation for Autonomous Agents and Multiagent Systems</publisher-name>.</mixed-citation></ref>
<ref id="ref58"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>Y.</given-names></name> <name><surname>Song</surname><given-names>F.</given-names></name> <name><surname>Sun</surname><given-names>J.</given-names></name></person-group> (<year>2023</year>). <source>QEBVerif: Quantization error bound verification of neural networks</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref59"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2022</year>). <source>QVIP: An ILP-based formal verification approach for quantized neural networks</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
<ref id="ref60"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>Z.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name> <name><surname>Chen</surname><given-names>G.</given-names></name> <name><surname>Song</surname><given-names>F.</given-names></name> <name><surname>Chen</surname><given-names>T.</given-names></name> <name><surname>Liu</surname><given-names>J.</given-names></name></person-group> (<year>2022</year>). <source>CLEVEREST: Accelerating CEGAR-based neural network verification via adversarial attacks</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref61"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>H.</given-names></name> <etal/></person-group> (<year>2023</year>). <article-title>Safe DNN-type controller synthesis for nonlinear systems via Meta reinforcement learning</article-title>. <conf-name>2023 60th ACM/IEEE Design Automation Conference (DAC)</conf-name>, <conf-loc>San Francisco, CA, USA</conf-loc>.</mixed-citation></ref>
<ref id="ref62"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhong</surname><given-names>Y.</given-names></name> <name><surname>Ta</surname><given-names>Q.-T.</given-names></name> <name><surname>Khoo</surname><given-names>S.-C.</given-names></name></person-group> (<year>2023</year>). <source>ARENA: Enhancing abstract refinement for neural network verification</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref63"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>Y.</given-names></name> <name><surname>Tripakis</surname><given-names>S.</given-names></name></person-group> (<year>2024</year>). <source>Compositional inductive invariant based verification of neural network controlled systems</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref64"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>Y.</given-names></name> <etal/></person-group>. (<year>2015</year>). <source>Quality assessment of systematic reviews in software engineering: A tertiary study</source>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>.</mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3107237/overview">Antonio Di Stasio</ext-link>, City University of London, United Kingdom</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3301937/overview">Rapha&#x00EB;l Berthon</ext-link>, Universit&#x00E9; Paris-Saclay, France</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3319163/overview">Giovanni Varricchione</ext-link>, Utrecht University, Netherlands</p>
</fn>
</fn-group>
</back>
</article>