<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1731566</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Threats and vulnerabilities in artificial intelligence and agentic AI models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Radanliev</surname>
<given-names>Petar</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<xref ref-type="author-notes" rid="fn0004"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/839254"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Santos</surname>
<given-names>Omar</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Maple</surname>
<given-names>Carsten</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/587364"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Computer Sciences, University of Oxford</institution>, <city>Oxford</city>, <country country="GB">United Kingdom</country></aff>
<aff id="aff2"><label>2</label><institution>The Alan Turing Institute, British Library</institution>, <city>London</city>, <country country="GB">United Kingdom</country></aff>
<aff id="aff3"><label>3</label><institution>Cisco Systems, RTP</institution>, <city>Morrisville</city>, <state>NC</state>, <country country="US">United States</country></aff>
<aff id="aff4"><label>4</label><institution>University of Warwick &#x2013; WMG</institution>, <city>Coventry</city>, <country country="GB">United Kingdom</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Petar Radanliev, <email xlink:href="mailto:petar.radanliev@cs.ox.ac.uk">petar.radanliev@cs.ox.ac.uk</email></corresp>
<fn fn-type="other" id="fn0004"><label>&#x2020;</label><p>ORCID: Petar Radanliev, <uri xlink:href="https://orcid.org/0000-0001-5629-6857">orcid.org/0000-0001-5629-6857</uri></p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-13">
<day>13</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1731566</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>26</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Radanliev, Santos and Maple.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Radanliev, Santos and Maple</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-13">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Adversarial robustness in artificial intelligence is commonly defined in terms of input-level perturbations applied to static models. This study reconceptualises adversarial vulnerability for artificial and agentic AI systems by extending the threat model to autonomy, self-governance, and closed-loop decision-making, where behaviour unfolds dynamically through feedback and control.</p>
</sec>
<sec>
<title>Methods</title>
<p>We develop a system-level analytical framework that formalises adversarial risk across perceptual, cognitive, and executive layers. The analysis is grounded in a PRISMA-compliant systematic literature review, bibliometric mapping, and targeted empirical validation. Established adversarial results from vision benchmarks and recent large-language-model red-teaming studies are synthesised to contextualise the framework, rather than to introduce new benchmark performance claims.</p>
</sec>
<sec>
<title>Results</title>
<p>The results demonstrate that no single defence mechanism provides robustness across all layers of agentic AI systems. Adversarial vulnerabilities propagate from perception to policy and actuation, with architectural similarity, domain shift, and feedback dynamics critically shaping transferability and failure modes. These effects have direct implications for safety-critical applications, including autonomous mobility, healthcare imaging, and biometric security.</p>
</sec>
<sec>
<title>Discussion</title>
<p>By framing higher-order agentic adversarial threats as hypothesis-driven, system-level risks, this work shifts adversarial AI security from benchmark-centric evaluation to behavioural integrity and lifecycle resilience. The proposed framework defines a coherent research agenda for agentic AI security that integrates control-theoretic reasoning and governance-aware defence design, addressing limitations of classical adversarial machine-learning theory.</p>
</sec>
</abstract>
<kwd-group>
<kwd>advanced attack techniques</kwd>
<kwd>adversarial attacks</kwd>
<kwd>artificial intelligence</kwd>
<kwd>blackbox attacks</kwd>
<kwd>Carlini and Wagner attack (C&#x0026;W)</kwd>
<kwd>defense mechanisms</kwd>
<kwd>Fast Gradient Sign Method (FGSM)</kwd>
<kwd>machine learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work has been supported by the UK EPSRC (under grant number EP/S035362/1), the Bill and Melinda Gates Foundation (Reference code: INV-057591), and SPRITE+ [funded under EPSRC (EP/W020408/1)].</funding-statement>
</funding-group>
<counts>
<fig-count count="15"/>
<table-count count="6"/>
<equation-count count="0"/>
<ref-count count="59"/>
<page-count count="28"/>
<word-count count="18149"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec1">
<label>1</label>
<title>Introduction to adversarial attacks</title>
<p>Artificial-intelligence (AI) models now underpin safety-critical functions in medical imaging, autonomous driving, and national security. Their growing deployment, however, has exposed a systemic weakness: carefully crafted adversarial examples&#x2014;minute, often imperceptible perturbations to inputs&#x2014;can trigger grossly erroneous model outputs (<xref ref-type="bibr" rid="ref43">Ren et al., 2020</xref>). A stop sign modified with a few strategically placed stickers, for instance, may be interpreted by an autonomous-vehicle vision system as a yield sign, jeopardising passenger safety (<xref ref-type="bibr" rid="ref32">Maple et al., 2019</xref>). In clinical practice, subtle pixel-level changes to radiographic images could prompt a diagnostic system to overlook malignancies or flag healthy tissue as pathological, adversely affecting patient management.</p>
<p>The threat landscape extends across digital identity and surveillance. Adversarial audio or visual artefacts have been shown to bypass state-of-the-art facial- and voice-recognition pipelines, enabling unauthorised access to secure facilities and devices while undermining the evidentiary integrity of CCTV footage (<xref ref-type="bibr" rid="ref52">Wang J. et al., 2019</xref>). Malicious actors can also weaponise generative models to fabricate photorealistic or audio-realistic content, fuelling misinformation campaigns, reputational damage, and electoral manipulation. Voice-activated assistants are vulnerable to inaudible command injections, compromising user privacy and data integrity, whereas algorithmic decision-making in policing, recruitment, and credit scoring remains susceptible to false positives and false negatives with significant societal repercussions.</p>
<p>At the geopolitical level, adversarial exploitation of AI-enabled defence platforms raises the spectre of cyber-warfare, espionage, and strategic destabilisation. Commercial enterprises are likewise exposed: a single successful attack on an AI-driven recommendation or pricing engine can erode consumer trust and precipitate substantial financial loss. Generative-AI chatbots can serve as high-throughput front ends for gathering a diverse spectrum of user information, ranging from product-preference signals and free-text feedback to demographic attributes such as age, gender, and approximate location. Their ability to sustain thousands of concurrent dialogues and to operate continuously affords firms an efficient, low-latency mechanism for capturing customer insight at scale, which in turn supports real-time personalisation of marketing campaigns and service delivery.</p>
<p>Notwithstanding these benefits, two limitations are critical. First, large-language-model (LLM) services such as ChatGPT, in their default configuration, neither persist user-specific content nor expose database-like retrieval endpoints (<xref ref-type="fig" rid="fig1">Figure 1</xref>). Consequently, organisations that require structured data capture must integrate the model within a bespoke pipeline that extracts, stores, and post-processes conversation metadata. Second, any deployment that ingests personal data must comply with prevailing regulatory regimes, most notably the EU General Data Protection Regulation (GDPR) (<xref ref-type="bibr" rid="ref18">GDPR, 2018</xref>; <xref ref-type="bibr" rid="ref24">ICO, 2018</xref>; <xref ref-type="bibr" rid="ref42">Peloquin et al., 2020</xref>), the California Consumer Privacy Act (CCPA) (<xref ref-type="bibr" rid="ref9">CCPA, 2018</xref>), and sector-specific standards for consent and retention. Custom implementations therefore need robust consent workflows, explicit purpose limitation, encryption at rest and in transit, and audit logging to mitigate legal and ethical risks associated with large-scale conversational data harvesting.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>End-to-end flow, from user input through the ChatGPT core LLM, onward to a data interface and data store, highlighting a consent manager and privacy-compliance checkpoint.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating the process of user input through the ChatGPT Core LLM, leading to the data interface and data store. The data store is connected to a consent manager and privacy compliance, ensuring legal adherence.</alt-text>
</graphic>
</fig>
<p>In <xref ref-type="fig" rid="fig1">Figure 1</xref>, we can see that ChatGPT is designed to forget the user&#x2019;s personal information after the conversation, ensuring user privacy.</p>
<p><xref ref-type="fig" rid="fig1">Figure 1</xref> presents a layered data-collection workflow centred on the ChatGPT core LLM. User input enters at the top and is processed by the LLM, which generates responses while simultaneously passing salient metadata to an intermediate Data Interface. This interface funnels structured information into a persistent Data Store, but only after routing it through a dedicated Consent Manager that verifies user authorisation and applies opt-in policies. A parallel path leads all collected data (raw and processed) through a Privacy-Compliance checkpoint that enforces encryption, retention limits, and jurisdiction-specific regulations (e.g., GDPR, CCPA).</p>
<p>The primary contribution of this work is the formulation of a system-level analytical framework that reconceptualises adversarial AI security across perceptual, cognitive, and executive layers, rather than the enumeration of individual attack techniques. A secondary contribution is the formalisation of agentic AI security as a multi-layered problem spanning perception, cognition, and executive control. By mapping adversarial vulnerabilities onto autonomy, self-governance, and closed-loop decision-making, the proposed framework extends adversarial machine learning from input-level robustness to system-level behavioural integrity. This shift enables systematic reasoning about failure modes that are not adequately captured by conventional threat models, including temporal error accumulation, goal misalignment, and policy-level manipulation. As such, the taxonomy functions as an explanatory and generative framework, guiding empirical evaluation and the design of next-generation defence mechanisms for autonomous and agentic AI systems.</p>
</sec>
<sec sec-type="methods" id="sec2">
<label>2</label>
<title>Methods</title>
<p>This study employed a multi-phase analytical methodology integrating a <italic>Systematic Literature Review (SLR)</italic>, <italic>Bibliometric Analysis</italic>, and <italic>Experimental Validation</italic> to comprehensively investigate adversarial vulnerabilities in both conventional and agentic artificial intelligence (AI) systems. The methodology aligns with PRISMA 2020 guidelines for systematic reviews and conforms to bibliometric standards established by the Bibliometrix framework in R Studio.</p>
<sec id="sec3">
<label>2.1</label>
<title>Phase 1: systematic literature review</title>
<p>The first methodological phase consisted of a structured literature search across IEEE Xplore, SpringerLink, ACM Digital Library, Scopus, and Web of Science, supplemented by arXiv and Google Scholar to capture grey literature. Boolean search strings were constructed to identify studies combining adversarial attacks, AI security, and agentic properties such as autonomy, self-governance, and decision-making loops. The inclusion and exclusion criteria (<xref ref-type="table" rid="tab1">Table 1</xref>) ensured that only peer-reviewed or high-quality preprints published between 2015 and 2025 were considered.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Inclusion and exclusion criteria.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Criterion</th>
<th align="left" valign="top">Inclusion</th>
<th align="left" valign="top">Exclusion</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Publication type</td>
<td align="left" valign="top">Peer-reviewed journals, conference papers, SLRs, and high-quality preprints</td>
<td align="left" valign="top">Editorials, blog posts, non-scholarly reports</td>
</tr>
<tr>
<td align="left" valign="top">Domain relevance</td>
<td align="left" valign="top">Studies addressing adversarial ML, AI security, reinforcement learning, or agentic behaviour</td>
<td align="left" valign="top">Studies on unrelated AI ethics or general automation without security focus</td>
</tr>
<tr>
<td align="left" valign="top">Methodological quality</td>
<td align="left" valign="top">Empirical studies, formal analyses, or validated frameworks</td>
<td align="left" valign="top">Conceptual papers lacking experimental validation</td>
</tr>
<tr>
<td align="left" valign="top">Language</td>
<td align="left" valign="top">English</td>
<td align="left" valign="top">Non-English</td>
</tr>
<tr>
<td align="left" valign="top">Temporal range</td>
<td align="left" valign="top">2015&#x2013;2025</td>
<td align="left" valign="top">Before 2015</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Duplicates were removed through automated title and DOI matching, and all retained studies were screened for methodological validity and thematic relevance. A total of 78 studies were selected for qualitative synthesis and 52 for quantitative meta-analysis, visualised through the PRISMA workflow (<xref ref-type="table" rid="tab2">Table 2</xref>). Each record was independently coded by two reviewers for variables including adversarial attack type, defence mechanism, evaluation metric, and the presence of agentic attributes. Inter-rater reliability achieved a Cohen&#x2019;s <italic>&#x03BA;</italic>&#x202F;=&#x202F;0.87, confirming strong agreement.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>PRISMA 2020 flow summary for the systematic literature review.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Stage</th>
<th align="left" valign="top">Description of process</th>
<th align="center" valign="top">Records (<italic>n</italic>)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="2">Identification</td>
<td align="left" valign="top">Records identified through database searches (IEEE Xplore, ACM Digital Library, Scopus, SpringerLink, Web of Science)</td>
<td align="center" valign="top">612</td>
</tr>
<tr>
<td align="left" valign="top">Additional records identified through other sources (Google Scholar, arXiv preprints)</td>
<td align="center" valign="top">48</td>
</tr>
<tr>
<td align="left" valign="top">Total records before screening</td>
<td/>
<td align="center" valign="top">660</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="3">Screening</td>
<td align="left" valign="top">Duplicate records removed prior to screening</td>
<td align="center" valign="top">312</td>
</tr>
<tr>
<td align="left" valign="top">Records screened by title and abstract for relevance to adversarial ML and agentic AI</td>
<td align="center" valign="top">300</td>
</tr>
<tr>
<td align="left" valign="top">Records excluded during initial screening (irrelevant scope, no adversarial component)</td>
<td align="center" valign="top">188</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">Eligibility</td>
<td align="left" valign="top">Full-text articles assessed for methodological eligibility and relevance</td>
<td align="center" valign="top">112</td>
</tr>
<tr>
<td align="left" valign="top">Full-text articles excluded (non&#x2013;peer-reviewed, insufficient methodological detail, or lacking agentic AI focus)</td>
<td align="center" valign="top">34</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">Inclusion</td>
<td align="left" valign="top">Studies included in qualitative synthesis (narrative and thematic analysis)</td>
<td align="center" valign="top"><bold>78</bold></td>
</tr>
<tr>
<td align="left" valign="top">Studies included in quantitative synthesis (meta-analysis of metrics such as ASR, L&#x2082; norm, robust accuracy)</td>
<td align="center" valign="top"><bold>52</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the final number of studies retained at each critical PRISMA stage, including total records before screening and studies included in qualitative and quantitative synthesis.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Phase 2: bibliometric and conceptual network analysis</title>
<p>The second phase expanded upon the SLR results through a bibliometric mapping of publication trends and conceptual relationships using Bibliometrix (Biblioshiny interface) in R Studio v4.4.2. Two independent searches were executed in the Web of Science Core Collection:</p>
<list list-type="order">
<list-item>
<p><italic>&#x201C;Adversarial Attacks on Agentic AI&#x201D;</italic> &#x2014; 101 records</p>
</list-item>
<list-item>
<p><italic>&#x201C;Adversarial Attacks on Artificial Intelligence&#x201D;</italic> &#x2014; 973 records</p>
</list-item>
</list>
<p>Data were normalised and cleaned to remove duplicate author and keyword entries. Analyses included Multiple Correspondence Analysis (MCA) for conceptual structuring (<xref ref-type="fig" rid="fig2">Figure 2</xref>), Hierarchical Clustering for keyword co-occurrence networks (<xref ref-type="fig" rid="fig3">Figure 3</xref>), and Thematic Mapping for density-centrality evaluation (<xref ref-type="fig" rid="fig4">Figure 4</xref>).</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Factorial analysis of the Web of Science Core Collection records on the topic of adversarial attacks on agentic AI.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Conceptual structure map using Multiple Correspondence Analysis (MCA), displaying keywords such as &#x201C;review,&#x201D; &#x201C;defenses,&#x201D; &#x201C;adversarial,&#x201D; and &#x201C;deep learning.&#x201D; The map features dimensions labeled Dim 1 and Dim 2, with a shaded triangular area encompassing the terms. The BridgEmetrics logo is at the bottom right.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Dendrogram from the Web of Science Core Collection data on the topic of adversarial attacks on agentic AI.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Dendrogram displaying hierarchical clustering of data. The vertical axis represents the linkage distance, while the horizontal axis lists labeled data points. Lines and branches show similarity levels between clusters, merging at different heights.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Thematic map of the Web of Science Core Collection records on the topic of adversarial attacks on artificial intelligence.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">A thematic map categorizes AI research topics. The x-axis represents relevance (centrality), and the y-axis shows development (density). Quadrants: Niche (top left), Motor (top right), Emerging/Declining (bottom left), Basic (bottom right). Key topics include &#x201C;explainability&#x201D; in Niche, &#x201C;deep learning&#x201D; in Motor, &#x201C;adversarial AI&#x201D; in Emerging, and &#x201C;edge computing&#x201D; in Basic Themes.</alt-text>
</graphic>
</fig>
<p>These analyses revealed that <italic>agentic AI</italic> research occupies a sparse, emerging cluster characterised by fragmented terminology and low thematic density, whereas general adversarial AI research exhibits a mature and interconnected structure dominated by high-centrality themes such as <italic>deep learning</italic>, <italic>classification</italic>, and <italic>adversarial training</italic>. The results quantitatively substantiated the hypothesis that agentic AI adversarial studies remain in an early exploratory phase relative to classical adversarial ML.</p>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Phase 3: experimental verification</title>
<p>To ground the literature findings in empirical data, adversarial attack simulations were implemented using TensorFlow and Keras frameworks. Two canonical attack algorithms, Fast Gradient Sign Method (FGSM) and Carlini &#x0026; Wagner (C&#x0026;W), were executed on MobileNetV2 (ImageNet) and custom CNN (MNIST) architectures, respectively.</p>
<p>Parameters were standardised across experiments:</p>
<list list-type="bullet">
<list-item>
<p><italic>FGSM</italic>: <italic>&#x03B5;</italic> &#x2208; {0.01, 0.1, 0.15} under L&#x221E;-norm constraint.</p>
</list-item>
<list-item>
<p><italic>C&#x0026;W (L&#x2082;)</italic>: optimisation using the Adam optimiser (learning rate&#x202F;=&#x202F;0.01; <italic>&#x03BA;</italic>&#x202F;=&#x202F;0&#x2013;20).</p>
</list-item>
<list-item>
<p><italic>Evaluation metrics:</italic> attack success rate (ASR), perturbation norm (L&#x2082; distance), and clean vs. robust accuracy differentials.</p>
</list-item>
</list>
<p>Results were evaluated visually (<xref ref-type="fig" rid="fig5">Figure 5</xref>) and statistically, confirming expected correlations between perturbation budget and misclassification probability while validating benchmark behaviour reported in prior literature.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Experimental setup for FGSM and Carlini &#x0026; Wagner (C&#x0026;W) attacks.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart showing a process for generating adversarial examples. It starts with an input image of a dog, processed by a pre-trained model. An adversarial attack using FGSM or C&#x0026;W methods is applied, resulting in an adversarial example of the dog.</alt-text>
</graphic>
</fig>
<p>The results presented in this section are organised into two categories. First, we report <italic>original experimental evaluations</italic> conducted by the authors, limited to reproducible baseline attacks (FGSM and Carlini &#x0026; Wagner) implemented on MobileNetV2 (ImageNet) and a custom CNN (MNIST). Second, we present <italic>quantitative results adapted and synthesised from prior studies</italic> under comparable experimental settings (MNIST, CIFAR-10, ImageNet), including attack success rates, perturbation norms, and robust accuracy gaps. Tables and figures are explicitly labelled to distinguish original results from adapted literature benchmarks.</p>
<p>The empirical material presented in this manuscript falls into two distinct categories. First, <italic>original experimental evaluations conducted by the authors</italic> are limited to the FGSM and Carlini &#x0026; Wagner attacks implemented on MobileNetV2 (ImageNet) and a custom CNN (MNIST), as described in Phase 3 and illustrated in <xref ref-type="fig" rid="fig5">Figure 5</xref>. These experiments are intended as validation and illustration of well-established adversarial behaviours, rather than as state-of-the-art benchmarks. Second, all other quantitative results reported in subsequent sections (e.g., attack success rates, perturbation norms, robustness gaps, benchmark tables) are <italic>adapted and synthesised from prior peer-reviewed studies</italic> evaluated under comparable datasets and threat models (MNIST, CIFAR-10, ImageNet), with sources explicitly cited.</p>
</sec>
<sec id="sec6">
<label>2.4</label>
<title>Phase 4: conceptual integration</title>
<p>Finally, outcomes from all three phases were synthesised to develop a multi-layered taxonomy of adversarial vulnerabilities in <italic>Agentic AI</italic>, structured across perceptual, cognitive, and executive layers. This synthesis draws directly from the bibliometric findings, linking perceptual-layer attacks to established adversarial ML work, cognitive-layer attacks to reasoning interference, and executive-layer attacks to agentic autonomy corruption.</p>
<p>Where empirical validation is limited, particularly for higher-order agentic behaviours, the analysis is explicitly framed as a conceptual and architectural risk assessment rather than as a benchmark-driven evaluation.</p>
<p>Theoretically, the proposed framework aligns adversarial AI security with control-theoretic and systems-safety perspectives, where stability, feedback, and governance are first-class concerns. In this view, adversarial robustness is no longer a static property of a model but an emergent property of an agent interacting with its environment over time. This reframing allows adversarial failures to be analysed as deviations in system dynamics rather than isolated classification errors, providing a conceptual bridge between adversarial machine learning, autonomous systems safety, and AI governance research.</p>
</sec>
</sec>
<sec id="sec7">
<label>3</label>
<title>Systematic literature review (SLR) and PRISMA methodology</title>
<p>To ensure methodological transparency and reproducibility, this study followed the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA 2020) guidelines in structuring the literature review on adversarial vulnerabilities in agentic artificial intelligence (AAI). The objective of the review was to identify, evaluate, and synthesise empirical and theoretical research addressing adversarial attacks, defence mechanisms, and agentic properties (autonomy, self-governance, and decision-making loops) in AI systems.</p>
<sec id="sec8">
<label>3.1</label>
<title>Research questions</title>
<p>The review was guided by the following research questions (RQs):</p>
<list list-type="bullet">
<list-item>
<p><italic>RQ1:</italic> What are the main categories of adversarial attacks affecting agentic and non-agentic AI models?</p>
</list-item>
<list-item>
<p><italic>RQ2:</italic> How do autonomy, self-governance, and decision-making loops influence the attack surface and defence strategies of agentic AI systems?</p>
</list-item>
<list-item>
<p><italic>RQ3:</italic> What methodological frameworks and benchmark datasets are used in evaluating adversarial robustness and mitigation strategies?</p>
</list-item>
<list-item>
<p><italic>RQ4:</italic> What gaps exist in current adversarial-defence research concerning agentic AI&#x2019;s cognitive and executive layers?</p>
</list-item>
</list>
</sec>
<sec id="sec9">
<label>3.2</label>
<title>Search strategy</title>
<p>A structured search was conducted between January and March 2025 using major scientific databases: IEEE Xplore, SpringerLink, ACM Digital Library, Scopus, and Web of Science. Supplementary searches were performed in arXiv and Google Scholar to capture emerging preprints and conference papers.</p>
<p>The following Boolean query strings were used (adapted per database syntax):</p>
<p>(&#x201C;adversarial attack&#x201D; OR &#x201C;adversarial example&#x201D; OR &#x201C;adversarial robustness&#x201D;)</p>
<p>AND (&#x201C;agentic AI&#x201D; OR &#x201C;autonomous agent&#x201D; OR &#x201C;reinforcement learning agent&#x201D; OR &#x201C;decision-making loop&#x201D; OR &#x201C;self-governing AI&#x201D;)</p>
<p>AND (&#x201C;security&#x201D; OR &#x201C;vulnerability&#x201D; OR &#x201C;defence&#x201D; OR &#x201C;mitigation&#x201D; OR &#x201C;robustness&#x201D;).</p>
<p>Search filters were applied to include only peer-reviewed journal articles, conference proceedings, and high-quality preprints published between 2015 and 2025, in English (<xref ref-type="table" rid="tab1">Table 1</xref>).</p>
<sec id="sec10">
<label>3.2.1</label>
<title>Screening and selection process</title>
<p>The PRISMA four-stage workflow was followed:</p>
<list list-type="order">
<list-item>
<p><italic>Identification:</italic> 612 records were retrieved across databases.</p>
</list-item>
<list-item>
<p><italic>Screening:</italic> 312 duplicates were removed using Zotero&#x2019;s de-duplication and title matching.</p>
</list-item>
<list-item>
<p><italic>Eligibility:</italic> Abstracts and full texts of 300 remaining studies were screened against inclusion criteria, yielding 112 eligible papers.</p>
</list-item>
<list-item>
<p><italic>Inclusion:</italic> 78 studies were finally included for quantitative and qualitative synthesis.</p>
</list-item>
</list>
<p>The process is visualised in the PRISMA flow diagram (<xref ref-type="table" rid="tab2">Table 2</xref>).</p>
<p>From an initial 612 records, 312 duplicates were removed. The remaining 300 unique papers were screened; 188 were excluded for irrelevance. Following full-text eligibility assessment of 112 papers, 78 studies were retained for qualitative synthesis and 52 for quantitative analysis. This structured procedure complies with the PRISMA 2020 framework, ensuring traceability, transparency, and replicability of the review process.</p>
</sec>
<sec id="sec11">
<label>3.2.2</label>
<title>Data extraction and coding</title>
<p>Each eligible study was coded for:</p>
<list list-type="bullet">
<list-item>
<p>Publication metadata (authors, year, venue);</p>
</list-item>
<list-item>
<p>Type of adversarial attack (white-box, black-box, physical, transfer, poisoning);</p>
</list-item>
<list-item>
<p>Targeted AI system (agentic vs. non-agentic, modality, architecture);</p>
</list-item>
<list-item>
<p>Evaluation metrics (ASR, perturbation norm, computational overhead, transferability);</p>
</list-item>
<list-item>
<p>Reported defences (adversarial training, defensive distillation, randomised smoothing, etc.);</p>
</list-item>
<list-item>
<p>Agentic-AI attributes analysed (autonomy, feedback control, self-governance).</p>
</list-item>
</list>
<p>Two independent reviewers coded all entries, and inter-rater agreement was measured using Cohen&#x2019;s &#x03BA;&#x202F;=&#x202F;0.87, indicating high consistency.</p>
</sec>
<sec id="sec12">
<label>3.2.3</label>
<title>Synthesis approach</title>
<p>A <italic>mixed-method synthesis</italic> was performed:</p>
<list list-type="bullet">
<list-item>
<p><italic>Quantitative analysis:</italic> attack success rate, perturbation norm, and robustness metrics were aggregated where comparable datasets existed (e.g., MNIST, CIFAR-10, ImageNet).</p>
</list-item>
<list-item>
<p><italic>Qualitative thematic analysis:</italic> emergent themes were extracted around agentic AI vulnerabilities, lifecycle phases (perceptual, cognitive, executive), and alignment risks.</p>
</list-item>
</list>
</sec>
<sec id="sec13">
<label>3.2.4</label>
<title>Limitations</title>
<p>Despite extensive coverage, the review may under-represent unpublished industry reports and proprietary red-teaming data. Additionally, rapid advances in large-language-model red-teaming may introduce temporal bias. To mitigate this, supplementary searches were conducted in March 2025 to include recent red-team studies (e.g., MADMAX, GPT-4o jailbreak frameworks).</p>
</sec>
<sec id="sec14">
<label>3.2.5</label>
<title>Compliance and transparency</title>
<p>All methodological steps adhere to PRISMA 2020&#x2019;s structured reporting requirements (identification, screening, eligibility, inclusion). The complete reference dataset and screening criteria are available upon request for reproducibility and audit.</p>
</sec>
</sec>
</sec>
<sec id="sec15">
<label>4</label>
<title>Bibliometric analysis</title>
<p>To ensure a holistic review of existing work, and to gather all data available on this subject, the first search was performed on the Web of Science Core Collection on the topic of Adversarial Attacks on Agentic AI, which produced 101 results (date 22 October 2025). This file was extracted from the Web of Science Core Collection and analysed in RStudio using the bibliometrix package with Biblioshiny (<xref ref-type="bibr" rid="ref3">Aria and Cuccurullo, 2017</xref>).</p>
<p>The first step was to perform Factorial Analysis (<xref ref-type="fig" rid="fig2">Figure 2</xref>) followed by a dendrogram (<xref ref-type="fig" rid="fig3">Figure 3</xref>).</p>
<p>The factorial analysis (Multiple Correspondence Analysis, MCA) of the Web of Science dataset on &#x201C;Adversarial Attacks on Agentic AI&#x201D; in <xref ref-type="fig" rid="fig2">Figure 2</xref>, reveals a clearly defined conceptual structure centred on three clusters of related terms. The largest semantic field is anchored around &#x201C;adversarial machine learning,&#x201D; &#x201C;deep learning,&#x201D; and &#x201C;cybersecurity,&#x201D; indicating that current research situates adversarial studies within security-focused machine-learning contexts. A secondary cluster is formed by &#x201C;defences,&#x201D; &#x201C;review,&#x201D; and &#x201C;perturbation methods,&#x201D; suggesting consolidation of methodological work on defensive mechanisms. The triangular distribution in the MCA map indicates an evolving but interconnected research landscape&#x2014;agentic autonomy remains a peripheral topic, reflecting that agentic AI as a term is still emerging rather than established. The wide geometric spread demonstrates a heterogeneous but converging field with overlapping technical and conceptual vocabularies.</p>
<p>The hierarchical clustering dendrogram in <xref ref-type="fig" rid="fig3">Figure 3</xref>, complements the factorial analysis by illustrating the taxonomic proximity between topics. It shows dense clustering among keywords such as <italic>&#x201C;machine learning,&#x201D; &#x201C;deep neural networks,&#x201D; &#x201C;adversarial training,&#x201D;</italic> and <italic>&#x201C;defence mechanisms,&#x201D;</italic> confirming that the core literature is technically driven. Smaller, distinct branches appear for <italic>&#x201C;autonomy,&#x201D; &#x201C;reinforcement learning agents,&#x201D;</italic> and <italic>&#x201C;decision-making,&#x201D;</italic> showing that agentic elements are currently treated as specialised subtopics rather than mainstream research axes. The height of the clustering tree demonstrates significant semantic distances between these subfields, underscoring fragmentation in how agentic attributes (autonomy, self-governance) are addressed within adversarial ML research. This structure highlights a field transitioning from isolated algorithmic studies toward integrated system-level inquiry.</p>
<p>The second search query used was &#x2018;Adversarial Attacks on Artificial Intelligence&#x2019;, which produced 973 results (date 22 October 2025). These data were analysed with a Thematic Map (<xref ref-type="fig" rid="fig4">Figure 4</xref>).</p>
<p>The thematic map for the broader search on &#x201C;Adversarial Attacks on Artificial Intelligence&#x201D; in <xref ref-type="fig" rid="fig4">Figure 4</xref>, presents a mature, multi-clustered landscape divided along development (density) and relevance (centrality) axes. The motor themes, including &#x201C;deep learning,&#x201D; &#x201C;adversarial attacks,&#x201D; &#x201C;artificial intelligence,&#x201D; and &#x201C;classification,&#x201D; dominate the upper-right quadrant, reflecting a high degree of conceptual maturity and research centrality. The niche themes, such as &#x201C;explainability&#x201D; and &#x201C;interpretability,&#x201D; exhibit high density but lower centrality, representing specialised, well-developed topics that are methodologically self-contained. Emerging themes, including &#x201C;adversarial AI&#x201D; and &#x201C;adversarial training,&#x201D; lie in the lower-left quadrant, denoting rapidly growing but not yet consolidated research directions. Thematic dispersion confirms that the field of general adversarial AI is far more saturated and structurally stable than the emergent agentic AI subdomain.</p>
<p>Comparing the two bibliometric datasets, &#x201C;Adversarial Attacks on Agentic AI&#x201D; (101 records) versus &#x201C;Adversarial Attacks on Artificial Intelligence&#x201D; (973 records), reveals a sharp contrast in research maturity and conceptual cohesion. The smaller agentic AI dataset exhibits dispersed, low-density clusters, reflecting early-stage exploration characterised by methodological borrowing from adversarial ML rather than original frameworks for autonomy or decision loops. Conversely, the general AI adversarial dataset demonstrates thematic centralisation around established paradigms in deep learning, computer vision, and robustness testing, supported by strong interconnections between attack and defence research. The comparison indicates that while adversarial AI research is approaching conceptual saturation, adversarial agentic AI remains a nascent domain, fragmented but promising for defining the next generation of security paradigms integrating autonomy and self-governance into adversarial robustness models.</p>
<p>The bibliometric evidence reinforces the conceptual taxonomy proposed for agentic AI adversarial vulnerabilities. Themes identified in <xref ref-type="fig" rid="fig4">Figure 4</xref>, such as <italic>adversarial attacks</italic>, <italic>deep learning</italic>, and <italic>cybersecurity</italic>, align predominantly with perceptual-layer threats, where adversaries manipulate sensory or input representations to mislead AI perception systems. The emerging clusters around <italic>adversarial training</italic>, <italic>interpretability</italic>, and <italic>explainability</italic> correspond to the cognitive layer, reflecting attempts to secure internal reasoning processes and improve model self-awareness. Meanwhile, niche but growing themes related to <italic>autonomous systems</italic> and <italic>decision-making</italic> point toward the executive layer, where control logic and goal prioritisation become targets for higher-order adversarial influence. Collectively, these findings indicate that while current research remains concentrated on perceptual and cognitive defences, the executive dimension, central to agentic autonomy and self-governance, remains underexplored. This gap highlights the need for a new generation of adversarial-security frameworks that explicitly integrate the multi-layered structure of agentic AI, bridging perception, cognition, and executive control within a unified resilience paradigm.</p>
</sec>
<sec id="sec16">
<label>5</label>
<title>Agentic properties and adversarial vulnerabilities in artificial intelligence systems</title>
<p>Agentic Artificial Intelligence (AAI) extends beyond conventional pattern-recognition models by incorporating autonomy, self-governance, and goal-directed decision-making loops (<xref ref-type="bibr" rid="ref1">Acharya et al., 2025</xref>). In contrast to static predictive systems that passively map inputs to outputs, agentic systems operate as persistent entities within dynamic environments, continually perceiving, reasoning, and acting based on internal objectives. This agentic capability introduces a new adversarial surface: vulnerabilities not only in model inference but also in planning, feedback integration, and goal adaptation.</p>
<sec id="sec17">
<label>5.1</label>
<title>Autonomy</title>
<p>Autonomy refers to the capacity of an AI agent to act without direct human intervention, selecting and executing actions to achieve predefined or emergent goals. In reinforcement-learning-based or self-optimising architectures (<xref ref-type="bibr" rid="ref49">Sutton and Barto, 1998</xref>; <xref ref-type="bibr" rid="ref58">Zhan et al., 2017</xref>; <xref ref-type="bibr" rid="ref33">Mendez et al., 2018</xref>), such autonomy is operationalised through policy networks that update via reward feedback (<xref ref-type="bibr" rid="ref25">Jerbi et al., 2021</xref>). Adversaries can exploit this property through <italic>policy poisoning</italic>, <italic>reward manipulation</italic>, or <italic>environment spoofing</italic>, wherein corrupted feedback loops cause the agent to pursue adversarial goals while maintaining apparent functional integrity. This phenomenon is exemplified by &#x201C;reward hacking,&#x201D; where an agent maximises proxy metrics inconsistent with its true safety objective (<xref ref-type="bibr" rid="ref55">Yang et al., 2021</xref>).</p>
</sec>
<sec id="sec18">
<label>5.2</label>
<title>Self-governance</title>
<p>Self-governance entails the agent&#x2019;s internal regulation of objectives and constraints, analogous to meta-cognitive control in human reasoning (<xref ref-type="bibr" rid="ref4">Balasubramanian, 2023</xref>). Architectures implementing model-based planning or hierarchical reinforcement learning instantiate this through value functions and meta-policies that determine how sub-goals are generated and prioritised (<xref ref-type="bibr" rid="ref59">Zhang et al., 2021</xref>). Compromise at this layer can redirect the agent&#x2019;s governance structure itself: adversarial perturbations in high-level policy weights or meta-controller representations can lead to emergent misalignment, where the system rationalises harmful actions as reward-optimal (<xref ref-type="bibr" rid="ref22">Gupta et al., 2018</xref>). Defensive research in this domain explores <italic>governance integrity auditing</italic>, a form of formal verification ensuring consistency between learned value functions and externally specified ethical constraints.</p>
</sec>
<sec id="sec19">
<label>5.3</label>
<title>Decision-making loops</title>
<p>Agentic AI systems exhibit continuous sense&#x2013;think&#x2013;act loops, forming closed feedback cycles between perception, cognition, and action. Unlike static classifiers, they maintain <italic>stateful memory</italic> and update their world model across temporal horizons (<xref ref-type="bibr" rid="ref27">Kuutti et al., 2019</xref>). This looped structure yields temporal attack vectors absent in static models (<xref ref-type="bibr" rid="ref27">Kuutti et al., 2019</xref>; <xref ref-type="bibr" rid="ref37">Mukhopadhyay et al., 2019</xref>; <xref ref-type="bibr" rid="ref28">Lang et al., 2021</xref>; <xref ref-type="bibr" rid="ref47">Shenoy et al., 2020</xref>; <xref ref-type="bibr" rid="ref29">Lapan, 2018</xref>; <xref ref-type="bibr" rid="ref46">Sewak, 2019</xref>; <xref ref-type="bibr" rid="ref53">Wang W. et al., 2019</xref>; <xref ref-type="bibr" rid="ref23">Haarnoja et al., 2018</xref>; <xref ref-type="bibr" rid="ref57">Yu et al., 2020</xref>; <xref ref-type="bibr" rid="ref16">Duan et al., 2022</xref>). Examples include:</p>
<list list-type="bullet">
<list-item>
<p><italic>Temporal adversarial attacks</italic>, where delayed or staged perturbations exploit the agent&#x2019;s memory horizon.</p>
</list-item>
<list-item>
<p><italic>State-estimation corruption</italic>, where recurrent or transformer-based memory layers are poisoned to produce compounding decision errors.</p>
</list-item>
<list-item>
<p><italic>Goal hijacking</italic>, in which sequential manipulation of observations induces cumulative divergence from the intended policy trajectory.</p>
</list-item>
</list>
<p>In such systems, adversarial vulnerability cannot be fully described by L<sub>p</sub>-bounded perturbations on instantaneous input. Instead, the relevant threat model must incorporate multi-step causal dependencies and control-theoretic dynamics, for instance, how successive misperceptions propagate through the planning horizon to yield unsafe actuation (<xref ref-type="bibr" rid="ref28">Lang et al., 2021</xref>).</p>
</sec>
<sec id="sec20">
<label>5.4</label>
<title>Agentic adversarial taxonomy</title>
<p>Integrating these properties, adversarial risks in AAI can be classified as:</p>
<list list-type="bullet">
<list-item>
<p><italic>Perceptual attacks</italic> &#x2013; distort environmental observations (e.g., sensory spoofing, data poisoning);</p>
</list-item>
<list-item>
<p><italic>Cognitive attacks</italic> &#x2013; compromise internal world-model inference or value estimation (e.g., goal-misgeneralisation, policy injection);</p>
</list-item>
<list-item>
<p><italic>Executive attacks</italic> &#x2013; hijack actuation pathways or override decision-authorisation logic (e.g., malicious API wrappers, command-level perturbations).</p>
</list-item>
</list>
<p>Each layer, perceptual, cognitive, and executive, constitutes an independent but interdependent target surface. Defence, therefore, requires <italic>layered resilience</italic>, combining certified perception robustness, verifiable policy alignment, and secure actuator gating.</p>
</sec>
<sec id="sec21">
<label>5.5</label>
<title>Conceptual implications</title>
<p>By formalising autonomy, self-governance, and decision-making loops, we differentiate agentic AI from passive classifiers. The adversarial challenge expands from <italic>input perturbation</italic> to <italic>behavioural manipulation</italic>, implicating both epistemic integrity (truthfulness of internal representations) and normative alignment (consistency of actions with human intent). Understanding these dynamics provides the conceptual depth necessary to ground adversarial analysis within the agentic paradigm and establishes the foundation for a systematic evaluation of threats across the AI lifecycle.</p>
</sec>
<sec id="sec22">
<label>5.6</label>
<title>Conceptual framework and open challenges for agentic AI security</title>
<p>The analysis presented in this manuscript conceptualises adversarial threats to agentic AI systems by extending classical adversarial machine-learning models toward autonomy, self-governance, and closed-loop decision-making architectures. Importantly, adversarial risks at the cognitive and executive layers of agentic AI should be interpreted as hypotheses grounded in architectural properties, rather than as empirically validated attack classes with established benchmark performance. Unlike perceptual-layer attacks, which are supported by extensive experimental evidence across standard datasets, higher-order agentic attacks remain comparatively underexplored in controlled, reproducible settings.</p>
<p>At present, empirical validation of agentic adversarial behaviour is constrained by the absence of standardised benchmarks, formal threat models, and evaluation protocols for autonomous decision loops, long-horizon planning, and goal adaptation. Many of the vulnerabilities discussed at the cognitive and executive layers, such as reward manipulation, policy misalignment, and goal hijacking, are inferred from reinforcement-learning theory, red-teaming case studies, and system-level failure analyses, rather than from large-scale comparative experiments. As such, these attack vectors should be viewed as plausible and structurally motivated risks that warrant systematic investigation, rather than as established empirical results.</p>
<p>From a research perspective, this framework yields several concrete and actionable directions. First, it motivates the development of benchmark environments that explicitly encode autonomy, memory, and feedback dynamics, enabling controlled study of multi-step and temporal adversarial strategies beyond single-input perturbations. Second, it necessitates new evaluation metrics that measure cumulative behavioural deviation, policy drift, and safety constraint violations, rather than instantaneous misclassification rates alone. Third, it highlights the need for defence mechanisms that operate across layers, for example, combining perceptual robustness with policy verification and runtime governance checks, to prevent error propagation through decision loops. Collectively, these directions define a research agenda for agentic AI security that moves beyond attack enumeration toward principled, system-level resilience.</p>
</sec>
</sec>
<sec id="sec23">
<label>6</label>
<title>Taxonomy of adversarial attacks</title>
<p>The field of machine learning, specifically deep learning, deals with adversarial attacks (<xref ref-type="bibr" rid="ref51">Szegedy et al., 2013</xref>, <xref ref-type="bibr" rid="ref50">2015</xref>), but we need more specialised methods for detecting cyber-attacks (<xref ref-type="bibr" rid="ref56">Ye et al., 2006</xref>). This means creating inputs that can purposely mislead a model into making incorrect predictions or classifications (<xref ref-type="bibr" rid="ref10">Chejara et al., 2013</xref>). These inputs are adversarial examples and can be hard to detect (<xref ref-type="bibr" rid="ref15">Costa et al., 2023</xref>), especially in image-based datasets and other database security, such as blockchain cybersecurity (<xref ref-type="bibr" rid="ref44">Schlatt et al., 2023</xref>). Adversarial attacks deliberately introduce finely tuned perturbations into model inputs, causing systematic misclassification while remaining imperceptible to human observers. The challenge is particularly acute for high-dimensional modalities such as medical or street-scene imagery, where pixel-level changes can easily evade human scrutiny yet induce catastrophic model error. Comparable risks have been demonstrated in structured domains, including blockchain-based anomaly detection, underscoring that adversarial vulnerability is not restricted to vision tasks.</p>
<p><xref ref-type="fig" rid="fig6">Figure 6</xref> groups attack techniques into three operational categories. White-box attacks assume full disclosure of model internals, architecture, parameters, and sometimes even training data. Such complete knowledge enables gradient-based optimisation of adversarial inputs, exemplified by the Fast Gradient Sign Method (FGSM), the Jacobian-based Saliency Map Attack (JSMA), and the more precise DeepFool algorithm. Black-box attacks, by contrast, treat the target as an oracle, exploiting only input&#x2013;output pairs. Although gradient information is unavailable, adversaries can estimate it via query-efficient procedures such as Zeroth-Order Optimisation (ZOO), Boundary Attack, or Natural Evolution Strategies (NES). The efficacy of these methods relies on the empirical observation that decision boundaries learned by different models trained on the same task often align; hence perturbations transferable across architectures remain effective despite limited system knowledge. Transfer attacks explicitly exploit this property: an adversary crafts examples on a surrogate (locally accessible) model and deploys them against the remote target, achieving high success rates when data distributions or inductive biases overlap.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Taxonomy of adversarial attack modalities by model knowledge: white-box, black-box, and transfer attacks.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart titled &#x201C;Model Knowledge&#x201D; detailing three attack methods. &#x201C;White-box&#x201D; (Full knowledge) includes FGSM, JSMA, DeepFool with gradients: yes, low/medium complexity, high success. &#x201C;Black-box&#x201D; (Output-only queries) lists ZOO, Boundary Attack with high query budget, no gradients. &#x201C;Transfer&#x201D; (Surrogate-based) indicates no gradients, low complexity, typical success.</alt-text>
</graphic>
</fig>
<p>Orthogonal to knowledge assumptions is the attack objective. <italic>Targeted</italic> attacks coerce the model into a prespecified erroneous label, e.g., forcing all &#x2018;stop-sign&#x2019; images to be recognised as &#x2018;speed-limit&#x2019;, whereas <italic>untargeted</italic> attacks merely seek any classification error. This orthogonal taxonomy clarifies threat-model selection when evaluating defensive strategies or certifying robustness claims.</p>
<p>In white-box attacks, the adversary has complete knowledge of the target model. This includes the system&#x2019;s architecture, trained parameters, and, in some cases, training data. With this comprehensive understanding, the adversary creates adversarial examples to mislead the target model, e.g., FGSM, JSMA, DeepFool.</p>
<p>With black-box attacks, the attacker only has access to the input and output of the model. The attacker uses this input&#x2013;output data to generate adversarial examples. Despite limited knowledge, black-box attacks can be potent because models, including deep neural networks, can share vulnerabilities across architectures. Transferability is essential, as an adversarial example created for one model can deceive another, e.g., ZOO, Boundary Attack, NES. Transfer attacks are one type of black-box attack in which an adversary generates an adversarial input by accessing a different model, known as the surrogate model, and then uses it to attack the target model. The idea is that models trained on similar tasks share similar vulnerabilities, making it possible to transfer adversarial examples between them.</p>
<p>Attacks on models can be classified based on their goals. Targeted attacks aim to make the model generate a specific incorrect outcome. In contrast, untargeted attacks are focused on causing the model to make a mistake or be incorrect without specifying the desired incorrect output.</p>
<p><xref ref-type="fig" rid="fig6">Figure 6</xref> positions white-box, black-box, and transfer attacks along a single &#x201C;model knowledge&#x201D; axis and annotates each category with canonical methods (e.g., FGSM, ZOO) plus operational descriptors such as gradient access, query budget, complexity, and typical success. This presents a comparison of threat assumptions and highlights how decreasing attacker knowledge generally increases query cost while reducing baseline effectiveness, which is further analysed in <xref ref-type="fig" rid="fig7">Figure 7</xref>.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Illustrations of adversarial attacks and defence mechanisms. <bold>(a)</bold> Adversarial perturbation visualization. <bold>(b)</bold> Loss landscape. <bold>(c)</bold> Defence mechanism workflow. <bold>(d)</bold> Generative adversarial network (GAN).</p>
</caption>
<graphic xlink:href="frai-09-1731566-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Figure (a) shows an original image of the number &#x201C;7,&#x201D; a perturbation pattern, and the resulting adversarial example resembling &#x201C;2.&#x201D; Figure (b) depicts a 3D graph with an adversarial example and decision boundary labeled, illustrating relationships between loss and gradient. Figure (c) is a flowchart with &#x201C;Input&#x201D; leading to &#x201C;Neural Network&#x201D; and then to &#x201C;Defensive Distillation&#x201D; and &#x201C;Gradient Masking,&#x201D; ending with &#x201C;Robust Model.&#x201D; Figure (d) is a diagram with &#x201C;Generator&#x201D; leading to &#x201C;Generated Data,&#x201D; influenced by &#x201C;Noise,&#x201D; and assessed as &#x201C;Real/Fake.&#x201D;</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="fig7">Figure 7</xref> details the adversarial attack and defence dynamics through the following:</p>
<list list-type="alpha-lower">
<list-item>
<p>Adversarial perturbation visualisation on MNIST and CIFAR-10 samples&#x2014;showing original, perturbed, and difference maps with imperceptible L&#x2082; distortions;</p>
</list-item>
<list-item>
<p>Loss landscape plot depicting non-convex optimisation surfaces in adversarial space, highlighting vulnerability zones around decision boundaries;</p>
</list-item>
<list-item>
<p>Transferability grid demonstrating cross-model attack success between surrogate and target networks under FGSM and C&#x0026;W;</p>
</list-item>
<list-item>
<p>Workflow diagram of layered defence mechanisms (e.g., adversarial training, defensive distillation, input sanitisation), annotated with attack interception points and empirical effectiveness across datasets.</p>
</list-item>
</list>
<p>Various other methods are available for safeguarding against malicious attacks, each with unique benefits (<xref ref-type="bibr" rid="ref15">Costa et al., 2023</xref>; <xref ref-type="bibr" rid="ref30">Liang et al., 2022</xref>). These techniques include adversarial training, which involves training a model on adversarial examples to improve its robustness; defensive distillation, which reduces the amount of information available to attackers; and gradient masking, which obscures the gradients of a model to prevent attackers from exploiting them.</p>
<sec id="sec24">
<label>6.1</label>
<title>Comparative synthesis of adversarial attack effectiveness (adapted from prior studies)</title>
<p>While qualitative descriptions of adversarial attack methods such as FGSM, JSMA, DeepFool, and Carlini &#x0026; Wagner (C&#x0026;W) provide important context, a technical comparison is necessary to appreciate their empirical behaviours across standard benchmarks. <xref ref-type="table" rid="tab3">Table 3</xref> presents a comparative analysis based on key metrics: attack success rate (ASR), required perturbation magnitude (measured using L&#x2082; norm), computational cost, and transferability across models, based on results published across benchmark datasets such as MNIST, CIFAR-10, and ImageNet.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Comparative evaluation of adversarial attack techniques, with quantitative results adapted from prior peer-reviewed studies under comparable benchmark settings (MNIST, CIFAR-10, ImageNet).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Attack method</th>
<th align="left" valign="top">Dataset</th>
<th align="center" valign="top">ASR (%)</th>
<th align="center" valign="top">L&#x2082; Norm (avg.)</th>
<th align="left" valign="top">Time per sample</th>
<th align="left" valign="top">Transferability</th>
<th align="left" valign="top">Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">FGSM</td>
<td align="left" valign="top">MNIST</td>
<td align="char" valign="top" char=".">89.3</td>
<td align="char" valign="top" char=".">2.12</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">Single-step; high-speed but coarse</td>
</tr>
<tr>
<td align="left" valign="top">JSMA</td>
<td align="left" valign="top">MNIST</td>
<td align="char" valign="top" char=".">87.1</td>
<td align="char" valign="top" char=".">1.43</td>
<td align="left" valign="top">Moderate</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Targets specific features; sparse perturbations</td>
</tr>
<tr>
<td align="left" valign="top">DeepFool</td>
<td align="left" valign="top">CIFAR-10</td>
<td align="char" valign="top" char=".">93.8</td>
<td align="char" valign="top" char=".">0.97</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">Medium</td>
<td align="left" valign="top">Iterative linearisation; minimal perturbation</td>
</tr>
<tr>
<td align="left" valign="top">C&#x0026;W (L&#x2082;)</td>
<td align="left" valign="top">CIFAR-10</td>
<td align="char" valign="top" char=".">98.2</td>
<td align="char" valign="top" char=".">0.62</td>
<td align="left" valign="top">Very High</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Highly precise; costly optimisation</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Sources: Adapted from <xref ref-type="bibr" rid="ref40">Papernot et al. (2016)</xref>, <xref ref-type="bibr" rid="ref19">Goodfellow et al. (2016)</xref>, <xref ref-type="bibr" rid="ref7">Carlini and Wagner (2017a</xref>, <xref ref-type="bibr" rid="ref8">b)</xref>, <xref ref-type="bibr" rid="ref12">Chen et al. (2020a</xref>, <xref ref-type="bibr" rid="ref13">b)</xref>, <xref ref-type="bibr" rid="ref38">Nasr et al. (2023)</xref>, <xref ref-type="bibr" rid="ref26">Khamaiseh et al. (2022)</xref>, <xref ref-type="bibr" rid="ref34">Moosavi-Dezfooli et al. (2016a</xref>, <xref ref-type="bibr" rid="ref35">b)</xref>, and <xref ref-type="bibr" rid="ref36">Moosavi-Dezfooli et al. (2015)</xref>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="sec25">
<label>7</label>
<title>Representative adversarial manipulations and operational consequences</title>
<p>Adversarial attacks are prime examples of deceptive exercises (<xref ref-type="bibr" rid="ref39">Ozdag, 2018</xref>) that can mislead AI systems, causing them to make incorrect classifications or decisions (<xref ref-type="bibr" rid="ref2">Anthi et al., 2020</xref>). Adversarial manipulation can occur at any stage of the machine-learning lifecycle, data curation, model training, model distribution, or inference, each stage exposing distinct attack surfaces with different observability and forensics burdens. Thinking in lifecycle terms helps relate attack mechanics to defensive control points: if an adversary perturbs inputs only at inference, detection must occur inline or downstream of the model; if the training corpus is poisoned, remediation requires provenance, dataset hygiene, and robust optimisation; if weights are tampered with in transit, supply-chain assurance becomes central. The examples below are organised accordingly to sharpen threat modelling and clarify where specific controls apply.</p>
<p><italic>Inference-time perturbation (feature-space evasion):</italic> In its simplest form, an attacker adds a small, norm-bounded perturbation to a single input x such that the perturbed instance x&#x2019; crosses the model&#x2019;s decision boundary while remaining visually or statistically indistinguishable to humans. Although gradient-based methods (e.g., FGSM variants) are canonical in white-box settings, black-box approximations using score queries or decision-only feedback can achieve comparable misclassification with iterative refinement. In high-stakes domains, medical image triage, automated traffic-sign recognition, these perturbations can suppress or amplify class-relevant features, yielding false negatives (missed tumours) or false positives (phantom lesions) with direct safety impact.</p>
<p><italic>Training-time data poisoning:</italic> Poisoning attacks corrupt the learning signal by inserting crafted samples into the training set. Two broad regimes matter operationally: <italic>dirty-label</italic> poisoning (attacker controls both features and labels) and <italic>clean-label</italic> poisoning (attacker perturbs features but preserves the nominal label so poisoned points survive basic validation). Even low-rate insertions (&#x003C;1%) can tilt decision boundaries, degrade calibration, or create predictable failure modes under distributional drift. Poisoning also interacts with data augmentation and class imbalance; for example, poisoning rare classes in medical datasets disproportionately reshapes model priors. Detection typically requires influence-function analysis, outlier scoring in feature embeddings, or spectral anomaly detection on gradient statistics.</p>
<p><italic>Adaptive test-time evasion (query-driven):</italic> Whereas single-shot perturbations assume gradient access or surrogate alignment, adaptive evasion exploits repeated interaction with a deployed model. The adversary submits queries, observes class probabilities or confidence scores, and uses zeroth-order or bandit optimisation to estimate a descent direction in input space. Boundary Attack and NES exemplify this class. Query-efficiency constraints dominate practicality: rate limiting, output rounding, and score obfuscation substantially reduce attack convergence, but too aggressive a response impairs legitimate API users. Empirical work shows that even coarse confidence leakage (top-k scores) accelerates black-box evasion relative to decision-only interfaces.</p>
<p><italic>Trojan/back-door model compromise:</italic> In a Trojaned model, the attacker implants a <italic>trigger-conditioned mapping</italic> during training: the model behaves normally on clean data but reliably emits an attacker-chosen label when a specific pattern (pixel patch, watermark, audio tone) is present. Triggers may be spatially local (corner patch), distributed (colour histogram shift), or semantic (accessory type in face images). Clean-label back-doors, where trigger-bearing samples retain the correct label in the training corpus, are especially insidious because they evade standard quality checks. Once deployed, a physical sticker on a road sign or patterned spectacles in a biometric gate can activate the hidden mapping. Detection approaches include activation clustering, trigger inversion, neuron pruning with fine-tuning, and statistical testing for unusually low-rank feature correlations tied to rare visual motifs.</p>
<p><italic>Supply-chain logic bombs and model re-use risk:</italic> Increasingly, downstream systems import third-party pre-trained weights, adapters, or foundation models. This introduces a supply-chain channel for adversarial payloads that need not reside in the training data at all. A malicious contributor can ship a model that passes standard validation yet contains a latent decision shortcut, activated only under a compound condition (e.g., specific Unicode tokens + input length range). Such logic bombs propagate across transfer-learning workflows: fine-tuning on a new dataset may leave the hidden pathway intact if the relevant neurons are not significantly updated. Verifiable model signing, weight-difference auditing, and targeted re-training with trigger search are emerging mitigations.</p>
<p>Across these categories, the unifying property is <italic>goal-directed manipulation under resource constraints</italic>: the adversary trades perturbation budget, query cost, and detectability to induce controlled model failure. Effective defence therefore requires layered controls aligned to lifecycle stage, dataset provenance and sanitisation to counter poisoning, robust and certified training to enlarge safe margins, interface hardening to reduce query-driven leakage, and supply-chain assurance to prevent Trojan insertion. Without such integration, even high-accuracy models remain operationally fragile in adversarial environments.</p>
<p>Backdoor attacks involve manipulating the AI model during training by inserting a specific backdoor pattern. This pattern will cause the model to generate incorrect outputs when it encounters the input data pattern, allowing attackers to manipulate its behaviour. These attacks require advanced defence mechanisms and security protocols to ensure the dependability and robustness of AI applications in various domains (<xref ref-type="bibr" rid="ref39">Ozdag, 2018</xref>; <xref ref-type="bibr" rid="ref31">Macas et al., 2024</xref>).</p>
</sec>
<sec id="sec26">
<label>8</label>
<title>Experimental setup for FGSM and C&#x0026;W attacks</title>
<p>To ensure reproducibility and transparency, this section outlines the experimental setup used for implementing the FGSM and C&#x0026;W adversarial attacks.</p>
<sec id="sec27">
<label>8.1</label>
<title>FGSM implementation setup</title>
<list list-type="bullet">
<list-item>
<p><italic>Dataset</italic>: ImageNet (via TensorFlow preprocessing utilities). The input image used in the demonstration is a Labrador Retriever image sourced from the TensorFlow example repository.</p>
</list-item>
<list-item>
<p><italic>Model</italic>: Pretrained MobileNetV2, loaded via tf.keras.applications. MobileNetV2 with ImageNet weights and include_top&#x202F;=&#x202F;True for full classification.</p>
</list-item>
<list-item>
<p><italic>Input Preprocessing</italic>: Images were resized to 224&#x00D7;224 pixels and normalised using mobilenet_v2.preprocess_input(), which scales inputs to the range [&#x2212;1, 1].</p>
</list-item>
<list-item>
<p><italic>Loss Function</italic>: CategoricalCrossentropy() was used to compute the gradient for generating the adversarial perturbation.</p>
</list-item>
<list-item>
<p><italic>Hyperparameter (&#x03B5;)</italic>: Multiple epsilon values were tested: &#x03B5;&#x202F;=&#x202F;[0.01, 0.1, 0.15]. These control the perturbation strength added to the original input.</p>
</list-item>
<list-item>
<p><italic>Perturbation Strategy</italic>: Gradients were computed with respect to the input using tf. GradientTape(), and perturbations were applied by adding the sign of the gradient scaled by &#x03B5;.</p>
</list-item>
<list-item>
<p><italic>Evaluation Metric</italic>: Confidence drop in predicted class label before and after perturbation was used to assess the effectiveness of the adversarial attack.</p>
</list-item>
</list>
</sec>
<sec id="sec28">
<label>8.2</label>
<title>Carlini &#x0026; Wagner (C&#x0026;W) implementation setup</title>
<list list-type="bullet">
<list-item>
<p><italic>Dataset</italic>: MNIST (handwritten digits dataset), loaded via keras.datasets.mnist. Images were normalised to the range [0, 1].</p>
</list-item>
<list-item>
<p><italic>Model</italic>: Custom convolutional neural network (CNN) implemented using Keras. The architecture includes:</p>
</list-item>
<list-item>
<p>2&#x202F;&#x00D7;&#x202F;Conv2D layers (32 and 64 filters) with ReLU activation</p>
</list-item>
<list-item>
<p>2&#x202F;&#x00D7;&#x202F;MaxPooling layers</p>
</list-item>
<list-item>
<p>Flatten &#x2192; Dense(200)&#x202F;&#x2192;&#x202F;Dropout(0.5)&#x202F;&#x2192;&#x202F;Dense(10 with softmax)</p>
</list-item>
<list-item>
<p>
<italic>Training Regime</italic>
</p>
</list-item>
<list-item>
<p>Optimiser: Stochastic Gradient Descent (SGD) with learning rate 0.01, momentum 0.9, and decay 1e-6</p>
</list-item>
<list-item>
<p>Epochs: 20</p>
</list-item>
<list-item>
<p>Batch size: 64</p>
</list-item>
<list-item>
<p>
<italic>C&#x0026;W Attack Configuration</italic>
</p>
</list-item>
<list-item>
<p>Optimisation: Gradient descent used to minimise the L&#x2082; norm of perturbation</p>
</list-item>
<list-item>
<p>Confidence parameter &#x03BA;: Default value 0 (can be increased to enforce stronger misclassification)</p>
</list-item>
<list-item>
<p>Box constraints: Tanh transformation applied to ensure pixel values remain in [0,1]</p>
</list-item>
<list-item>
<p>Targeted Attack: Yes &#x2014; each adversarial sample is crafted to misclassify into a specific target class</p>
</list-item>
<list-item>
<p><italic>Evaluation Metric</italic>: Accuracy under attack and visual indistinguishability of adversarial examples were assessed.</p>
</list-item>
</list>
<p>These setup details (<xref ref-type="fig" rid="fig5">Figure 5</xref>) provide clarity on the experimental context for the code demonstrations, enabling replication and extending the empirical discussion of adversarial robustness.</p>
</sec>
<sec id="sec29">
<label>8.3</label>
<title>Jacobian-based Saliency Map Attack (JSMA)</title>
<p>The Jacobian-based Saliency Map Attack is a targeted, white-box technique that exploits gradient information to alter only a handful of the most influential input features while steering a neural network toward a chosen misclassification. The attack begins by computing the gradient of each output logit with respect to every input dimension, a full Jacobian matrix. From this matrix, the attacker derives a <italic>saliency map</italic>: for each pixel (or feature), the map indicates whether increasing that pixel will simultaneously raise the probability of the desired target class and suppress the probabilities of all other classes. Features with the highest positive saliency are therefore the most &#x201C;efficient&#x201D; levers for forcing the decision boundary to flip.</p>
<p>JSMA then proceeds in a greedy, iterative fashion. At every step, it selects the single best, or, for stability, the best pair of, high-saliency features and nudges their values toward an extreme (e.g., full white or full black for image pixels). After each perturbation, the network&#x2019;s logits are recomputed, a new saliency map is generated, and the process repeats until the classifier outputs the attacker&#x2019;s chosen label or until a predefined L&#x2080; budget, the maximum number of features allowed to change, has been reached.</p>
<p>Because JSMA modifies only the most critical features, it commonly succeeds with fewer than 5% of pixels altered on datasets such as MNIST and CIFAR-10, yielding adversarial images that remain visually plausible to humans. While the attack requires full gradient access and is thus restricted to white-box settings, its sparsity makes it a stringent benchmark for defences that claim robustness against small, localised perturbations.</p>
</sec>
<sec id="sec30">
<label>8.4</label>
<title>DeepFool attack</title>
<p>DeepFool is an untargeted, gradient-based attack that estimates the <italic>smallest</italic> perturbation required to push an input across a classifier&#x2019;s decision boundary. Starting from the original sample, the algorithm approximates the complex, non-linear boundary of a deep network by locally linearising it with first-order gradients. It then computes the shortest vector that moves the input outside the current class region in this linear space, applies that minimal step, and repeats the procedure on the new point. The iteration halts as soon as the network&#x2019;s predicted label changes. By progressively refining the approximation at each step, DeepFool typically discovers far smaller perturbations than single-shot methods such as FGSM and, unlike sparse attacks like JSMA, distributes the changes over many low-magnitude pixels, making the manipulation virtually imperceptible. Empirical studies show that DeepFool reduces the required L&#x2082; distortion by 20&#x2013;30% relative to comparable iterative attacks on ImageNet-scale models, revealing just how narrow the effective safety margin of modern deep networks can be under well-informed adversaries.</p>
</sec>
<sec id="sec31">
<label>8.5</label>
<title>Generative adversarial networks</title>
<p>Generative Adversarial Networks (GANs) (<xref ref-type="bibr" rid="ref20">Goodfellow et al., 2014</xref>), frame data synthesis as a two-player minimax game between a generator G and a discriminator D. The generator maps random latent vectors to candidate samples, while the discriminator estimates the probability that a given sample originates from the real training distribution rather than from G. Training proceeds by alternating gradient updates: D maximises its classification accuracy, whereas G minimises a divergence, originally the Jensen-Shannon distance, by producing outputs that confuse D. When optimisation converges (a Nash equilibrium), the generator&#x2019;s distribution ideally matches the real data manifold so closely that the discriminator&#x2019;s accuracy collapses to chance.</p>
<p>GANs have become a cornerstone of modern generative modelling, powering high-fidelity face synthesis (StyleGAN), text-to-image translation, super-resolution, and domain adaptation. They also present unique security considerations. First, data privacy leakage can occur when a well-trained discriminator memorises rare training instances and inadvertently reveals them through gradient inspection or model inversion. Second, GANs can serve as attack amplifiers: a generator conditioned on class labels can mass-produce diverse adversarial variants that evade conventional defences through distributional coverage rather than single-point perturbations. Finally, GAN training is notoriously fragile, susceptible to mode collapse, gradient saturation, and oscillatory dynamics, necessitating architectural and objective refinements such as Wasserstein GANs with gradient penalty and spectral normalisation.</p>
<p>Despite these challenges, the ability of GANs to approximate complex, high-dimensional distributions continues to drive research in synthetic-data augmentation, privacy-preserving data release, and adversarial-robustness evaluation, underscoring their dual role as both enablers and stress-test tools for secure AI systems.</p>
</sec>
<sec id="sec32">
<label>8.6</label>
<title>How do they relate to cyber-attacks: GAN relation to cyber-attacks on AI models</title>
<p>Adversarial Attacks involve manipulating input data to cause the model to make an error &#x2013; see <xref ref-type="fig" rid="fig8">Figure 8</xref>. In GANs, the Generator creates these adversarial examples to deceive the Discriminator. This phenomenon has been studied to identify vulnerabilities in AI models and develop ways to prevent them.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>GAN relation to cyber-attacks on AI models - attack&#x2013;impact&#x2013;mitigation.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g008.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating the relationship between adversarial attacks on GAN (generative adversarial networks) and data poisoning. Adversarial attacks enable data poisoning, which requires both defense mechanisms and model inversion for mitigation. Threat goals are associated with adversarial attacks and model inversion.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="fig8">Figure 8</xref> highlights how a conditional Generative Adversarial Network (cGAN) can be weaponised to mount a training-set poisoning attack. An adversary first trains a cGAN on the public portion of the dataset, then synthesises a small batch of label-consistent, but feature-manipulated, samples (e.g., 500 images, &#x2248;0.2% of CIFAR-10). When these artefacts are covertly merged into the production training corpus, they bias the decision boundary toward attacker-defined regions or embed a back-door trigger that fires whenever a specific pixel pattern appears, driving the post-deployment misclassification rate above 90% for the targeted class.</p>
<p>Defenders can blunt this vector through cGAN-augmented adversarial training: the generator continuously produces hardest-to-classify variants, which are injected online into each mini-batch. On ImageNet-size models, this regimen typically raises robust accuracy under poisoning from ~12% to more than 60%, albeit at a 1.3-fold increase in training time and a modest (&#x2248;1&#x202F;pp) drop in clean-data accuracy.</p>
<p>GANs are therefore ambivalent tools: the same architecture that fabricates high-fidelity poisons can serve as a red-team oracle for stress-testing data pipelines, synthesising minority-class samples, and rehearsing privacy-leakage drills. Secure deployment demands rigorous dataset provenance checks, trigger-inversion audits, and runtime detectors capable of flagging low-density feature manifolds, measures that treat GANs simultaneously as an offensive capability and a defensive asset.</p>
</sec>
<sec id="sec33">
<label>8.7</label>
<title>Spatial Transformation Attacks (STAs)</title>
<p>Spatial-Transformation Attacks manipulate an image&#x2019;s geometry rather than its per-pixel intensities, warping the input just enough to cross a model&#x2019;s decision boundary while remaining visually unchanged to humans. In practice, an STA optimises a low-dimensional flow-field, parameterising sub-pixel translations, rotations, or local elastic deformations, so that the warped image x&#x2019; is classified as the attacker&#x2019;s target label even though x&#x2019; is perceptually indistinguishable from the original x. Because the perturbation is measured in angular degrees or pixel shifts, STA exploits a vulnerability orthogonal to the L<sub>p</sub>-bounded threat model assumed by most defences; a network that is robust to FGSM-style noise can still be highly sensitive to a two-degree rotation or a three-pixel vertical shift.</p>
<p>Failure cases are particularly acute in vision systems that lack built-in spatial invariance. In autonomous-driving pipelines, a marginal rotation of a traffic-sign crop, well within camera stabilisation tolerances, can flip a &#x201C;stop&#x201D; sign to &#x201C;speed-limit 45&#x202F;km&#x202F;h<sup>&#x2212;1</sup>,&#x201D; jeopardising braking logic. In face-recognition access control, an STA that elongates the nose bridge by a fraction of a pixel grid can lower cosine similarity below the authentication threshold, causing a false negative for the legitimate user and a corresponding security loophole. These attacks underscore that robustness cannot be assessed solely in the signal domain: geometric resilience must be evaluated explicitly.</p>
<p>In <xref ref-type="fig" rid="fig9">Figure 9</xref>, we can see an overview of Spatial Transformation Attacks.</p>
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Overview of spatial transformation attacks.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g009.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating geometric perturbations in Convolutional Neural Networks (CNNs). At the center, an arrow between a circle and two shapes. Descriptions: &#x201C;Nature&#x201D; (orange) discusses misclassification, &#x201C;Objective&#x201D; (grey) covers geometric transformations, &#x201C;Defences&#x201D; (green) mentions spatial-augmented training, and &#x201C;Significance&#x201D; (blue) highlights spatial bias exploitation.</alt-text>
</graphic>
</fig>
<p>Mitigation strategies centre on reducing the model&#x2019;s sensitivity to small geometric distortions. Data-augmentation pipelines that include random rotations, translations, and mild elastic warping expand the training distribution, widening the decision margin in spatial space. More rigorous defences incorporate <italic>spatial adversarial training</italic>, injecting optimised STA examples into each mini-batch, or adopt architectures with built-in equivariance (e.g., group-convolution networks and vision transformers with relative positional encodings). Post-hoc input defences, such as randomised cropping&#x2013;rescaling or feature-map alignment layers, provide an additional barrier, although they add inference latency and can degrade clean-data accuracy if tuned aggressively. Together, these measures shift robustness evaluation from purely L<sub>p</sub>-norm metrics toward a broader geometry-aware standard, closing a critical but often overlooked gap in adversarial resilience.</p>
</sec>
<sec id="sec34">
<label>8.8</label>
<title>Physical adversarial examples</title>
<p>Physical adversarial examples translate the well-studied, pixel-space manipulations of digital attacks into real-world artefacts, stickers, patches, 3-D prints, or projected light patterns, that, when photographed or sensed, induce the same misclassification failure in a deployed model. Their salient feature is environmental robustness: the perturbation must remain effective across variable illumination, camera angles, and viewing distances. The STOP/SPEED-LIMIT sticker set, for instance, fools state-of-the-art traffic-sign classifiers over a 15-metre approach range and &#x00B1;30&#x00B0; yaw, highlighting the safety risk for autonomous-driving stacks that rely on single-frame vision.</p>
<p>Unlike purely digital attacks, which can be generated offline with exact gradients, physical perturbations must survive the full imaging pipeline&#x2014;optics, sensor noise, JPEG compression, and any pre-processing stages. Attackers therefore optimise in a <italic>render-aware</italic> loop: they place a candidate patch in a 3-D scene, render synthetic photographs under random pose and lighting, back-propagate the loss, update the patch texture, and iterate until the misclassification probability exceeds a threshold across the sampled conditions. Empirical studies show that such patches can reduce ImageNet-top-1 accuracy of object-detection systems by more than 40 percentage points while remaining inconspicuous to human drivers.</p>
<p>Defensive counter-measures fall into three tiers. Data-level hardening augments training corpora with random perspective warps, brightness shifts, and physically simulated artefacts, enlarging the decision margin against unseen viewpoints. Model-level adaptations introduce spatial-transformer layers or invariant feature pooling to attenuate localised perturbations. Runtime monitoring employs secondary sensors (LiDAR, radar) or consistency checks across video frames to flag implausible label flips. Despite progress, no single layer fully neutralises physical adversaries; layered, cross-sensor architectures remain the most effective mitigation strategy for safety-critical deployments such as autonomous vehicles and medical imaging devices.</p>
<p>In <xref ref-type="fig" rid="fig10">Figure 10</xref>, we can see a detailed explanation in the context of cyber-attacks on artificial intelligence models.</p>
<fig position="float" id="fig10">
<label>Figure 10</label>
<caption>
<p>Operational pipeline for physical adversarial examples.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g010.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Two images of stop signs are shown: a clean stop sign and one perturbated with black stickers. Arrows point to text boxes with the following information. Attack Objective: Remain effective under real-world conditions and induce misclassification as &#x201C;Speed Limit 45 km/h.&#x201D; Challenges: Changes in lighting, camera noise, and lens distortion, along with varying distances and angles. Implications: Significant accuracy degradation by over forty percentage points and the need for multi-sensor fusion and invariant architectures.</alt-text>
</graphic>
</fig>
<p>The diagram in <xref ref-type="fig" rid="fig10">Figure 10</xref> contrasts a clean stop-sign image with a sticker-perturbed version that induces a &#x201C;Speed-Limit 45&#x202F;km&#x202F;h<sup>&#x2212;1</sup>&#x201D; misclassification. Right-hand panels detail (i) the attack objective&#x2014;robust misclassification under real-world capture conditions; (ii) practical challenges such as lighting variation, lens distortion, and viewpoint changes; and (iii) downstream safety implications, including &#x003E;40-pp accuracy degradation and the need for multi-sensor fusion or geometry-invariant architectures. Directional arrows show the causal flow from perturbation design through environmental robustness constraints to system-level risk.</p>
</sec>
<sec id="sec35">
<label>8.9</label>
<title>Model inversion attack</title>
<p>Model-inversion attacks exploit a deployed model&#x2019;s output scores to reconstruct features of its training data, even when the adversary lacks direct access to the model parameters or the underlying corpus. The attacker repeatedly queries the prediction API with candidate inputs, observes the confidence scores, and applies optimisation or Bayesian search to converge on an input that maximises the posterior likelihood of producing the observed output. For a face-recognition network, this process can gradually refine a synthetic image until it resembles an individual whose portrait was present in the training set; for a pharmacogenomic classifier, it can approximate genotype markers associated with a particular phenotype.</p>
<p>The feasibility of inversion depends on three factors: (i) model capacity and over-fitting&#x2014;high-capacity networks with narrow decision boundaries tend to leak more information; (ii) output granularity&#x2014;probability vectors reveal richer gradients for search than hard labels; and (iii) auxiliary knowledge held by the adversary, such as population priors or partial feature values. Empirical studies show that soft-max probabilities at 32-bit precision allow recovery of MNIST digits with &#x003E;90% structural similarity and can reveal medically sensitive attributes (e.g., asthma status) from ostensibly de-identified clinical models.</p>
<p>Mitigations fall into two classes. Output-sanitisation limits the information returned per query: top-k labels, quantised logits, or binary decisions reduce gradient signal and slow reconstruction. Differential-privacy training injects calibrated noise into the loss or gradient updates, ensuring that any single training instance exerts only a provably bounded influence on the final model; state-of-the-art implementations (<italic>&#x03B5;</italic>&#x202F;&#x2248;&#x202F;1) cut inversion accuracy on facial datasets from ~85% to near chance while remaining within 2&#x2013;3&#x202F;pp. of baseline classification accuracy. Combined with query-rate throttling and audit logging, these defences form a layered strategy for protecting sensitive training data against inversion inferences.</p>
<p>In <xref ref-type="fig" rid="fig11">Figure 11</xref>, we can see a breakdown of how Model Inversion Attacks work:</p>
<fig position="float" id="fig11">
<label>Figure 11</label>
<caption>
<p>Breakdown of how model inversion attacks work.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g011.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart depicting an attacker querying a model, which provides confidence outputs leading to reconstructed data. An optimization loop shows privacy leakage. Defenses include reconstructed data, output sanitization, and DP training.</alt-text>
</graphic>
</fig>
<p>The diagram in <xref ref-type="fig" rid="fig11">Figure 11</xref> traces the query-response feedback loop that enables model-inversion attacks and pinpoints the two primary mitigation points, output sanitisation and differential-privacy training, within that workflow.</p>
</sec>
<sec id="sec36">
<label>8.10</label>
<title>Membership inference attack</title>
<p>A membership-inference attack seeks to decide, with higher-than-chance accuracy, whether a particular data record x was included in a target model&#x2019;s training set. The attack leverages a well-known side effect of <italic>overfitting</italic>: models often assign systematically higher confidence, or lower loss, to samples they have previously seen. By querying the model with x and inspecting the resulting probability vector (or loss value), an adversary can compare that score against a decision threshold learned from shadow models or public data and infer &#x201C;in&#x201D; versus &#x201C;out&#x201D; membership.</p>
<p>The privacy stakes are considerable. For a diagnostic classifier trained on protected health information, confirming that an individual&#x2019;s record contributed to model fitting may reveal their medical status. Similar concerns arise in financial, genomic, and social-media domains where the mere presence of a record in a dataset is itself sensitive. Empirical studies show that standard image-classification networks trained without privacy safeguards leak membership information at 60&#x2013;70% attack accuracy on CIFAR-10, well above the 50% random baseline, and that the leakage grows with model capacity and training epochs.</p>
<p>Defences align with two root causes: excessive confidence and excessive memorisation. Output-sanitisation truncates or quantises prediction scores, depriving the attacker of fine-grained confidence signals, while regularisation mechanisms (dropout, weight decay, early stopping) reduce the train&#x2013;test performance gap that powers the attack. The strongest mitigation is differential-privacy (DP) training, which injects calibrated noise into each gradient update and clips per-sample contributions; state-of-the-art DP-SGD configurations (<italic>&#x03B5;</italic>&#x202F;&#x2248;&#x202F;1&#x2013;2) can lower membership-inference accuracy to near random, although typically at a cost of 1&#x2013;4&#x202F;pp. in clean accuracy on image benchmarks. In production deployments, these algorithmic measures should be combined with query-rate limiting and audit logging to detect large-scale probing campaigns, thereby delivering layered protection against MIAs in privacy-sensitive machine-learning systems.</p>
</sec>
</sec>
<sec id="sec37">
<label>9</label>
<title>Case study analysis of benchmarks and competitions in adversarial robustness research</title>
<p>Recent benchmark datasets and competitive evaluations have played a crucial role in advancing the state-of-the-art in adversarial robustness by providing standardised tasks, reproducibility protocols, and comparative baselines for attack and defence techniques. Two prominent initiatives, RobustML and the Adversarial Vision Challenge (AVC), offer structured environments to evaluate model performance under adversarial conditions.</p>
<sec id="sec38">
<label>9.1</label>
<title>RobustML benchmark</title>
<p>RobustML<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> is a centralised repository and evaluation framework designed to rigorously assess the robustness of machine learning models against a variety of adversarial threats. It provides:</p>
<list list-type="bullet">
<list-item>
<p><italic>Standardised Datasets</italic>: MNIST, CIFAR-10, TinyImageNet, and ImageNet derivatives with adversarial variants.</p>
</list-item>
<list-item>
<p><italic>Evaluation Criteria</italic>: Robust accuracy under L<sub>&#x221E;</sub>, L<sub>2</sub>, and L<sub>1</sub> norm-bounded attacks; adversarial training generalisation; transferability scores.</p>
</list-item>
<list-item>
<p><italic>Leaderboard</italic>: Maintains an active ranking of models tested under controlled threat models using both white-box and black-box evaluations.</p>
</list-item>
<list-item>
<p><italic>Toolchain Integration</italic>: Supports adversarial testing frameworks such as Foolbox, CleverHans, and AutoAttack.</p>
</list-item>
</list>
<p>Recent results show that adversarially trained ResNet models using projected gradient descent (PGD) retain up to 47% robust accuracy on CIFAR-10 under L<sub>&#x221E;</sub>-bounded attacks (&#x03B5;&#x202F;=&#x202F;8/255), setting a practical ceiling for current defences.</p>
</sec>
<sec id="sec39">
<label>9.2</label>
<title>Adversarial Vision Challenge (AVC)</title>
<p>The Adversarial Vision Challenge, hosted as part of NeurIPS competitions, focuses on real-world image classification robustness and transferability. Organised by IBM Research Zurich and the RobustBench team, it consists of two tracks:</p>
<list list-type="bullet">
<list-item>
<p><italic>Attack Track</italic>: Participants design universal or input-specific adversarial attacks that can fool a set of black-box models. Evaluation metrics include attack success rate and perceptual distortion constraints.</p>
</list-item>
<list-item>
<p><italic>Defence Track</italic>: Teams submit image classifiers to withstand a barrage of adaptive and ensemble attacks. Models are tested against unseen attacks to measure true generalisability.</p>
</list-item>
</list>
<p>Notable findings from AVC include:</p>
<list list-type="bullet">
<list-item>
<p>Ensemble Adversarial Training remains the most reliable strategy for robust performance across multiple unseen attack vectors.</p>
</list-item>
<list-item>
<p>AutoAugment-enhanced defences significantly improve natural accuracy but often trade off against robustness.</p>
</list-item>
<list-item>
<p>The best-performing defences in 2022 achieved ~42% robust accuracy on the holdout dataset under L<sub>&#x221E;</sub> norm constraints, suggesting the gap between clean and robust performance remains wide.</p>
</list-item>
</list>
</sec>
<sec id="sec40">
<label>9.3</label>
<title>Implications for practical deployment</title>
<p>These benchmarks highlight the non-trivial trade-offs between clean accuracy, robustness, and computational cost. They also reveal that:</p>
<list list-type="bullet">
<list-item>
<p>Many published defences fail under adaptive evaluation and gradient-free attacks.</p>
</list-item>
<list-item>
<p>Robustness must be evaluated holistically, not just against FGSM or single-model C&#x0026;W attacks.</p>
</list-item>
</list>
<p>Benchmark-driven competitions such as RobustML and AVC have emerged as critical drivers of methodological transparency and cross-laboratory validation (<xref ref-type="table" rid="tab4">Table 4</xref>).</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Summary of benchmarks and competitions in adversarial robustness evaluation.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Benchmark/competition</th>
<th align="left" valign="top">Datasets used</th>
<th align="left" valign="top">Attack norms evaluated</th>
<th align="left" valign="top">Evaluation type</th>
<th align="left" valign="top">Top defence methods</th>
<th align="left" valign="top">Notable metrics</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">RobustML</td>
<td align="left" valign="top">MNIST, CIFAR-10, TinyImageNet</td>
<td align="left" valign="top">L_&#x221E;, L_2, L_1</td>
<td align="left" valign="top">White-box, Black-box</td>
<td align="left" valign="top">PGD Adversarial Training, TRADES</td>
<td align="left" valign="top">Robust Accuracy, Transferability Index</td>
</tr>
<tr>
<td align="left" valign="top">RobustBench</td>
<td align="left" valign="top">CIFAR-10, ImageNet, ImageNet-C</td>
<td align="left" valign="top">L_&#x221E;, L_2, corruptions</td>
<td align="left" valign="top">White-box (AutoAttack)</td>
<td align="left" valign="top">TRADES, MART, Hydra, AugMix</td>
<td align="left" valign="top">Clean vs. Robust Accuracy Trade-off</td>
</tr>
<tr>
<td align="left" valign="top">Adversarial Vision Challenge (AVC)</td>
<td align="left" valign="top">CIFAR-10-like Synthetic Set</td>
<td align="left" valign="top">L_&#x221E;, custom perceptual metrics</td>
<td align="left" valign="top">Adaptive Black-box</td>
<td align="left" valign="top">Ensemble Adversarial Training, AutoAugment</td>
<td align="left" valign="top">Attack Success Rate, Perceptual Score</td>
</tr>
<tr>
<td align="left" valign="top">AutoAttack Leaderboard</td>
<td align="left" valign="top">CIFAR-10, ImageNet</td>
<td align="left" valign="top">L_&#x221E;, L_2</td>
<td align="left" valign="top">Fully automated attacks</td>
<td align="left" valign="top">PreAct ResNet + TRADES + Data Augment</td>
<td align="left" valign="top">Average Robust Accuracy</td>
</tr>
<tr>
<td align="left" valign="top">MLPerf Robustness (In Progress)</td>
<td align="left" valign="top">ImageNet, Speech Commands</td>
<td align="left" valign="top">L_&#x221E;, noise, corruptions</td>
<td align="left" valign="top">Multimodal, real-world</td>
<td align="left" valign="top">TBD (under development)</td>
<td align="left" valign="top">Generalisation under distribution shift</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="tab4">Table 4</xref> compares major adversarial robustness benchmarks and competitions, detailing dataset coverage, types of attack norms evaluated, evaluation methodology (white-box vs. black-box vs. adaptive), leading defence strategies, and key performance metrics. The benchmarks serve as a foundation for reproducible, comparative adversarial research and inform the practical deployment of robust AI systems (<xref ref-type="table" rid="tab5">Table 5</xref>).</p>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>Taxonomy of advanced adversarial attacks in machine learning systems.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Attack type</th>
<th align="left" valign="top">Attack surface</th>
<th align="left" valign="top">Knowledge required</th>
<th align="left" valign="top">Primary objective</th>
<th align="left" valign="top">Modality</th>
<th align="left" valign="top">Key challenges/constraints</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Model Inversion</td>
<td align="left" valign="top">Model API/Output</td>
<td align="left" valign="top">Black-box or white-box</td>
<td align="left" valign="top">Reconstruct training data attributes</td>
<td align="left" valign="top">Inference-time</td>
<td align="left" valign="top">High model overfitting; requires high-confidence output</td>
</tr>
<tr>
<td align="left" valign="top">Membership Inference</td>
<td align="left" valign="top">Model API/Output</td>
<td align="left" valign="top">Black-box</td>
<td align="left" valign="top">Determine whether data was in training</td>
<td align="left" valign="top">Inference-time</td>
<td align="left" valign="top">Exploits overfitting; mitigated by differential privacy</td>
</tr>
<tr>
<td align="left" valign="top">Physical Adversarial Ex.</td>
<td align="left" valign="top">Sensor input</td>
<td align="left" valign="top">Black-box</td>
<td align="left" valign="top">Mislead model via real-world input</td>
<td align="left" valign="top">Physical-world</td>
<td align="left" valign="top">Must survive transformations (lighting, angle, etc.)</td>
</tr>
<tr>
<td align="left" valign="top">Data Poisoning</td>
<td align="left" valign="top">Training pipeline</td>
<td align="left" valign="top">White-box (typically)</td>
<td align="left" valign="top">Corrupt learning process</td>
<td align="left" valign="top">Training-time</td>
<td align="left" valign="top">Access to training pipeline; low visibility post-deployment</td>
</tr>
<tr>
<td align="left" valign="top">Backdoor/Trojan</td>
<td align="left" valign="top">Training pipeline</td>
<td align="left" valign="top">White-box or insider</td>
<td align="left" valign="top">Embed hidden functionality</td>
<td align="left" valign="top">Training-time &#x0026; test</td>
<td align="left" valign="top">Trigger specificity; covert injection is non-trivial</td>
</tr>
<tr>
<td align="left" valign="top">Side-channel Exploitation</td>
<td align="left" valign="top">Hardware-level</td>
<td align="left" valign="top">Varies</td>
<td align="left" valign="top">Leak sensitive internal model properties</td>
<td align="left" valign="top">Passive measurement</td>
<td align="left" valign="top">Hardware proximity and timing constraints</td>
</tr>
<tr>
<td align="left" valign="top">Transfer Attacks</td>
<td align="left" valign="top">Surrogate model</td>
<td align="left" valign="top">Black-box</td>
<td align="left" valign="top">Exploit shared vulnerabilities</td>
<td align="left" valign="top">Inference-time</td>
<td align="left" valign="top">Model similarity required for high transfer success</td>
</tr>
<tr>
<td align="left" valign="top">Model Extraction</td>
<td align="left" valign="top">Query interface</td>
<td align="left" valign="top">Black-box</td>
<td align="left" valign="top">Reconstruct functionally equivalent model</td>
<td align="left" valign="top">Repeated queries</td>
<td align="left" valign="top">Query budget; distillation quality affects fidelity</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Sources: Adapted from <xref ref-type="bibr" rid="ref15">Costa et al. (2023)</xref>, <xref ref-type="bibr" rid="ref38">Nasr et al. (2023)</xref>, <xref ref-type="bibr" rid="ref48">Shokri et al. (2017)</xref>, <xref ref-type="bibr" rid="ref54">Wu and Fredrikson (2024)</xref>, and <xref ref-type="bibr" rid="ref17">Fredrikson et al. (2015)</xref>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec41">
<label>9.4</label>
<title>Technical analysis of recent red-teaming studies on LLMs</title>
<p><xref ref-type="bibr" rid="ref41">Pathade (2025)</xref> conducted a systematic evaluation of prompt injection and jailbreak vulnerabilities across leading LLMs&#x2014;specifically GPT-4, Claude 2, Mistral 7B, and Vicuna&#x2014;using over 1,400 adversarial prompts (<xref ref-type="bibr" rid="ref41">Pathade, 2025</xref>). In terms of prompt injection mechanics, the study categorised injection vectors into direct (embedded commands in user prompts), indirect (hidden within multi-turn context), and obfuscated (using encoding or spacing techniques). Surprisingly, obfuscated and multi-turn strategies succeeded in 75&#x2013;90% of trials across all four models, exposing a consistent failure in instruction boundary enforcement. In terms of model-specific vulnerabilities, GPT-4 and Claude 2, despite their advanced alignment layers, showed susceptibility to encoded prompt injections, with attack success rates of ~85% under whitespace manipulation. Lighter-weight models like Vicuna and Mistral, lacking extensive reinforcement learning from human feedback (RLHF), saw even higher success rates (&#x003E;90%). Then, by applying multi-turn context attacks, they successfully manipulated model behaviour by embedding adversarial instructions outside the initial prompt. For example, a second-turn injection could override safety filters in the final response, demonstrating flaws in alignment coherence across conversation threads. In terms of jailbreak transferability and generalisation, adversarial prompts that proved effective against GPT-4 also succeeded 65&#x2013;70% of the time on GPT-3.5-based variants, indicating that exploitation of latent completion patterns can generalise within architectural families. 
To conclude this case study analysis, the mitigation insights from this study demonstrated that introducing a supervisory <italic>&#x201C;safety sentinel&#x201D;</italic> module, evaluating each response for instruction leakage, reduced prompt injection success by over 70%, though at the cost of increased latency (~300&#x202F;ms per query).</p>
<p>A second case study on red teaming AI systems (<xref ref-type="bibr" rid="ref45">Schoepf et al., 2025</xref>) introduced MAD-MAX, an automated red-teaming framework that combines modular adversarial strategies to generate jailbreaks on GPT-4o and Gemini-Pro (<xref ref-type="bibr" rid="ref45">Schoepf et al., 2025</xref>; <xref ref-type="bibr" rid="ref11">Chen et al., 2025</xref>). MAD-MAX achieved a 97% success rate across diverse malicious goals, requiring just 10.9 queries per goal, compared to 23.3 queries needed by previous methods, illustrating both higher efficiency and diversity of exploit vectors.</p>
<p>Collectively, these 2025 red-teaming examples reveal that:</p>
<list list-type="order">
<list-item>
<p>Instruction boundary misclassification remains a universal vulnerability even in highly aligned models.</p>
</list-item>
<list-item>
<p>Multi-turn and obfuscated prompt strategies effectively bypass static alignment measures.</p>
</list-item>
<list-item>
<p>Automated frameworks (MAD-MAX) dramatically amplify the scale and coverage of adversarial testing, exposing latent vulnerabilities at speed and scale.</p>
</list-item>
<list-item>
<p>Mitigation strategies, such as safety sentinels or dynamic context pruning, can significantly reduce attacks&#x2014;but introduce performance overhead and must be tested across conversational depth.</p>
</list-item>
</list>
<p>These findings underscore the evolving technical landscape of LLM red-teaming, highlighting emergent blind spots in alignment strategies and the need for multi-layered, dynamic safeguards. They also stress the importance of scalable, automated red-teaming tools capable of assessing adversarial resilience in real-world deployment contexts.</p>
<p>Another study (<xref ref-type="bibr" rid="ref21">Goyal et al., 2023</xref>) indicates that AI models may inadvertently fault themselves by retraining on substandard data. After examining over 80 businesses, the study found that most lacked a backup plan in case of a data poisoning attack or dataset theft. They concluded that if such an event were to occur, the majority of the industry would not even be aware.</p>
</sec>
</sec>
<sec id="sec42">
<label>10</label>
<title>Whitebox attacks</title>
<p>White-box threat models grant the adversary complete visibility into a target neural network&#x2019;s architecture, parameter values, and, occasionally, its training data. This transparency enables highly tailored attacks that exploit exact gradient information, internal activation statistics, and structural shortcuts that remain opaque in black-box settings.</p>
<p>Gradient-driven perturbations. Armed with the full loss landscape, an attacker can compute precise input gradients and craft imperceptible perturbations that maximise the model&#x2019;s prediction error. Canonical examples include FGSM, projected-gradient descent (PGD), and the Carlini-&#x0026;-Wagner optimiser, all of which routinely achieve near-100% misclassification rates on ImageNet-scale models once the perturbation budget is aligned with human-perception thresholds (e.g., <italic>&#x03B5;</italic>&#x202F;=&#x202F;8/255 in the L&#x221E; norm).</p>
<p>Exact model cloning. Given access to weights and training data, the adversary can duplicate the model verbatim, run offline sensitivity analyses, and search for corner-case failures without rate limits or audit trails. The duplicate can also serve as a surrogate generator of transferable adversarial examples that will fool the original with probability approaching one.</p>
<p>Privacy-oriented exploits. Full data access trivialises membership inference and model-inversion attacks: confidence skew and gradient back-propagation directly reveal whether, and in what form, a record contributed to training. For clinical or financial datasets, this constitutes a direct breach of regulatory constraints such as HIPAA or GDPR.</p>
<p>Architectural vulnerability mining. White-box inspection exposes brittle components, e.g., unregularised batch-norm layers, low-rank bottlenecks, or unsafe activation ranges, that can be perturbed to induce exploding activations, vanishing feature maps, or numerical overflow, thereby triggering denial-of-service or silent logic corruption.</p>
<p>Given the exceptional leverage afforded by white-box knowledge, defensive counter-measures must shift from obscurity to formal robustness. Recommended practices include (i) certified or provable-robust training against gradient-based perturbations, (ii) differential-privacy mechanisms to bound information leakage, (iii) architectural hardening via gradient-mask&#x2013;free regularisers, and (iv) run-time anomaly detectors that flag activation patterns outside the training manifold. Only a multi-layered approach can meaningfully degrade the success probability of the sophisticated attack vectors enabled by full-model disclosure.</p>
<sec id="sec43">
<label>10.1</label>
<title>Fast Gradient Sign Method (FGSM)</title>
<p>FGSM is the textbook one-step, white-box attack used to probe how a convolutional neural network (CNN) behaves when the input is nudged in the single direction that most sharply increases its classification loss. The procedure is straightforward. First, the attacker feeds an image through the model and records the prediction error relative to the true label. Next, the attacker back-propagates that error all the way to the input layer, obtaining a gradient map whose sign tells whether each pixel should be brightened or darkened to amplify the error. Finally, the attacker adds a small, fixed-magnitude step in the indicated direction to every pixel, producing an adversarial image that looks unchanged to the human eye yet typically flips the model&#x2019;s decision. On ImageNet-scale networks, such a single-step perturbation, often no larger than a few gray-level values per pixel, can drive top-1 accuracy to near chance. <xref ref-type="fig" rid="fig12">Figure 12</xref> walks through this pipeline: clean inference, loss evaluation, gradient extraction, sign-based perturbation, and the resulting misclassification. Because FGSM is fast, reproducible, and highly transferable across architectures, it remains the de-facto first-pass benchmark for adversarial robustness, even though modern defences now require stronger, multi-step variants for comprehensive evaluation.</p>
<fig position="float" id="fig12">
<label>Figure 12</label>
<caption>
<p>The FGSM method to generate adversarial examples.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g012.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart depicting an adversarial example generation process. It starts with &#x201C;Clean input,&#x201D; passes through &#x201C;Pre-trained model,&#x201D; then &#x201C;Compute loss,&#x201D; leading to &#x201C;Calculate perturbation.&#x201D; Equations include `y = f(x)`, `g = &#x2207;L(x, y)`, `&#x03B4; = &#x03B5; sign(g)`, and `x_adv = x + &#x03B4;` next to their respective blocks.</alt-text>
</graphic>
</fig>
<p>The FGSM is a one-step method to generate adversarial examples, and we can visualise the process in <xref ref-type="fig" rid="fig12">Figure 12</xref>.</p>
<p>This diagram in <xref ref-type="fig" rid="fig12">Figure 12</xref> illustrates the FGSM interaction and process of creating adversarial examples. The user inputs an image, which is then utilised by the pre-trained CNN to make predictions. Once the prediction is made, the FGSM algorithm calculates the loss by comparing it to the actual class label and computes the gradients of this loss concerning the input image. Then, the algorithm determines the sign of the gradient and constructs the adversarial image using this sign. FGSM fabricates an adversarial image in a <italic>single</italic> gradient step. After a forward pass on the clean input x to obtain the model&#x2019;s loss J, the attacker back-propagates that loss to the input layer, producing a gradient map that indicates how each pixel should change to increase the error. The perturbation is then formed by taking only the sign (&#x00B1;1) of each gradient component and scaling it by a small constant <italic>&#x03B5;</italic>, ensuring that every pixel is nudged in the most loss-increasing direction while the overall distortion remains imperceptible. Because the attack touches only the input tensor and leaves network weights untouched, it is ideally suited for probing <italic>deployed</italic> (frozen) models. On ImageNet-scale CNNs, <italic>&#x03B5;</italic> values as low as 8/255 in the L&#x221E; norm can drive top-1 accuracy close to chance.</p>
<p><graphic xlink:href="frai-09-1731566-i016.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Code snippet showing a procedure for generating adversarial examples. It lists inputs including a pre-trained neural network model, a normalized input image, a one-hot ground-truth label, and a perturbation budget. The procedure involves predicting, calculating loss, backpropagation, calculating delta, creating an adversarial example, and returning it, with comments explaining each step.</alt-text>
</graphic></p>
<p>FGSM&#x2014;pseudocode implementation</p>
<list list-type="bullet">
<list-item>
<p><italic>Speed:</italic> one forward-backward pass; executes in milliseconds on modern GPUs.</p>
</list-item>
<list-item>
<p><italic>Determinism:</italic> produces a unique adversarial example for a given x, y, and <italic>&#x03B5;</italic>.</p>
</list-item>
<list-item>
<p><italic>Diagnostic value:</italic> serves as the first-line robustness benchmark; if a model fails FGSM, it will almost certainly fail stronger multi-step attacks such as PGD.</p>
</list-item>
<list-item>
<p><italic>Limitations:</italic> a single-step update is easier to counter with basic defences (e.g., adversarial training or randomised input preprocessing), so FGSM alone is insufficient for certifying robust deployments.</p>
</list-item>
</list>
</sec>
<sec id="sec44">
<label>10.2</label>
<title>FGSM walk-through on a pre-trained MobileNetV2</title>
<p>To demonstrate FGSM in practice we attack an ImageNet-trained MobileNetV2. The model is a MobileNetV2 model pre-trained on ImageNet with TensorFlow, MobileNetV2<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref>, and ImageNet.<xref ref-type="fn" rid="fn0003"><sup>3</sup></xref> The workflow is: load the frozen model; preprocess a test image; compute the input-gradient of the cross-entropy loss; add a sign-scaled disturbance; and visualise the effect as <italic>&#x03B5;</italic> grows. Pseudocode:</p>
<p><graphic xlink:href="frai-09-1731566-i017.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Code snippet showing the process of setting up a TensorFlow model using MobileNetV2 for image processing. It includes downloading an image, preprocessing it, building a one-hot label for gradient calculation, and computing perturbations using the FGSM method. Steps are annotated with comments, and the code iterates over different epsilon values for adversarial example generation.</alt-text>
</graphic></p>
<p>A single &#x03B5;&#x202F;=&#x202F;0.10 step already flips MobileNetV2&#x2019;s top-1 label while remaining visually indistinguishable to the human eye.</p>
<p>Experimental extensions</p>
<list list-type="bullet">
<list-item>
<p><italic>Dataset and architecture sweep.</italic> Replicate the above script on CIFAR-10 (VGG-16), MNIST (LeNet), and ImageNet (ResNet-50) to quantify how model depth and inductive bias influence FGSM robustness.</p>
</list-item>
<list-item>
<p><italic>&#x03B5;-sensitivity curves.</italic> For each network, plot top-1 accuracy versus &#x03B5; to reveal the perturbation budget at which performance collapses.</p>
</list-item>
<list-item>
<p><italic>Adversarial-training baseline.</italic> Fine-tune each model with FGSM examples injected at &#x03B5;&#x202F;=&#x202F;0.03; re-measure accuracy to assess defence gain and clean-accuracy trade-off.</p>
</list-item>
<list-item>
<p><italic>Transition to stronger attacks.</italic> Use FGSM-trained weights as a starting point for multi-step PGD or AutoAttack evaluations, thereby mapping the full robustness frontier.</p>
</list-item>
</list>
<p>The image will look similar to:</p>
<p><graphic xlink:href="frai-09-1731566-i001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">A visual representation of random noise with a chaotic mix of vibrant colors, including red, blue, green, yellow, and pink, distributed throughout a grid format. The image has axis labels with numerical values.</alt-text>
</graphic></p>
<p>We can conduct tests using different epsilon values to assess the network&#x2019;s resilience. By increasing epsilon, we can observe how the network responds to changes. While a higher epsilon value makes it easier to fool the network, it also makes perturbations more apparent and noticeable.</p>
<p>FGSM remains the entry-level diagnostic for gradient-based vulnerability, yet contemporary work has progressed to momentum, iterative, and optimisation-based variants that breach even adversarial-trained models. Continuous benchmarking across new architectures (e.g., vision transformers) and modalities (e.g., audio, multi-modal LLMs) is essential to track evolving threat capability and to guide the design of certifiably robust learning systems.</p>
</sec>
</sec>
<sec id="sec45">
<label>11</label>
<title>Jacobian-based Saliency Map Attack (JSMA) and related feature-targeted methods</title>
<p>The Jacobian-based Saliency Map Attack (JSMA) is a sparse, targeted, white-box technique that perturbs only the most influential input features to force a classifier into a chosen label. Unlike norm-bounded attacks that diffuse small noise across all pixels, JSMA computes an explicit <italic>saliency map</italic> from the input-gradient (Jacobian) of the network: each pixel receives a score reflecting how strongly increasing its value raises the target-class logit while suppressing competing classes. The attacker greedily modifies the highest-saliency pixels, often fewer than 5% of the image, until misclassification occurs or a pre-set L&#x2080; budget is reached (see <xref ref-type="fig" rid="fig13">Figure 13</xref>). This sparsity yields adversarial examples that remain visually plausible yet evade many magnitude-based defences.</p>
<fig position="float" id="fig13">
<label>Figure 13</label>
<caption>
<p>Jacobian-based Saliency Map Attack (JSMA).</p>
</caption>
<graphic xlink:href="frai-09-1731566-g013.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating the Jacobian-based Saliency Map Attack (JSMA) process. It starts with an original input image, computing gradients, and building a saliency map. High-saliency pixels are selected and perturbed iteratively, resulting in an adversarial image. The process loops back to perturb iteratively as needed.</alt-text>
</graphic>
</fig>
<p>JSMA belongs to a broader family of feature-targeted attacks illustrated in <xref ref-type="fig" rid="fig14">Figure 14</xref>. DeepFool linearises the local decision boundary and iteratively moves the input along the shortest L&#x2082; path to cross that boundary, achieving minimal global distortion but without pixel-level sparsity. Iterative Gradient Sign Method (I-FGSM) extends FGSM by taking multiple small steps in the gradient-sign direction, trading speed for higher success rates under the same L&#x221E; budget. Carlini-&#x0026;-Wagner (C&#x0026;W) refines the optimisation further, searching for perturbations that minimise both distortion and a confidence-weighted misclassification term, producing near-imperceptible attacks even against defence-aware models. Boundary Attack assumes no gradient access at all; it starts from a random point in the target class and performs a random-walk projection toward the original sample, converging on an adversarial example under decision-only feedback.</p>
<fig position="float" id="fig14">
<label>Figure 14</label>
<caption>
<p>Conceptual comparison of iterative optimisation-based adversarial attacks, illustrating DeepFool, Carlini &#x0026; Wagner (C&#x0026;W), and Boundary Attack mechanisms, highlighting their respective strategies for decision-boundary crossing, perturbation minimisation, and misclassification preservation under differing knowledge assumptions.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g014.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating three types of adversarial attacks: &#x201C;Deep Fool&#x201D; iteratively perturbs input to cross the decision boundary; &#x201C;Carlini and Wagner Attack (C&#x0026;W)&#x201D; minimizes the difference between the original and adversarial input while ensuring misclassification; &#x201C;Boundary Attack&#x201D; starts with a random image and makes it increasingly similar to the original while maintaining misclassification.</alt-text>
</graphic>
</fig>
<p>Collectively, these methods expose complementary weak spots in deep networks&#x2014;sparsity (JSMA), minimal-norm (DeepFool), iterative L&#x221E; (I-FGSM), optimisation-tight (C&#x0026;W), and decision-based (Boundary). A comprehensive robustness evaluation therefore requires testing against the full spectrum rather than relying on any single attack type. The diversity shown in <xref ref-type="fig" rid="fig14">Figure 14</xref> underscores why modern defence pipelines combine adversarial training, gradient-regularising architectures, and runtime anomaly detection to achieve credible resilience in safety-critical deployments.</p>
<p>In <xref ref-type="fig" rid="fig14">Figure 14</xref>, we can see each attack&#x2019;s main characteristics and steps.</p>
<p>Each of these attacks in <xref ref-type="fig" rid="fig14">Figure 14</xref> highlights different aspects of adversarial methodologies, demonstrating the diversity and evolving complexity in crafting adversarial inputs and emphasising the critical need for developing robust and versatile defensive mechanisms.</p>
</sec>
<sec id="sec46">
<label>12</label>
<title>Carlini &#x0026; Wagner (C&#x0026;W) attack &#x2013; worked example on MNIST</title>
<p>Deep neural networks (DNNs), while highly performant across a wide range of tasks, are susceptible to adversarial inputs, carefully crafted perturbations that can cause targeted misclassification. The C&#x0026;W attack remains one of the most effective and precise techniques for generating such examples. To illustrate its implementation, we applied the L&#x2082;-norm variant of the C&#x0026;W attack on a convolutional neural network trained on the MNIST dataset of handwritten digits. The model used in our experiment consisted of two convolutional layers with ReLU activations, max-pooling operations, followed by two fully connected layers with dropout regularisation. It achieved 99.2% accuracy on clean test images.</p>
<p>Before launching the attack, the input data (28&#x202F;&#x00D7;&#x202F;28 pixel greyscale images) was normalised to the [0,1] range and one-hot encoded for labels. The neural network was either trained from scratch or loaded from a pre-trained checkpoint. Once verified on clean data, we proceeded to craft adversarial examples designed to cause the model to misclassify each image as a specific target class.</p>
<p>The C&#x0026;W attack constructs adversarial samples by solving an optimisation problem. The aim is to find the smallest possible modification to an input image that causes it to be misclassified as a chosen target class, while keeping the change imperceptible. To maintain valid pixel values during optimisation, the input image is not modified directly. Instead, a latent variable w is introduced in an unconstrained space, and the final adversarial image x_adv is derived through a bounded transformation. The transformation used is based on the hyperbolic tangent function, ensuring that the pixel values in x_adv remain within valid image bounds:</p>
<p><graphic xlink:href="frai-09-1731566-i002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Mathematical equation displaying "x_adv equals zero point five times the quantity of hyperbolic tangent of w plus one."</alt-text>
</graphic></p>
<p>This transformed image is then fed into the model to evaluate how confidently it is classified into the target class. The attack objective is to increase this confidence while minimising the visible difference from the original image. The trade-off between confidence and distortion is controlled by a scalar constant c, which is determined through binary search for optimal balance.</p>
<p>The overall workflow of the attack can be summarised procedurally as follows:</p>
<p><graphic xlink:href="frai-09-1731566-i003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Python code for optimizing a model with adversarial examples. It iterates over images and target classes to adjust variables, calculate losses, and update weights using gradients and the Adam optimizer. Key steps include initializing variables, computing adversarial examples, calculating confidence and distortion losses, and updating with a total loss function.</alt-text>
</graphic></p>
<p>In our experiment, we used 1,000 randomly selected images from the MNIST test set. For each image, a target class was chosen that differed from the true label. The attack was run for up to 1,000 optimisation steps per image, using the Adam optimiser with a learning rate of 0.01. The confidence parameter, denoted <italic>&#x03BA;</italic> in literature, was tested with values ranging from 0 (baseline) to 20. The balancing constant c was determined through 9 rounds of binary search starting from an initial value of 1e-3. This ensured that adversarial samples were successful while introducing minimal perturbation.</p>
<p>The results confirmed the effectiveness of the C&#x0026;W attack. A 100% success rate was achieved across the sample set: every adversarial example caused the model to misclassify the input into the intended target class. At the baseline confidence (&#x03BA;&#x202F;=&#x202F;0), the average perturbation magnitude (measured as L&#x2082; distance) was approximately 1.73, visually imperceptible to the human eye. As the confidence parameter increased, the attack became more forceful but also required larger perturbations: with &#x03BA;&#x202F;=&#x202F;10, the average distortion rose to 3.21, which remained subtle but became slightly perceptible in some instances. Qualitative inspection of the adversarial examples showed that the changes were localised and did not significantly alter the semantic appearance of the digit.</p>
<p>These findings reinforce the C&#x0026;W attack&#x2019;s position as one of the most refined adversarial techniques. Its success lies in its ability to tightly control the trade-off between stealth and misclassification certainty. The tanh-space reparameterisation ensures that all adversarial samples remain valid images, while the binary search over c ensures adaptive adjustment based on target class and model behaviour. This method is particularly useful in security-sensitive applications&#x2014;such as facial recognition, signature verification, and document forgery, where subtlety of attack is paramount.</p>
</sec>
<sec id="sec47">
<label>13</label>
<title>Black-box adversarial attacks</title>
<p>Black-box threat models assume the adversary can supply inputs to a deployed service and observe only the returned labels or confidence scores; the model&#x2019;s architecture, weights, and training data remain hidden. Despite this limited view, several families of attacks can still achieve high misclassification rates:</p>
<list list-type="bullet">
<list-item>
<p><italic>Transfer attacks:</italic> The attacker trains a <italic>surrogate</italic> model on publicly available or synthetically generated data, crafts adversarial examples on that surrogate, and then submits the same inputs to the target. Owing to the empirical transferability of adversarial perturbations, image-classification systems often suffer 60&#x2013;80% misclassification under such cross-model re-use, even when the surrogate and target differ in architecture (e.g., VGG, ResNet).</p>
</list-item>
<list-item>
<p><italic>Zeroth-order optimisation (ZOO):</italic> Here the adversary treats the target network as a black-box function and estimates input gradients by finite-difference probing: each pixel or feature is perturbed by a small amount, the change in loss is recorded, and the approximate gradient is reconstructed. Although query-intensive, ZOO achieves near-white-box success rates when confidence scores are available.</p>
</list-item>
<list-item>
<p><italic>Query-based bandit attacks:</italic> Variants such as NES and Bandits-TD reduce ZOO&#x2019;s sample complexity by using random sub-space updates and gradient-sign momentum, often converging within tens of thousands of queries on ImageNet models&#x2014;well below commercial API rate limits.</p>
</list-item>
<list-item>
<p><italic>Decision-only methods (e.g., HopSkipJump):</italic> When the server reveals only the top-1 label, the attacker performs a boundary-walking search that starts from a large perturbation guaranteed to fool the model and iteratively projects back toward the original input while maintaining the misclassification. HopSkipJump requires no gradient or score leakage and can produce high-quality adversarial images in fewer than 10,000 queries.</p>
</list-item>
</list>
<p><xref ref-type="fig" rid="fig15">Figure 15</xref> summarises these attack families along two axes: information available (label vs. score) and query budget. Transfer attacks succeed offline with zero queries but rely on surrogate alignment; ZOO and bandit methods trade elevated query cost for tighter perturbation budgets; decision-only attacks operate under the strictest feedback constraints at the expense of more iterations.</p>
<fig position="float" id="fig15">
<label>Figure 15</label>
<caption>
<p>Blackbox attacks.</p>
</caption>
<graphic xlink:href="frai-09-1731566-g015.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating &#x201C;Blackbox Attacks&#x201D; in orange as the central node. It connects to &#x201C;Transfer Attacks&#x201D; in gray, &#x201C;Zeroth order optimization&#x201D; in blue, and &#x201C;HopSkipJumpAttack&#x201D; in yellow.</alt-text>
</graphic>
</fig>
<p>In the examples used in <xref ref-type="fig" rid="fig15">Figure 15</xref>, the transfer attacks represent adversarial examples generated for one model that are then used to attack another model. The zeroth order optimisation directly estimates the gradient of the targeted model by querying it. This type of attack can be separated into query-based attacks, which are conducted by repeatedly querying the model so that attackers can estimate its gradient to craft adversarial examples, and HopSkipJumpAttack, a decision-based attack in which the attacker has no information about the model&#x2019;s gradients, only its outputs.</p>
<sec id="sec48">
<label>13.1</label>
<title>Technical insights into transferability: surrogate model selection and domain shift effects</title>
<p>Transferability is a defining property of black-box adversarial attacks, wherein adversarial examples crafted against a surrogate model succeed in misleading a separate, unseen target model. However, the effectiveness of such attacks is not guaranteed and is heavily influenced by two critical factors: the architecture and training regime of the surrogate model, and the extent of domain shift between the surrogate and target systems. Empirical investigations have consistently shown that surrogate-target alignment is essential for high transferability. Specifically, architectural similarity between the models significantly enhances the likelihood of a successful attack, as similar model structures tend to learn comparable decision boundaries. This alignment often manifests in higher cosine similarity between the input gradients of the two models, which serves as a useful proxy for estimating attack transfer potential.</p>
<p>The training regime of the surrogate model also plays a crucial role. For example, the use of alternative loss functions, such as label smoothing instead of standard cross-entropy, or the incorporation of strong regularisation techniques like dropout or batch normalisation, can alter the geometry of the loss landscape and thereby reduce gradient alignment. These changes can impair the adversarial direction&#x2019;s effectiveness on the target model. In cases where the surrogate and target models are trained with substantially different objectives or data preprocessing pipelines, transferability degrades significantly.</p>
<p>Domain shift introduces further complexity. Even when the surrogate and target models share similar architectures, divergence in the data distributions they are trained on can significantly reduce attack efficacy. Covariate shift, where the marginal input distributions differ (e.g., CIFAR-10 vs. CIFAR-10.1 or TinyImageNet), can cause adversarial perturbations generated on the surrogate to fall outside the vulnerable subspaces of the target model. Similarly, discrepancies in input preprocessing, such as different image normalisation ranges, resizing strategies, or colour space handling, can lead to perceptual or statistical misalignment that degrades the adversarial impact. The consequence is a drop in attack success rate, even when perturbations remain imperceptible to humans.</p>
<p>Empirical results substantiate these observations. In one illustrative experiment on CIFAR-10, adversarial examples crafted on a VGG16 surrogate achieved a 68% success rate against a ResNet-18 target model trained on the same dataset. However, when the target model was trained on a variant dataset with modified augmentations (e.g., CIFAR-10.1), the transferability dropped below 40%, highlighting the sensitivity of cross-model attacks to minor distributional discrepancies.</p>
<p>Various mitigation strategies have been proposed to counter these challenges and enhance transferability. Ensemble-based surrogates, for instance, generate perturbations by jointly optimising across multiple models, encouraging the perturbation to generalise across differing decision boundaries. Similarly, gradient averaging across multiple architectures has been shown to improve the robustness of transfer attacks by smoothing local variations in the loss surface. Universal adversarial perturbations, which seek input-agnostic perturbations, attempt to circumvent reliance on model-specific gradients altogether, thus improving applicability under both architectural diversity and domain shift conditions.</p>
<p>Together, these insights underscore that the success of transfer-based adversarial attacks is intricately tied to model similarity and data congruence. Accounting for these factors is critical for accurately evaluating the security risks posed by black-box attacks and for designing robust defences that generalise across models and deployment contexts.</p>
</sec>
</sec>
<sec id="sec49">
<label>14</label>
<title>Targeted versus non-targeted adversarial attacks</title>
<p>Adversarial perturbations fall into two intent classes. Targeted attacks are goal-directed: the adversary crafts a perturbation that forces the model to emit one <italic>specific</italic> wrong label, e.g., a &#x201C;stop&#x201D; sign misread as &#x201C;speed-limit.&#x201D; Success is measured by whether the output matches this pre-selected class, so the optimisation explicitly maximises the target class logit while suppressing all others. Such precision is indispensable for fine-grained fraud (redirecting facial recognition to a chosen identity) or for bypassing class-specific access controls.</p>
<p>By contrast, non-targeted attacks seek <italic>any</italic> incorrect label. The adversary&#x2019;s objective is simply to shove the input out of its correct decision region, thereby degrading model accuracy or eroding confidence in automated decisions. The Basic Iterative Method (BIM) exemplifies this category: starting from a Fast-Gradient-Sign seed, BIM applies many small, bounded steps in the gradient-sign direction, gradually increasing loss until the classifier flips to <italic>some</italic> alternative class. Because the optimisation landscape is less constrained, non-targeted attacks usually require smaller perturbations or fewer queries than their targeted counterparts.</p>
<p>Distinguishing the two threat models is critical for defence design. Robustness certificates for targeted attacks must cover worst-case perturbations that <italic>hit</italic> a designated class, while defences against non-targeted attacks focus on enlarging the overall decision margin. Comprehensive evaluation therefore reports both targeted and non-targeted success rates to capture the full adversarial risk surface.</p>
</sec>
<sec id="sec50">
<label>15</label>
<title>Taxonomy of advanced adversarial attacks</title>
<p>Advanced adversarial attacks extend beyond traditional perturbation-based methods by targeting different phases of the machine learning pipeline and exploiting broader threat surfaces, including training data leakage, physical-world vulnerabilities, and model access modalities. <xref ref-type="table" rid="tab6">Table 6</xref> presents a formal taxonomy of these attack types, classifying them by attack surface, knowledge required, primary objective, attack modality, and notable challenges or constraints.</p>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>Comparative performance of defence mechanisms against adversarial attacks.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Defence mechanism</th>
<th align="left" valign="top">Dataset</th>
<th align="center" valign="top">Accuracy under attack (FGSM/C&#x0026;W)</th>
<th align="left" valign="top">Overhead</th>
<th align="left" valign="top">Generalisability</th>
<th align="left" valign="top">Adaptive attack resilience</th>
<th align="left" valign="top">Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Adversarial Training</td>
<td align="left" valign="top">CIFAR-10</td>
<td align="center" valign="top">74%/63%</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">Medium</td>
<td align="left" valign="top">Moderate</td>
<td align="left" valign="top">Strong against trained-for attacks; brittle under unseen attack variants</td>
</tr>
<tr>
<td align="left" valign="top">Defensive Distillation</td>
<td align="left" valign="top">MNIST</td>
<td align="center" valign="top">85%/60%</td>
<td align="left" valign="top">Moderate</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Reduces sensitivity but vulnerable to gradient masking bypasses</td>
</tr>
<tr>
<td align="left" valign="top">Gradient Masking</td>
<td align="left" valign="top">CIFAR-10</td>
<td align="center" valign="top">~70% (initially)</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Very Low</td>
<td align="left" valign="top">Often gives false sense of security; bypassable by adaptive attacks</td>
</tr>
<tr>
<td align="left" valign="top">Randomised Smoothing</td>
<td align="left" valign="top">ImageNet</td>
<td align="center" valign="top">67%/54%</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">Provable robustness guarantees in L&#x2082; norm; high inference cost</td>
</tr>
<tr>
<td align="left" valign="top">Feature Squeezing</td>
<td align="left" valign="top">MNIST</td>
<td align="center" valign="top">78%/58%</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Low</td>
<td align="left" valign="top">Lightweight but limited to simple perturbations</td>
</tr>
<tr>
<td align="left" valign="top">Ensemble Adversarial Tr.</td>
<td align="left" valign="top">CIFAR-10</td>
<td align="center" valign="top">81%/68%</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">High</td>
<td align="left" valign="top">Moderate</td>
<td align="left" valign="top">Improves robustness by incorporating attacks from multiple models</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Sources: Adapted from <xref ref-type="bibr" rid="ref40">Papernot et al. (2016)</xref>, <xref ref-type="bibr" rid="ref7">Carlini and Wagner (2017a</xref>, <xref ref-type="bibr" rid="ref8">b)</xref>, <xref ref-type="bibr" rid="ref12">Chen et al. (2020a</xref>, <xref ref-type="bibr" rid="ref13">b)</xref>, <xref ref-type="bibr" rid="ref38">Nasr et al. (2023)</xref>, <xref ref-type="bibr" rid="ref6">Breidenbach et al. (2021)</xref>, <xref ref-type="bibr" rid="ref14">Cohen et al. (2019)</xref>, <xref ref-type="bibr" rid="ref5">Bloem da Silveira Junior et al. (2018)</xref>.</p>
</table-wrap-foot>
</table-wrap>
<p>This taxonomy reveals several important distinctions:</p>
<list list-type="bullet">
<list-item>
<p><italic>Attack surface</italic> determines where in the ML lifecycle the adversary operates, at training, inference, or physical deployment.</p>
</list-item>
<list-item>
<p><italic>Modality</italic> (digital, physical, or hardware-level) influences both feasibility and required defences.</p>
</list-item>
<list-item>
<p><italic>Knowledge assumptions</italic> (black-box vs. white-box) determine the accessibility of attack vectors and inform security posture.</p>
</list-item>
<list-item>
<p><italic>Shared traits</italic> across these attacks include their reliance on overfitting, model confidence exposure, and weak regularisation as enabling factors.</p>
</list-item>
</list>
<p>Understanding these structural characteristics enables more targeted defence strategies and prioritisation of threat mitigation based on deployment context and attacker capabilities.</p>
</sec>
<sec id="sec51">
<label>16</label>
<title>Defensive measures</title>
<p>During training, adversarial examples are introduced to enhance the model&#x2019;s robustness to potential threats. This is achieved through ensembling multiple models, which enables the averaging of their respective predictions. As an additional measure, pre-processing techniques such as JPEG compression and image smoothing are utilised to remove adversarial noise from the data. These strategies create a more reliable and accurate model, improving the system&#x2019;s effectiveness.</p>
<p>Defensive distillation is a technique in machine learning that involves training a model to replicate the behaviour of another model. The approach is based on using less extreme output probabilities, which helps to increase the model&#x2019;s robustness and resistance to adversarial attacks. By imitating the behaviour of a more complex model, the distilled model can perform better in real-world scenarios, where it may encounter unexpected inputs or other sources of uncertainty. Defensive distillation is a powerful tool for improving the reliability and safety of AI systems, especially in high-stakes applications such as autonomous driving, medical diagnosis, and financial forecasting.</p>
<p>Several methods can be applied against attacks. One such method is Feature Squeezing, which removes extraneous features from input data, thereby restricting the search space for potential attackers. Another technique is Randomised Input Transformations, which confuses adversaries through the random transformation of inputs during inference. A third approach is Gradient Masking, which renders gradients uninformative to prevent attackers from using them to create adversarial examples. The Detection method involves training auxiliary models to recognise adversarial perturbations instead of trying to achieve complete robustness against them. This requires evaluation of potential adversarial attacks.</p>
<sec id="sec52">
<label>16.1</label>
<title>Comparative evaluation of defence mechanisms against adversarial attacks</title>
<p>While a variety of defence mechanisms have been proposed to mitigate adversarial attacks, such as adversarial training, defensive distillation, and gradient masking&#x2014;their comparative effectiveness varies significantly depending on attack type, model architecture, and dataset. <xref ref-type="table" rid="tab6">Table 6</xref> presents a summary of technical evaluations derived from empirical studies across benchmark datasets (MNIST, CIFAR-10, and ImageNet), focusing on key criteria: robustness improvement (accuracy under attack), computational overhead, generalisability across attacks, and susceptibility to adaptive attacks.</p>
<p>This comparative analysis reveals several key trade-offs:</p>
<list list-type="bullet">
<list-item>
<p><italic>Adversarial training</italic> remains the most widely used and effective defence, especially for FGSM-style perturbations. However, it tends to overfit to known attacks and requires substantial computational resources for training.</p>
</list-item>
<list-item>
<p><italic>Defensive distillation</italic> offers moderate robustness by smoothing decision boundaries, yet it is easily circumvented by stronger attacks (e.g., C&#x0026;W), as it inadvertently introduces gradient obfuscation.</p>
</list-item>
<list-item>
<p><italic>Gradient masking</italic>, though computationally inexpensive, often leads to poor generalisability and has been demonstrated to be ineffective under adaptive threat models.</p>
</list-item>
<list-item>
<p><italic>Randomised smoothing</italic> provides theoretical robustness bounds under certain perturbation norms, but incurs substantial latency at inference and has limited practical adoption.</p>
</list-item>
<list-item>
<p><italic>Ensemble adversarial training</italic> improves transfer robustness and mitigates overfitting to a specific attack strategy, albeit at increased training cost.</p>
</list-item>
</list>
<p>Effectiveness of individual defence strategies depends on the threat model, available computational budget, and tolerance for inference latency. The diversity of trade-offs requires defence-in-depth architectures that combine multiple strategies for layered robustness.</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec53">
<label>17</label>
<title>Discussion</title>
<p>Empirical evidence accumulated over the past decade shows that comprehensive knowledge of a model&#x2019;s internals is no longer a prerequisite for producing high-confidence adversarial failures. Gradient-free bandit optimisers, finite-difference estimators, and transfer-based strategies now attain misclassification rates on ImageNet that approach those of canonical white-box attacks, demonstrating that decision boundaries learned by modern architectures remain highly correlated even when the parameters are hidden. This observation calls into question evaluation protocols that rely exclusively on fast, single-step perturbations such as FGSM or on a limited set of norm-bounded iterative attacks.</p>
<p>Robustness is also demonstrably task-dependent. Adversarial training with projected-gradient descent improves <italic>&#x03B5;</italic>&#x202F;=&#x202F;8/255 accuracy on CIFAR-10 to roughly 47%, yet under identical perturbation budgets the same procedure affords fewer than 20% robust accuracy on ImageNet. When the perturbations are extended from pixel noise to geometric distortions&#x2014;two-degree rotations or sub-pixel translations in autonomous-driving imagery&#x2014;the effective accuracy drop is larger still, pointing to a gap between standard benchmark metrics and the failure modes that dominate physical deployments.</p>
<p>Privacy leakage scales with both model capacity and the granularity of the outputs released to the user. Membership-inference and model-inversion attacks achieve 60&#x2013;70% accuracy on unprotected CIFAR-10 classifiers but fall to near-random levels once training is performed with differential-privacy stochastic gradient descent at &#x03B5;&#x202F;&#x2248;&#x202F;1. The modest two-to-four-percentage-point reduction in clean accuracy observed in these experiments suggests that rigorous privacy guarantees and practical utility are not mutually exclusive.</p>
<p>No single defensive mechanism exhibits universal efficacy. Defensive distillation, feature squeezing, and gradient masking all deteriorate under adaptive evaluation; only multi-layered strategies that combine adversarial training, randomised smoothing, differential privacy, and interface hardening provide measurable resilience, and even these solutions leave a gap of more than 50 percentage points between clean and robust accuracy on large-scale image classification. Large-language models present an additional alignment problem: automated red-teaming frameworks such as MAD-MAX attain jailbreak success rates exceeding 95% on GPT-4-class systems with an average of 11 queries, revealing that reinforcement-learning-based alignment alone is insufficient to prevent malicious prompt injection.</p>
<p>Finally, supply-chain attacks on pre-trained weights, data-set poisoning in self-supervised learning, and logic bombs embedded in parameter-efficient adapters highlight that adversarial robustness must be considered across the entire model lifecycle&#x2014;data acquisition, training, distribution, and inference monitoring&#x2014;rather than at a single frozen checkpoint.</p>
</sec>
<sec sec-type="conclusions" id="sec54">
<label>18</label>
<title>Conclusion</title>
<p>This work advances the study of adversarial AI security by reframing robustness as a system-level property of agentic artificial intelligence rather than a narrowly defined characteristic of individual models or inputs. By formalising adversarial risk across perceptual, cognitive, and executive layers, the proposed framework extends classical adversarial machine-learning theory to account for autonomy, self-governance, and closed-loop decision-making. This shift is essential for analysing modern AI systems whose behaviour emerges over time through feedback, planning, and interaction with dynamic environments.</p>
<p>A central insight of this study is that adversarial vulnerabilities cannot be fully understood, or mitigated, through input-level defences alone. While perceptual attacks remain the most empirically validated, higher-order failures arise when erroneous perceptions propagate through internal reasoning, policy formation, and actuation. The analysis demonstrates that architectural properties such as feedback dynamics, memory, and goal specification fundamentally shape adversarial behaviour, transferability, and impact. Consequently, robustness must be treated as an emergent property of the entire agent&#x2013;environment system rather than as a static performance metric.</p>
<p>Importantly, this paper does not position higher-order agentic adversarial attacks as empirically settled phenomena. Instead, they are framed as hypothesis-driven, architecturally motivated risks that demand systematic investigation. The absence of standardised benchmarks for autonomy, long-horizon decision-making, and behavioural integrity represents a critical gap in current research. Addressing this gap requires new evaluation environments, metrics that capture cumulative behavioural deviation and policy drift, and defence mechanisms that integrate perceptual robustness with governance, verification, and runtime oversight.</p>
<p>Beyond consolidating existing adversarial knowledge, the primary contribution of this work lies in defining a coherent research agenda for agentic AI security. By unifying adversarial machine learning, systems safety, and control-theoretic perspectives, the framework provides a foundation for studying resilience, stability, and alignment in autonomous AI systems. As AI agents are increasingly deployed in safety-critical and socially consequential contexts, such system-level approaches will be indispensable for ensuring not only accuracy, but trustworthy and accountable behaviour over time.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec55">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="sec56">
<title>Author contributions</title>
<p>PR: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. OS: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. CM: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec57">
<title>Conflict of interest</title>
<p>Author OS was employed by Cisco Systems, RTP.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec58">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was used in the creation of this manuscript. Grammarly was used for spell check and grammar improvement.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec59">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Acharya</surname><given-names>D. B.</given-names></name> <name><surname>Kuppan</surname><given-names>K.</given-names></name> <name><surname>Access</surname><given-names>B. D.</given-names></name></person-group> (<year>2025</year>). <article-title>Agentic AI: autonomous intelligence for complex goals - a comprehensive survey</article-title>. <source><italic>IEEE Access</italic></source> <volume>3</volume>, <fpage>18912</fpage>&#x2013;<lpage>18936</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://ieeexplore.ieee.org/abstract/document/10849561/" ext-link-type="uri">https://ieeexplore.ieee.org/abstract/document/10849561/</ext-link>. (Accessed 18 August 2025)</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Anthi</surname><given-names>E.</given-names></name> <name><surname>Williams</surname><given-names>L.</given-names></name> <name><surname>Rhode</surname><given-names>M.</given-names></name> <name><surname>Burnap</surname><given-names>P.</given-names></name> <name><surname>Wedgbury</surname><given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Adversarial attacks on machine learning cybersecurity defences in industrial control systems</article-title>. <source>J. Inf. Secur. Appl.</source> <volume>58</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>.</mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aria</surname><given-names>M.</given-names></name> <name><surname>Cuccurullo</surname><given-names>C.</given-names></name></person-group> (<year>2017</year>). <article-title>Bibliometrix: an R-tool for comprehensive science mapping analysis</article-title>. <source>J. Informet.</source> <volume>11</volume>, <fpage>959</fpage>&#x2013;<lpage>975</lpage>.</mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Balasubramanian</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>Intrinsically motivated multi-goal reinforcement learning using robotics environment integrated with OpenAI gym</article-title>. <source><italic>Journal of Science &#x0026; Technology</italic></source> <volume>4</volume>, <fpage>46</fpage>&#x2013;<lpage>60</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://www.thesciencebrigade.com/jst/article/view/21" ext-link-type="uri">https://www.thesciencebrigade.com/jst/article/view/21</ext-link> (Accessed 20 September 2024)</mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bloem da Silveira Junior</surname><given-names>L. A.</given-names></name> <name><surname>Vasconcellos</surname><given-names>E.</given-names></name> <name><surname>Vasconcellos Guedes</surname><given-names>L.</given-names></name> <name><surname>Guedes</surname><given-names>L. F. A.</given-names></name> <name><surname>Costa</surname><given-names>R. M.</given-names></name></person-group> (<year>2018</year>). <article-title>Technology roadmapping: a methodological proposition to refine Delphi results</article-title>. <source><italic>Technological Forecasting and Social Change</italic></source> <volume>126</volume>, <fpage>194</fpage>&#x2013;<lpage>206</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://www.sciencedirect.com/science/article/pii/S0040162516306734" ext-link-type="uri">https://www.sciencedirect.com/science/article/pii/S0040162516306734</ext-link> (Accessed 18 March 2018)</mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Breidenbach</surname><given-names>L.</given-names></name> <name><surname>Cachin</surname><given-names>C.</given-names></name> <name><surname>Chan</surname><given-names>B.</given-names></name> <name><surname>Coventry</surname><given-names>A.</given-names></name> <name><surname>Ellis</surname><given-names>S.</given-names></name> <name><surname>Juels</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>.) <article-title>Chainlink 2.0: next steps in the evolution of decentralized Oracle networks</article-title>.</mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Carlini</surname><given-names>N.</given-names></name> <name><surname>Wagner</surname><given-names>D.</given-names></name></person-group> (<year>2017a</year>). &#x201C;<article-title>Adversarial examples are not easily detected: bypassing ten detection methods</article-title>&#x201D; in <source>AISec 2017 - proceedings of the 10th ACM workshop on artificial intelligence and security, co-located with CCS 2017</source>, <fpage>3</fpage>&#x2013;<lpage>14</lpage>.</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Carlini</surname><given-names>N.</given-names></name> <name><surname>Wagner</surname><given-names>D.</given-names></name></person-group> (<year>2017b</year>). <article-title>MagNet and &#x2018;efficient Defenses against adversarial attacks&#x2019; are not robust to adversarial examples</article-title>. <comment>Available from:</comment> <ext-link xlink:href="http://arxiv.org/abs/1711.08478" ext-link-type="uri">http://arxiv.org/abs/1711.08478</ext-link> (Accessed 15 November 2024)</mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll1">CCPA</collab></person-group> (<year>2018</year>). <source><italic>California consumer privacy act (CCPA) | state of California - Department of Justice - Office of the Attorney General</italic></source>. <comment>Available from:</comment> <ext-link xlink:href="https://oag.ca.gov/privacy/ccpa" ext-link-type="uri">https://oag.ca.gov/privacy/ccpa</ext-link> (Accessed 20 September 2023)</mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chejara</surname><given-names>P.</given-names></name> <name><surname>Garg</surname><given-names>U.</given-names></name> <name><surname>Singh</surname><given-names>G.</given-names></name></person-group> (<year>2013</year>). <article-title>Vulnerability analysis in attack graphs using conditional probability</article-title>. <source>International Journal of Soft Computing and Engineering (IJSCE)</source> <volume>13</volume>, <fpage>18</fpage>&#x2013;<lpage>21</lpage>.</mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>K.</given-names></name> <name><surname>Muyang</surname><given-names>L.</given-names></name> <name><surname>Li</surname><given-names>G.</given-names></name> <name><surname>Zhang</surname><given-names>S.</given-names></name> <name><surname>Guo</surname><given-names>S.</given-names></name> <name><surname>Zhang</surname><given-names>T.</given-names></name></person-group> (<year>2025</year>). <source>TRUST-VLM: Thorough red-teaming for uncovering safety threats in vision-language models</source>.</mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>S.</given-names></name> <name><surname>Carlini</surname><given-names>N.</given-names></name><collab id="coll2">on, D. W.-P. of the 1st A. W</collab></person-group>. (<year>2020a</year>), <source>Undefined, n.d. Stateful detection of black-box adversarial attacks. dl.acm.org</source>. <comment>Available from:</comment> <ext-link xlink:href="https://dl.acm.org/doi/abs/10.1145/3385003.3410925" ext-link-type="uri">https://dl.acm.org/doi/abs/10.1145/3385003.3410925</ext-link></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>S.</given-names></name> <name><surname>Carlini</surname><given-names>N.</given-names></name> <name><surname>Wagner</surname><given-names>D.</given-names></name></person-group> (<year>2020b</year>). &#x201C;<article-title>Stateful detection of black-box adversarial attacks</article-title>&#x201D; in <source>SPAI 2020 - proceedings of the 1st ACM workshop on security and privacy on artificial intelligent, co-located with AsiaCCS 2020</source>, <fpage>30</fpage>&#x2013;<lpage>39</lpage>.</mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Cohen</surname><given-names>R.</given-names></name> <name><surname>Humphries</surname><given-names>J.</given-names></name> <name><surname>Risk</surname><given-names>S.</given-names></name></person-group> (<year>2019</year>) <source>An investigation of cyber loss data and its links to operational risk. <ext-link xlink:href="http://papers.ssrn.com" ext-link-type="uri"><italic>papers.ssrn.com</italic></ext-link></source>. <comment>Available from:</comment> <ext-link xlink:href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3459457" ext-link-type="uri">https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3459457</ext-link> (Accessed 11 December 2019)</mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Costa</surname><given-names>J. C.</given-names></name> <name><surname>Roxo</surname><given-names>T.</given-names></name> <name><surname>Proen&#x00E7;a</surname><given-names>H.</given-names></name> <name><surname>Member</surname><given-names>S.</given-names></name> <name><surname>In&#x00E1;cio</surname><given-names>M.</given-names></name> <name><surname>P</surname><given-names>R.</given-names></name></person-group> (<year>2023</year>). <article-title>How deep learning sees the world: a survey on adversarial attacks &#x0026; Defenses</article-title>. <source><italic>ARXIV</italic></source> <volume>12</volume>, <fpage>61113</fpage>&#x2013;<lpage>61136</lpage>. <comment>Available from:</comment> <ext-link xlink:href="http://arxiv.org/abs/2305.10862" ext-link-type="uri">http://arxiv.org/abs/2305.10862</ext-link></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Duan</surname><given-names>J.</given-names></name> <name><surname>Guan</surname><given-names>Y.</given-names></name> <name><surname>Li</surname><given-names>S. E.</given-names></name> <name><surname>Ren</surname><given-names>Y.</given-names></name> <name><surname>Sun</surname><given-names>Q.</given-names></name> <name><surname>Cheng</surname><given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>Distributional soft actor-critic: off-policy reinforcement learning for addressing value estimation errors</article-title>. <source>IEEE Trans. Neural Networks Learn. Syst.</source> <volume>33</volume>, <fpage>6584</fpage>&#x2013;<lpage>6598</lpage>.</mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Fredrikson</surname><given-names>M.</given-names></name> <name><surname>Jha</surname><given-names>S.</given-names></name> <name><surname>Ristenpart</surname><given-names>T.</given-names></name></person-group> (<year>2015</year>). &#x201C;<article-title>Model inversion attacks that exploit confidence information and basic countermeasures</article-title>&#x201D; in <source><italic>Proceedings of the ACM conference on computer and communications security</italic></source>, vol. <volume>2015-October</volume>, <fpage>1322</fpage>&#x2013;<lpage>1333</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://dl.acm.org/doi/10.1145/2810103.2813677" ext-link-type="uri">https://dl.acm.org/doi/10.1145/2810103.2813677</ext-link>.</mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll4">GDPR</collab></person-group> (<year>2018</year>). <article-title>What is GDPR, the EU&#x2019;S new data protection law? - GDPR.EU</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://gdpr.eu/what-is-gdpr/" ext-link-type="uri">https://gdpr.eu/what-is-gdpr/</ext-link>. (Accessed 7 July 2023)</mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Goodfellow</surname><given-names>I.</given-names></name> <name><surname>Papernot</surname><given-names>N.</given-names></name> <name><surname>McDaniel</surname><given-names>P.</given-names></name></person-group> (<year>2016</year>). <article-title>Cleverhans v0.1: an adversarial machine learning library</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://github.com/openai/cleverhans" ext-link-type="uri">https://github.com/openai/cleverhans</ext-link>.</mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goodfellow</surname><given-names>I.</given-names></name> <name><surname>Pouget-Abadie</surname><given-names>J.</given-names></name> <name><surname>Mirza</surname><given-names>M.</given-names></name> <name><surname>Xu</surname><given-names>B.</given-names></name> <name><surname>Warde-Farley</surname><given-names>D.</given-names></name> <name><surname>Ozair</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Generative adversarial networks</article-title>. <source><italic>Communications of the ACM</italic></source> <volume>63</volume>, <fpage>139</fpage>&#x2013;<lpage>144</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/abs/1406.2661v1" ext-link-type="uri">https://arxiv.org/abs/1406.2661v1</ext-link>. (Accessed 19 September 2024)</mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goyal</surname><given-names>S.</given-names></name> <name><surname>Doddapaneni</surname><given-names>S.</given-names></name> <name><surname>Khapra</surname><given-names>M. M.</given-names></name> <name><surname>Ravindran</surname><given-names>B.</given-names></name></person-group> (<year>2023</year>). <article-title>A survey of adversarial Defenses and robustness in NLP</article-title>. <source><italic>ACM Computing Surveys</italic></source> <volume>55</volume> <comment>Available from:</comment> <ext-link xlink:href="https://dl.acm.org/doi/10.1145/3593042" ext-link-type="uri">https://dl.acm.org/doi/10.1145/3593042</ext-link></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname><given-names>A.</given-names></name> <name><surname>Mendonca</surname><given-names>R.</given-names></name> <name><surname>Liu</surname><given-names>Y.</given-names></name> <name><surname>Abbeel</surname><given-names>P.</given-names></name> <name><surname>Levine</surname><given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Meta-reinforcement learning of structured exploration strategies</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>31</volume>.</mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Haarnoja</surname><given-names>T.</given-names></name> <name><surname>Zhou</surname><given-names>A.</given-names></name> <name><surname>Abbeel</surname><given-names>P.</given-names></name> <name><surname>Levine</surname><given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://proceedings.mlr.press/v80/haarnoja18b.html" ext-link-type="uri">https://proceedings.mlr.press/v80/haarnoja18b.html</ext-link> (Accessed 19 September 2024)</mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll5">ICO</collab></person-group> (<year>2018</year>). <source><italic>Information commissioner&#x2019;s office (ICO): The UK GDPR</italic>. UK GDPR guidance and resources</source>. <comment>Available from:</comment> <ext-link xlink:href="https://ico.org.uk/for-organisations/data-protection-and-the-eu/data-protection-and-the-eu-in-detail/the-uk-gdpr/" ext-link-type="uri">https://ico.org.uk/for-organisations/data-protection-and-the-eu/data-protection-and-the-eu-in-detail/the-uk-gdpr/</ext-link> (Accessed 8 July 2023)</mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jerbi</surname><given-names>S.</given-names></name> <name><surname>Gyurik</surname><given-names>C.</given-names></name> <name><surname>Marshall</surname><given-names>S. C.</given-names></name> <name><surname>Briegel</surname><given-names>H. J.</given-names></name> <name><surname>Dunjko</surname><given-names>V.</given-names></name></person-group> (<year>2021</year>). <article-title>Parametrized quantum policies for reinforcement learning</article-title>. <source><italic>Advances in Neural Information Processing Systems</italic></source> <volume>34</volume>, <fpage>28362</fpage>&#x2013;<lpage>28375</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/pdf/2103.05577" ext-link-type="uri">https://arxiv.org/pdf/2103.05577</ext-link> (Accessed 01 September 2025)</mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khamaiseh</surname><given-names>S. Y.</given-names></name> <name><surname>Bagagem</surname><given-names>D.</given-names></name> <name><surname>Al-Alaj</surname><given-names>A.</given-names></name> <name><surname>Mancino</surname><given-names>M.</given-names></name> <name><surname>Alomari</surname><given-names>H. W.</given-names></name></person-group> (<year>2022</year>). <article-title>Adversarial deep learning: a survey on adversarial attacks and Defense mechanisms on image classification</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>102266</fpage>&#x2013;<lpage>102291</lpage>.</mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Kuutti</surname><given-names>S.</given-names></name> <name><surname>Bowden</surname><given-names>R.</given-names></name> <name><surname>Joshi</surname><given-names>H.</given-names></name> <name><surname>De Temple</surname><given-names>R.</given-names></name> <name><surname>Fallah</surname><given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>End-to-end reinforcement learning for autonomous longitudinal control using advantage actor critic with temporal context</article-title>. <source>2019 IEEE intelligent transportation systems conference, ITSC 2019</source>, <fpage>2456</fpage>&#x2013;<lpage>2462</lpage>.</mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lang</surname><given-names>S.</given-names></name> <name><surname>Kuetgens</surname><given-names>M.</given-names></name> <name><surname>Reichardt</surname><given-names>P.</given-names></name> <name><surname>Reggelin</surname><given-names>T.</given-names></name></person-group> (<year>2021</year>). <article-title>Modeling production scheduling problems as reinforcement learning environments based on discrete-event simulation and OpenAI gym</article-title>. <source>IFAC-PapersOnLine</source> <volume>54</volume>, <fpage>793</fpage>&#x2013;<lpage>798</lpage>.</mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Lapan</surname><given-names>M.</given-names></name></person-group> (<year>2018</year>). <source><italic>Deep reinforcement learning hands-on: Apply modern RL methods, with deep Q-networks, value iteration, policy gradients, TRPO, AlphaGo zero and more</italic></source>. <comment>Available from:</comment> <ext-link xlink:href="https://books.google.com/books?hl=en&#x0026;lr=&#x0026;id=xKdhDwAAQBAJ&#x0026;oi=fnd&#x0026;pg=PP1&#x0026;dq=AlphaGo&#x0026;ots=wUgilj2h7C&#x0026;sig=ZNDOd-qx8Hw3xK3sA6E4ZEsNB6A" ext-link-type="uri">https://books.google.com/books?hl=en&#x0026;lr=&#x0026;id=xKdhDwAAQBAJ&#x0026;oi=fnd&#x0026;pg=PP1&#x0026;dq=AlphaGo&#x0026;ots=wUgilj2h7C&#x0026;sig=ZNDOd-qx8Hw3xK3sA6E4ZEsNB6A</ext-link> (Accessed 20 October 2023)</mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liang</surname><given-names>H.</given-names></name> <name><surname>He</surname><given-names>E.</given-names></name> <name><surname>Zhao</surname><given-names>Y.</given-names></name> <name><surname>Jia</surname><given-names>Z.</given-names></name> <name><surname>Li</surname><given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Adversarial attack and Defense: a survey</article-title>. <source><italic>Electronics 2022, Vol. 11, Page 1283</italic></source> <volume>11</volume>:<fpage>1283</fpage>. <comment>Available from:</comment> <ext-link xlink:href="https://www.mdpi.com/2079-9292/11/8/1283/htm" ext-link-type="uri">https://www.mdpi.com/2079-9292/11/8/1283/htm</ext-link>. (Accessed 30 May 2024)</mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Macas</surname><given-names>M.</given-names></name> <name><surname>Wu</surname><given-names>C.</given-names></name> <name><surname>Fuertes</surname><given-names>W.</given-names></name></person-group> (<year>2024</year>). <article-title>Adversarial examples: a survey of attacks and defenses in deep learning-enabled cybersecurity systems</article-title>. <source>Expert Syst. Appl.</source> <volume>238</volume>:<fpage>122223</fpage>.</mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Maple</surname><given-names>C.</given-names></name> <name><surname>Bradbury</surname><given-names>M.</given-names></name> <name><surname>Le</surname><given-names>A. T.</given-names></name> <name><surname>Ghirardello</surname><given-names>K.</given-names></name></person-group> (<year>2019</year>). <article-title>A connected and autonomous vehicle reference architecture for attack surface analysis</article-title>. <source><italic>Applied Sciences</italic></source> <volume>9</volume>:<fpage>5101</fpage>. <comment>Available from:</comment> <ext-link xlink:href="https://www.mdpi.com/2076-3417/9/23/5101" ext-link-type="uri">https://www.mdpi.com/2076-3417/9/23/5101</ext-link></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mendez</surname><given-names>J. A.</given-names></name> <name><surname>Shivkumar</surname><given-names>S.</given-names></name> <name><surname>Eaton</surname><given-names>E.</given-names></name></person-group> (<year>2018</year>). <article-title>Lifelong inverse reinforcement learning</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>31</volume>.</mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Moosavi-Dezfooli</surname><given-names>S. M.</given-names></name> <name><surname>Fawzi</surname><given-names>A.</given-names></name> <name><surname>Fawzi</surname><given-names>O.</given-names></name> <name><surname>Frossard</surname><given-names>P.</given-names></name></person-group> (<year>2016a</year>). <article-title>Universal adversarial perturbations</article-title>. <source><italic>Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017</italic></source> <volume>2017-January</volume>, <fpage>86</fpage>&#x2013;<lpage>94</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/pdf/1610.08401" ext-link-type="uri">https://arxiv.org/pdf/1610.08401</ext-link> (Accessed 22 July 2025)</mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Moosavi-Dezfooli</surname><given-names>S. M.</given-names></name> <name><surname>Fawzi</surname><given-names>A.</given-names></name> <name><surname>Frossard</surname><given-names>P.</given-names></name></person-group> (<year>2016b</year>). &#x201C;<article-title>DeepFool: a simple and accurate method to fool deep neural networks</article-title>&#x201D; in <source>2016 IEEE conference on computer vision and pattern recognition (CVPR)</source>, vol. <volume>2016-December</volume>, <fpage>2574</fpage>&#x2013;<lpage>2582</lpage>.</mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Moosavi-Dezfooli</surname><given-names>S.-M.</given-names></name> <name><surname>Fawzi</surname><given-names>A.</given-names></name> <name><surname>Frossard</surname><given-names>P.</given-names></name></person-group> (<year>2015</year>). <article-title>DeepFool: a simple and accurate method to fool deep neural networks</article-title>. <source><italic>Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition</italic></source> <volume>2016-December</volume>, <fpage>2574</fpage>&#x2013;<lpage>2582</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/pdf/1511.04599" ext-link-type="uri">https://arxiv.org/pdf/1511.04599</ext-link></mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Mukhopadhyay</surname><given-names>R.</given-names></name> <name><surname>Bandyopadhyay</surname><given-names>S.</given-names></name> <name><surname>Sutradhar</surname><given-names>A.</given-names></name> <name><surname>Chattopadhyay</surname><given-names>P.</given-names></name></person-group> (<year>2019</year>). &#x201C;<article-title>Performance analysis of deep Q networks and advantage actor critic algorithms in designing reinforcement learning-based self-tuning PID controllers</article-title>&#x201D; in <source><italic>2019 IEEE Bombay section signature conference, IBSSC 2019</italic>, 2019-January</source>.</mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Nasr</surname><given-names>M.</given-names></name> <name><surname>Hayes</surname><given-names>J.</given-names></name> <name><surname>Steinke</surname><given-names>T.</given-names></name> <name><surname>Balle</surname><given-names>B.</given-names></name> <name><surname>Tram&#x00E8;r</surname><given-names>F.</given-names></name> <etal/></person-group>. (<year>2023</year>). &#x201C;<article-title>Tight auditing of differentially private machine learning</article-title>&#x201D; in <source>Proceedings of the 32nd USENIX Security Symposium</source> (<publisher-loc>Anaheim, CA, USA</publisher-loc>: <publisher-name>USENIX Association</publisher-name>). <comment>Available from:</comment> <ext-link xlink:href="https://www.usenix.org/conference/usenixsecurity23/presentation/nasr" ext-link-type="uri">https://www.usenix.org/conference/usenixsecurity23/presentation/nasr</ext-link> (Accessed 21 April 2025)</mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ozdag</surname><given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Adversarial attacks and defenses against deep neural networks: a survey</article-title>. <source>Procedia Computer Science</source> <volume>140</volume>, <fpage>152</fpage>&#x2013;<lpage>161</lpage>.</mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Papernot</surname><given-names>N.</given-names></name> <name><surname>Faghri</surname><given-names>F.</given-names></name> <name><surname>Carlini</surname><given-names>N.</given-names></name> <name><surname>Goodfellow</surname><given-names>I.</given-names></name> <name><surname>Feinman</surname><given-names>R.</given-names></name> <name><surname>Kurakin</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Technical report on the CleverHans v2.1.0 adversarial examples library</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/abs/1610.00768v6" ext-link-type="uri">https://arxiv.org/abs/1610.00768v6</ext-link> (Accessed 19 September 2024)</mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Pathade</surname><given-names>C.</given-names></name></person-group> (<year>2025</year>). <article-title>Red teaming the mind of the machine: a systematic evaluation of prompt injection and jailbreak vulnerabilities in LLMs</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/pdf/2505.04806" ext-link-type="uri">https://arxiv.org/pdf/2505.04806</ext-link> (Accessed 23 July 2025)</mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Peloquin</surname><given-names>D.</given-names></name> <name><surname>DiMaio</surname><given-names>M.</given-names></name> <name><surname>Bierer</surname><given-names>B.</given-names></name> <name><surname>Barnes</surname><given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>Disruptive and avoidable: GDPR challenges to secondary research uses of data</article-title>. <source><italic>European Journal of Human Genetics 2020 28:6</italic></source> <volume>28</volume>, <fpage>697</fpage>&#x2013;<lpage>705</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://www.nature.com/articles/s41431-020-0596-x" ext-link-type="uri">https://www.nature.com/articles/s41431-020-0596-x</ext-link></mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname><given-names>K.</given-names></name> <name><surname>Zheng</surname><given-names>T.</given-names></name> <name><surname>Qin</surname><given-names>Z.</given-names></name> <name><surname>Liu</surname><given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>Adversarial attacks and defenses in deep learning</article-title>. <source>Engineering</source> <volume>6</volume>, <fpage>346</fpage>&#x2013;<lpage>360</lpage>.</mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schlatt</surname><given-names>V.</given-names></name> <name><surname>Guggenberger</surname><given-names>T.</given-names></name> <name><surname>Schmid</surname><given-names>J.</given-names></name> <name><surname>Urbach</surname><given-names>N.</given-names></name></person-group> (<year>2023</year>). <article-title>Attacking the trust machine: developing an information systems research agenda for blockchain cybersecurity</article-title>. <source>Int. J. Inf. Manag.</source> <volume>68</volume>:<fpage>102470</fpage>.</mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Schoepf</surname><given-names>S.</given-names></name> <name><surname>Hameed</surname><given-names>M. Z.</given-names></name> <name><surname>Rawat</surname><given-names>A.</given-names></name> <name><surname>Fraser</surname><given-names>K.</given-names></name> <name><surname>Zizzo</surname><given-names>G.</given-names></name> <name><surname>Cornacchia</surname><given-names>G.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>MAD-MAX: modular and diverse malicious attack MiXtures for automated LLM red teaming</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/pdf/2503.06253" ext-link-type="uri">https://arxiv.org/pdf/2503.06253</ext-link> (Accessed 23 July 2025)</mixed-citation></ref>
<ref id="ref46"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sewak</surname><given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>Actor-critic models and the A3C</article-title>. <source><italic>Deep Reinforcement Learning</italic></source>, <fpage>141</fpage>&#x2013;<lpage>152</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://link.springer.com/chapter/10.1007/978-981-13-8285-7_11" ext-link-type="uri">https://link.springer.com/chapter/10.1007/978-981-13-8285-7_11</ext-link>.</mixed-citation></ref>
<ref id="ref47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shenoy</surname><given-names>K. S.</given-names></name> <name><surname>Sheth</surname><given-names>D. Y.</given-names></name> <name><surname>Behera</surname><given-names>B. K.</given-names></name> <name><surname>Panigrahi</surname><given-names>P. K.</given-names></name></person-group> (<year>2020</year>). <article-title>Demonstration of a measurement-based adaptation protocol with quantum reinforcement learning on the IBM Q experience platform</article-title>. <source><italic>Quantum Information Processing</italic></source> <volume>19</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://link.springer.com/article/10.1007/s11128-020-02657-x" ext-link-type="uri">https://link.springer.com/article/10.1007/s11128-020-02657-x</ext-link></mixed-citation></ref>
<ref id="ref48"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shokri</surname><given-names>R.</given-names></name> <name><surname>Stronati</surname><given-names>M.</given-names></name> <name><surname>Song</surname><given-names>C.</given-names></name> <name><surname>Shmatikov</surname><given-names>V.</given-names></name></person-group> (<year>2017</year>). <article-title>Membership inference attacks against machine learning models</article-title>. <source>Proceedings - IEEE Symposium on Security and Privacy</source>, <fpage>3</fpage>&#x2013;<lpage>18</lpage>.</mixed-citation></ref>
<ref id="ref49"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Sutton</surname><given-names>R. S.</given-names></name> <name><surname>Barto</surname><given-names>A. G.</given-names></name></person-group> (<year>1998</year>). <source><italic>Reinforcement learning: an introduction</italic></source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</mixed-citation></ref>
<ref id="ref50"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Szegedy</surname><given-names>C.</given-names></name> <name><surname>Liu</surname><given-names>W.</given-names></name> <name><surname>Jia</surname><given-names>Y.</given-names></name> <name><surname>Sermanet</surname><given-names>P.</given-names></name> <name><surname>Reed</surname><given-names>S.</given-names></name> <name><surname>Anguelov</surname><given-names>D.</given-names></name> <etal/></person-group>. (<year>2015</year>). &#x201C;<article-title>Going deeper with convolutions</article-title>&#x201D; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. (Accessed 9 October 2023)</mixed-citation></ref>
<ref id="ref51"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Szegedy</surname><given-names>C.</given-names></name> <name><surname>Zaremba</surname><given-names>W.</given-names></name> <name><surname>Sutskever</surname><given-names>I.</given-names></name> <name><surname>Bruna</surname><given-names>J.</given-names></name> <name><surname>Erhan</surname><given-names>D.</given-names></name> <name><surname>Goodfellow</surname><given-names>I.</given-names></name> <etal/></person-group>. (<year>2013</year>). &#x201C;<article-title>Intriguing properties of neural networks</article-title>&#x201D; in <source><italic>2nd international conference on learning representations, ICLR 2014 - conference track proceedings</italic></source>. <comment>Available from:</comment> <ext-link xlink:href="https://arxiv.org/abs/1312.6199v4" ext-link-type="uri">https://arxiv.org/abs/1312.6199v4</ext-link>. (Accessed 9 October 2023)</mixed-citation></ref>
<ref id="ref52"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Hu</surname><given-names>J.</given-names></name> <name><surname>Min</surname><given-names>G.</given-names></name> <name><surname>Zhan</surname><given-names>W.</given-names></name> <name><surname>Ni</surname><given-names>Q.</given-names></name> <name><surname>Georgalas</surname><given-names>N.</given-names></name></person-group> (<year>2019</year>). <article-title>Computation offloading in multi-Access edge computing using a deep sequential model based on reinforcement learning</article-title>. <source>IEEE Commun. Mag.</source> <volume>57</volume>, <fpage>64</fpage>&#x2013;<lpage>69</lpage>.</mixed-citation></ref>
<ref id="ref53"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>W.</given-names></name> <name><surname>Di Maio</surname><given-names>F.</given-names></name> <name><surname>Zio</surname><given-names>E.</given-names></name></person-group> (<year>2019</year>). <article-title>Adversarial risk analysis to allocate optimal defense resources for protecting cyber&#x2013;physical systems from cyber attacks</article-title>. <source><italic>Risk Analysis</italic></source> <volume>39</volume>, <fpage>2766</fpage>&#x2013;<lpage>2785</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://onlinelibrary.wiley.com/doi/abs/10.1111/risa.13382" ext-link-type="uri">https://onlinelibrary.wiley.com/doi/abs/10.1111/risa.13382</ext-link></mixed-citation></ref>
<ref id="ref54"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Wu</surname><given-names>X.</given-names></name> <name><surname>Fredrikson</surname><given-names>M.</given-names></name></person-group> (<year>2024</year>). <source>A methodology for formalizing model-inversion attacks</source>. <comment>Available from:</comment> <ext-link xlink:href="https://ieeexplore.ieee.org/abstract/document/7536387/" ext-link-type="uri">https://ieeexplore.ieee.org/abstract/document/7536387/</ext-link> (Accessed 16 November 2024)</mixed-citation></ref>
<ref id="ref55"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname><given-names>Q.</given-names></name> <name><surname>Sim&#x00E3;o</surname><given-names>T. D.</given-names></name> <name><surname>Tindemans</surname><given-names>S. H.</given-names></name> <name><surname>Spaan</surname><given-names>M. T. J.</given-names></name></person-group> (<year>2021</year>). <article-title>WCSAC: worst-case soft actor critic for safety-constrained reinforcement learning</article-title>. <source><italic>Proceedings of the AAAI Conference on Artificial Intelligence</italic></source> <volume>35</volume>, <fpage>10639</fpage>&#x2013;<lpage>10646</lpage>. <comment>Available from:</comment> <ext-link xlink:href="https://ojs.aaai.org/index.php/AAAI/article/view/17272" ext-link-type="uri">https://ojs.aaai.org/index.php/AAAI/article/view/17272</ext-link> (Accessed 19 September 2024)</mixed-citation></ref>
<ref id="ref56"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ye</surname><given-names>N.</given-names></name> <name><surname>Farley</surname><given-names>T.</given-names></name> <name><surname>Lakshminarasimhan</surname><given-names>D.</given-names></name></person-group> (<year>2006</year>). <article-title>An attack-norm separation approach for detecting cyber attacks</article-title>. <source><italic>Information Systems Frontiers</italic></source> <volume>8</volume>, <fpage>163</fpage>&#x2013;<lpage>177</lpage>. <comment>Available from:</comment> <ext-link xlink:href="http://link.springer.com/10.1007/s10796-006-8731-y" ext-link-type="uri">http://link.springer.com/10.1007/s10796-006-8731-y</ext-link> (Accessed 16 October 2019)</mixed-citation></ref>
<ref id="ref57"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Yu</surname><given-names>T.</given-names></name> <name><surname>Quillen</surname><given-names>D.</given-names></name> <name><surname>He</surname><given-names>Z.</given-names></name> <name><surname>Julian</surname><given-names>R.</given-names></name> <name><surname>Hausman</surname><given-names>K.</given-names></name> <name><surname>Finn</surname><given-names>C.</given-names></name> <name><surname>Levine</surname><given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>Meta-world: a benchmark and evaluation for multi-task and meta reinforcement learning</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://proceedings.mlr.press/v100/yu20a.html" ext-link-type="uri">https://proceedings.mlr.press/v100/yu20a.html</ext-link> (Accessed 20 September 2024)</mixed-citation></ref>
<ref id="ref58"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhan</surname><given-names>Y.</given-names></name> <name><surname>Ammar</surname><given-names>H. B.</given-names></name> <name><surname>Taylor</surname><given-names>M. E.</given-names></name></person-group> (<year>2017</year>). <article-title>Scalable lifelong reinforcement learning</article-title>. <source>Pattern Recogn.</source> <volume>72</volume>, <fpage>407</fpage>&#x2013;<lpage>418</lpage>.</mixed-citation></ref>
<ref id="ref59"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>J.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Hu</surname><given-names>H.</given-names></name> <name><surname>Chen</surname><given-names>T.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name> <name><surname>Fan</surname><given-names>C.</given-names></name> <name><surname>Zhang</surname><given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>MetaCURE: meta reinforcement learning with empowerment-driven exploration</article-title>. <comment>Available from:</comment> <ext-link xlink:href="https://proceedings.mlr.press/v139/zhang21w.html" ext-link-type="uri">https://proceedings.mlr.press/v139/zhang21w.html</ext-link> (Accessed 20 September 2024)</mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0005">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/218737/overview">Mohammad Akbari</ext-link>, Amirkabir University of Technology, Iran</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0006">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1800173/overview">Liu Xinlei</ext-link>, Information Engineering University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3061236/overview">Khedija Arour</ext-link>, Jeddah University, Saudi Arabia</p>
</fn>
</fn-group>
<fn-group>
<fn id="fn0001"><label>1</label><p><ext-link xlink:href="http://robust-ml.org" ext-link-type="uri">robust-ml.org</ext-link></p></fn>
<fn id="fn0002"><label>2</label><p><ext-link xlink:href="https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/applications/MobileNetV2" ext-link-type="uri">https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/applications/MobileNetV2</ext-link></p></fn>
<fn id="fn0003"><label>3</label><p><ext-link xlink:href="https://image-net.org/" ext-link-type="uri">https://image-net.org/</ext-link></p></fn>
</fn-group>
</back>
</article>