<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" 'JATS-journalpublishing1-3-mathml3.dtd'>
<article article-type="research-article" dtd-version="1.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1652050</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2025.1652050</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Trustworthy navigation with variational policy in deep reinforcement learning</article-title>
<alt-title alt-title-type="left-running-head">Bockrath et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2025.1652050">10.3389/frobt.2025.1652050</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Bockrath</surname>
<given-names>Karla</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3171828"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ernst</surname>
<given-names>Liam</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Nadeem</surname>
<given-names>Rohaan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3171972"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pedraza</surname>
<given-names>Bryan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Dera</surname>
<given-names>Dimah</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1894933"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing&#x2013;original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Chester F. Carlson Center for Imaging Science, Rochester Institute of Technology</institution>, <city>Rochester</city>, <state>NY</state>, <country country="US">United States</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Department of Electrical and Computer Engineering, The University of Texas Rio Grande Valley</institution>, <city>Edinburg</city>, <state>TX</state>, <country country="US">United States</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Dimah Dera, <email xlink:href="mailto:dimah.dera@rit.edu">dimah.dera@rit.edu</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-10-08">
<day>08</day>
<month>10</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1652050</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>19</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>09</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Bockrath, Ernst, Nadeem, Pedraza and Dera.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Bockrath, Ernst, Nadeem, Pedraza and Dera</copyright-holder>
<license>
<ali:license_ref start_date="2025-10-08">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Developing a reliable and trustworthy navigation policy in deep reinforcement learning (DRL) for mobile robots is extremely challenging, particularly in real-world, highly dynamic environments. In particular, exploring and navigating unknown environments without prior knowledge, while avoiding obstacles and collisions, is very cumbersome for mobile robots.</p>
</sec>
<sec>
<title>Methods</title>
<p>This study introduces a novel trustworthy navigation framework that utilizes variational policy learning to quantify uncertainty in the estimation of the robot&#x2019;s action, localization, and map representation. Trust-Nav employs the Bayesian variational approximation of the posterior distribution over the policy-based neural network&#x2019;s parameters. Policy-based and value-based learning are combined to guide the robot&#x2019;s actions in unknown environments. We derive the propagation of variational moments through all layers of the policy network and employ a first-order approximation for the nonlinear activation functions. The uncertainty in robot action is measured by the propagated variational covariance in the DRL policy network. At the same time, the uncertainty in the robot&#x2019;s localization and mapping is embedded in the reward function and stems from the traditional Theory of Optimal Experimental Design. The total loss function optimizes the parameters of the policy and value networks to maximize the robot&#x2019;s cumulative reward in an unknown environment.</p>
</sec>
<sec>
<title>Results</title>
<p>Experiments conducted using the Gazebo robotics simulator demonstrate the superior performance of the proposed Trust-Nav model in achieving robust autonomous navigation and mapping.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Trust-Nav consistently outperforms deterministic DRL approaches, particularly in complicated environments involving noisy conditions and adversarial attacks. This integration of uncertainty into the policy network promotes safer and more reliable navigation, especially in complex or unpredictable environments. Trust-Nav offers a step toward deployable, self-aware robotic systems capable of recognizing and responding to their own limitations.</p>
</sec>
</abstract>
<kwd-group>
<kwd>deep reinforcement learning</kwd>
<kwd>robot uncertainty</kwd>
<kwd>trustworthy navigation</kwd>
<kwd>variational policy</kwd>
<kwd>moment propagation</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by the US National Science Foundation Award CRII &#x23; 2401828.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="4"/>
<equation-count count="20"/>
<ref-count count="48"/>
<page-count count="14"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Robot Learning and Evolution</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Autonomous mobile robots are designed to execute complex tasks, navigate, and interact with unknown real-world environments. However, the challenges posed by the dynamic nature of the real world introduce a spectrum of obstacles that require innovative solutions (<xref ref-type="bibr" rid="B44">Wong et al., 2018</xref>; <xref ref-type="bibr" rid="B6">Carter-Templeton et al., 2018</xref>; <xref ref-type="bibr" rid="B19">Liaqat et al., 2019</xref>; <xref ref-type="bibr" rid="B2">Alatise and Hancke, 2020</xref>; <xref ref-type="bibr" rid="B27">Nam and Gon-Woo, 2021</xref>; <xref ref-type="bibr" rid="B28">Niloy et al., 2021</xref>; <xref ref-type="bibr" rid="B15">Gupta and Fernando, 2022</xref>; <xref ref-type="bibr" rid="B43">Wijayathunga et al., 2023</xref>). From surviving unpredictable barriers to responding to noisy or attacked environmental conditions, these challenges underscore the complexity of achieving autonomy in mobile robotic systems.</p>
<p>Deep reinforcement learning (DRL), rooted in the synergy of deep neural networks (DNNs) and reinforcement learning (RL) principles, has emerged as a powerful paradigm to endow autonomous robotic systems with adaptive and intelligent navigation and decision-making capabilities (<xref ref-type="bibr" rid="B24">Mnih et al., 2015</xref>; <xref ref-type="bibr" rid="B41">Wang et al., 2016</xref>; <xref ref-type="bibr" rid="B14">Gu et al., 2017</xref>; <xref ref-type="bibr" rid="B46">Zambaldi et al., 2018</xref>; <xref ref-type="bibr" rid="B21">Liu R. et al., 2021</xref>; <xref ref-type="bibr" rid="B31">Plaat, 2022</xref>). DRL offers a promising avenue for imbuing robots with the capability to learn and optimize behaviors autonomously with considerable success across various research domains, including navigation and mapping, as a particularly noteworthy area of exploration (<xref ref-type="bibr" rid="B1">Ahmed et al., 2023</xref>; <xref ref-type="bibr" rid="B34">Placed et al., 2023</xref>). Autonomous navigation encompasses a suite of methodologies wherein a mobile robot not only localizes itself but also concurrently traverses and maps an unfamiliar environment. This dynamic field within RL demonstrates the potential for robotic systems to autonomously navigate and explore unknown spaces while simultaneously building a coherent map of their surroundings. The latter process is known as active simultaneous localization and mapping (SLAM) (<xref ref-type="bibr" rid="B18">Leung et al., 2008</xref>; <xref ref-type="bibr" rid="B39">Trivun et al., 2015</xref>; <xref ref-type="bibr" rid="B29">Palomeras et al., 2019</xref>; <xref ref-type="bibr" rid="B7">Chen et al., 2020</xref>; <xref ref-type="bibr" rid="B23">Mih&#xe1;lik et al., 2022</xref>; <xref ref-type="bibr" rid="B1">Ahmed et al., 2023</xref>; <xref ref-type="bibr" rid="B34">Placed et al., 2023</xref>).</p>
<p>This paper proposes a novel trustworthy navigation (Trust-Nav) framework that adopts DRL and develops a variational policy learning paradigm. The variational policy consists of a Bayesian policy neural network, where we define a prior distribution over the parameters of the policy network. When the robot receives observations from the environment, the distribution over the parameters is updated to the posterior distribution using Bayes&#x2019; rule. However, computing the exact posterior is often intractable due to the complexity and high dimensionality of neural networks. We approximate the posterior distribution of the policy network&#x2019;s parameters using variational inference (<xref ref-type="bibr" rid="B4">Blei et al., 2017</xref>). The variational inference framework addresses this difficulty by approximating posterior estimation as an optimization problem, where a simpler distribution (i.e., Gaussian) is optimized to closely match the true posterior. To complete the Bayesian network structure, we propagate the moments of the Gaussian variational posterior through the network layers and estimate the mean and covariance of the predicted robot&#x2019;s actions of the policy network. The propagated covariance represents the uncertainty associated with the action and is used in the loss function to inform the decision. Moreover, Trust-Nav also computes uncertainty in the robot&#x2019;s localization and mapping using the D-optimal method (<xref ref-type="bibr" rid="B36">Rodr&#xed;guez-Ar&#xe9;valo et al., 2018</xref>; <xref ref-type="bibr" rid="B33">Placed and Castellanos, 2022</xref>) that captures the global variance of the map by analyzing the total length of the covariance of the state vectors. The proposed framework can be applied to various DRL algorithms and produces improved robustness in autonomous robot navigation, especially in noisy environments. The main contributions can be summarized as follows.<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Develop a novel DRL-based trustworthy, reliable, and collision-free autonomous navigation (Trust-Nav) framework that introduces closed-form variational moment propagation into DRL policy networks, and integrates statistical uncertainty in Bayesian theory to guide the robot&#x2019;s actions and mappings for trustworthy navigation.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Eliminate MC sampling to overcome robustness and scalability limitations of existing Bayesian DRL approaches, providing a tractable, analytically grounded framework that balances theoretical soundness with the computational constraints of embedded robotic systems.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Combine policy-based and value-based learning and quantify the uncertainty in the robot&#x2019;s actions and localizations to guide the navigation toward maximizing cumulative reward.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Design a Bayesian policy neural network that propagates the mean and covariance of the variational posterior distribution and produces robot actions to the environment and associated uncertainty within each action to guide the robot&#x2019;s decision-making process.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Adopt a reward function that accounts for the robot&#x2019;s localization uncertainty. Both action and localization/mapping uncertainties are combined into a unified loss function to maximize the cumulative reward.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Assess the performance and robustness of the Trust-Nav model in various noisy and adversarially attacked environments using the Gazebo robotics simulator.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Literature review</title>
<sec id="s2-1">
<label>2.1</label>
<title>Deep reinforcement learning for navigation</title>
<p>Deep Reinforcement Learning (DRL) enables an autonomous robot to learn optimal behaviors through trial-and-error interactions with its environment. In the context of navigation and exploration, the robot&#x2014;equipped with sensors such as LiDAR and/or cameras&#x2014;learns to perceive, explore, and map previously unknown environments by leveraging action&#x2013;feedback loops to iteratively refine its policy (<xref ref-type="bibr" rid="B24">Mnih et al., 2015</xref>; <xref ref-type="bibr" rid="B26">Morales et al., 2021</xref>; <xref ref-type="bibr" rid="B31">Plaat, 2022</xref>).</p>
<p>The robot refines its behavior by receiving rewards or penalties based on the outcomes of its actions, as specified by a developer-defined reward function. Although the reward signal provides some supervision, as it guides the robot toward optimal actions, the robot primarily learns through its own interactions with the environment, making DRL a form of semi-supervised learning. This framework is particularly effective in complex environments characterized by high-dimensional state and action spaces. For example, in tasks such as playing chess, the robot must reason over an enormous decision space to win. To manage such complexity, DRL integrates deep neural networks, which enable the robot to approximate complex, non-linear functions and make decisions in high-dimensional environments. This learning paradigm closely resembles human learning through trial and error, as observed in the process of mastering strategic games such as checkers or chess.</p>
<p>A variety of DRL architectures have been applied to robotics navigation, including value-based methods such as Q-learning (<xref ref-type="bibr" rid="B16">Jang et al., 2019</xref>) and Deep Q-Networks (DQN) (<xref ref-type="bibr" rid="B24">Mnih et al., 2015</xref>), as well as their enhancements&#x2014;double DQN (<xref ref-type="bibr" rid="B40">Van Hasselt et al., 2016</xref>) and dueling architectures (<xref ref-type="bibr" rid="B41">Wang et al., 2016</xref>). While these approaches perform well in discrete action spaces, robotics often requires continuous control of motion parameters such as linear and angular velocities. Policy gradient methods, particularly the Advantage Actor&#x2013;Critic (A2C) framework (<xref ref-type="bibr" rid="B13">Grondman et al., 2012</xref>; <xref ref-type="bibr" rid="B25">Mnih et al., 2016</xref>; <xref ref-type="bibr" rid="B12">Grigsby et al., 2021</xref>), address this by decoupling policy learning (actor) from value estimation (critic), enabling better action prediction in continuous or mixed action spaces.</p>
<p>Despite these advances, current DRL navigation frameworks remain limited in their ability to operate reliably in real-world conditions where sensor noise, environmental uncertainty, and adversarial disturbances are prevalent. Recent work in robust reinforcement learning has explored adversarial training (<xref ref-type="bibr" rid="B30">Pinto et al., 2017</xref>), distributional RL (<xref ref-type="bibr" rid="B20">Liu Q. et al., 2021</xref>; <xref ref-type="bibr" rid="B3">Bellemare et al., 2023</xref>), and domain randomization (<xref ref-type="bibr" rid="B38">Tobin et al., 2017</xref>) to improve robustness, while adaptive control theory (<xref ref-type="bibr" rid="B48">Zhou, 1998</xref>) provides decades of insight into stability under uncertainty. However, these strategies often lack explicit mechanisms for quantifying and propagating uncertainty in the decision-making process.</p>
<p>Bayesian neural networks (BNNs) offer a principled approach to uncertainty quantification by modeling distributions over network parameters (<xref ref-type="bibr" rid="B10">Gal and Ghahramani, 2016</xref>; <xref ref-type="bibr" rid="B17">Kendall and Gal, 2017</xref>; <xref ref-type="bibr" rid="B9">Feng et al., 2019</xref>). In robotics, BNNs have been applied to perception (<xref ref-type="bibr" rid="B8">Dera et al., 2021</xref>) and control (<xref ref-type="bibr" rid="B42">Wang et al., 2024</xref>), demonstrating improved robustness to noisy inputs. Yet, integrating BNNs directly into DRL navigation pipelines remains underexplored. Most uncertainty-aware navigation methods either rely on sampling-based approximations or heuristic measures of prediction confidence, which can be computationally costly or unreliable in safety-critical scenarios.</p>
<p>Our proposed Trust-Nav framework addresses this gap by analytically propagating both the mean and covariance of the variational posterior through the policy network, enabling real-time, self-assessed navigation without additional sampling or computation. This design allows the robot to detect low-confidence decision states and adapt its behavior accordingly, bridging Bayesian uncertainty modeling with DRL and drawing conceptual parallels to robust and adaptive control principles.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Reward computation for navigation</title>
<p>An important component of autonomous exploration is the computation of rewards that guide the robot from its current position toward informative future locations. Prior work has shown that reward design can be grounded in the uncertainty of the robot&#x2019;s pose and the environment map, encouraging actions that reduce this uncertainty (<xref ref-type="bibr" rid="B5">Carrillo et al., 2012</xref>; <xref ref-type="bibr" rid="B36">Rodr&#xed;guez-Ar&#xe9;valo et al., 2018</xref>). Many of these methods are rooted in the Theory of Optimal Experimental Design (TOED) (<xref ref-type="bibr" rid="B35">Pukelsheim, 2006</xref>), which provides optimality criteria for selecting actions that maximize the information gained from new observations.</p>
<p>The research community has explored several TOED criteria, including T-optimality, A-optimality, D-optimality, E-optimality, and Shannon&#x2019;s entropy (<xref ref-type="bibr" rid="B5">Carrillo et al., 2012</xref>; <xref ref-type="bibr" rid="B36">Rodr&#xed;guez-Ar&#xe9;valo et al., 2018</xref>; <xref ref-type="bibr" rid="B33">Placed and Castellanos, 2022</xref>; <xref ref-type="bibr" rid="B32">Placed and Castellanos, 2020</xref>), each emphasizing different statistical properties of the state covariance matrix to infer the uncertainty in the robot&#x2019;s localization and mapping. For example, A-optimality minimizes the trace of the covariance (average variance), while E-optimality minimizes the maximum eigenvalue (worst-case variance). D-optimality, in contrast, maximizes the determinant of the information matrix (or equivalently, minimizes the volume of the confidence ellipsoid), thereby capturing global variance reduction across all state dimensions. This property makes D-optimality well-suited for active SLAM and exploration, where the objective is to efficiently reduce uncertainty throughout the map rather than along a single dimension.</p>
<p>In this paper, we adopt the D-optimal method as the most effective because of its ability to integrate information from all map landmarks (the global variance of the map), represented by the eigenvalues <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the state covariance matrix <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the state vector. The D-optimal criterion <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined in (<xref ref-type="disp-formula" rid="e1">Equation 1</xref>).<disp-formula id="e1">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x225c;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The D-optimal function has been shown in prior robotics literature (<xref ref-type="bibr" rid="B5">Carrillo et al., 2012</xref>; <xref ref-type="bibr" rid="B36">Rodr&#xed;guez-Ar&#xe9;valo et al., 2018</xref>; <xref ref-type="bibr" rid="B33">Placed and Castellanos, 2022</xref>; <xref ref-type="bibr" rid="B32">Placed and Castellanos, 2020</xref>) to yield more balanced exploration trajectories compared to alternative criteria. The logarithmic formulation prevents convergence to zero, ensuring numerical stability while providing a robust measure of global uncertainty for navigation, exploration, and mapping. The robot communicates back and forth with the environment to help create the map and positions using measurements from LiDAR or camera through the ROS framework (<xref ref-type="bibr" rid="B22">Macenski et al., 2022</xref>).</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Trust-navigation with variational policy</title>
<p>The proposed Trust-Nav adopts the policy-value DRL algorithm with deep neural networks to define the robot. The robot was equipped with a depth camera and LiDAR. The camera&#x2019;s depth images or frames serve as inputs to the DRL neural networks, which determine the best action based on the environment&#x2019;s state. While LiDAR could be used for input, the camera was found to be more suitable for object detection and avoidance along the robot&#x2019;s trajectory path.</p>
<p>To extract useful information from images or frames, we deploy two convolutional neural networks (CNNs) for the policy and value functions, respectively. The policy CNN takes the environment states as input and produces probabilistic actions. At the same time, the value function determines the expected return for a robot starting at a given state and acting according to a particular policy. The two networks interact with each other through the temporal difference (TD) learning method, where the policy network makes an action, and the value network returns a value to penalize incorrect actions.</p>
<sec id="s3-1">
<label>3.1</label>
<title>Variational policy network</title>
<p>We develop the policy as a Bayesian CNN with <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> layers, and the probabilistic network parameters are <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the weight matrix for the <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>th</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> layer. The Bayesian CNN architecture follows (<xref ref-type="bibr" rid="B8">Dera et al., 2021</xref>). We introduce a prior Gaussian distribution over the network parameters, <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a hyperparameter that refers to the prior variance. The input-output dataset for the policy network consists of states from the environment and the robot&#x2019;s actions at time <inline-formula id="inf17">
<mml:math id="m18">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, i.e., <inline-formula id="inf18">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. Given the data and the prior, we approximate the posterior distribution of the parameters, i.e., <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> by the variational Gaussian distribution <inline-formula id="inf20">
<mml:math id="m21">
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The variational parameters <inline-formula id="inf21">
<mml:math id="m22">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with the mean, <inline-formula id="inf22">
<mml:math id="m23">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and covariance, <inline-formula id="inf23">
<mml:math id="m24">
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, are optimized by minimizing Kullback-Leibler (KL) divergence between the approximate and the true unknown posterior <inline-formula id="inf24">
<mml:math id="m25">
<mml:mrow>
<mml:mtext>KL</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> or equivalently maximizing the evidence lower bound (ELBO) loss function that converges to the optimal variational density (<xref ref-type="bibr" rid="B4">Blei et al., 2017</xref>). The ELBO loss is defined in (<xref ref-type="disp-formula" rid="e2">Equation 2</xref>).<disp-formula id="e2">
<mml:math id="m26">
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>KL</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>The ELBO loss function consists of two terms: (i) the expected log-likelihood of the robot&#x2019;s actions given the environment states and the probabilistic weights, and (ii) the regularization term, which is the KL divergence between the variational posterior and prior Gaussian distributions. The likelihood of the actions given the states, <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, is modeled by a Gaussian distribution with the action&#x2019;s mean, <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and covariance, <inline-formula id="inf27">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, predicted at the output of the variational policy network. We approximate the expectation over the variational posterior in the first term of the ELBO loss using the first-order Taylor approximation as defined in (<xref ref-type="disp-formula" rid="e3">Equation 3</xref>). The use of a first-order Taylor expansion is a deliberate choice to enable closed-form propagation of both the mean and covariance of the variational posterior through nonlinear activation functions. This choice allows us to model the full predictive distribution in an analytically tractable manner, entirely avoiding the need for Monte Carlo (MC) sampling. MC-based uncertainty estimation, while accurate in theory, is computationally expensive, introduces sampling noise, and scales poorly with deeper network architectures&#x2014;limitations that are particularly critical in real-time robotics. The first-order approach therefore strikes a balance between accuracy and scalability, making it feasible to propagate uncertainty through deeper policy networks and to support low-latency inference. Although the first-order approximations can accumulate error in deep networks, our empirical results demonstrate that even with this assumption, the proposed framework significantly improves both accuracy and robustness compared to deterministic baselines and sampling-based Bayesian DRL approaches. This finding supports the suitability of the first-order approach in practical, real-time navigation scenarios.</p>
<p>We assume that the probabilistic parameters of the policy network are independent within and across layers. This independence assumption is crucial for developing a feasible optimization problem in high-dimensional policy networks. Estimating and storing a full covariance matrix across all weights is not computationally or mathematically tractable for large-scale DRL models, where the parameter count can be in the millions. Furthermore, this independence assumption promotes the extraction of non-correlated, informative features and reduces redundancy, which is beneficial for both generalization and interpretability (<xref ref-type="bibr" rid="B45">Yang et al., 2008</xref>). Thus, the variational covariance of the weight vector <inline-formula id="inf28">
<mml:math id="m30">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> in the <inline-formula id="inf29">
<mml:math id="m31">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th layer can be written as <inline-formula id="inf30">
<mml:math id="m32">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf31">
<mml:math id="m33">
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is an identity matrix and <inline-formula id="inf32">
<mml:math id="m34">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is the learnable variance. The second term of the ELBO loss has a closed-form mathematical formulation and can be written as in <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, where <inline-formula id="inf33">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf34">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the number of neurons in the <inline-formula id="inf35">
<mml:math id="m37">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th and <inline-formula id="inf36">
<mml:math id="m38">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>th layers, respectively.<disp-formula id="e3">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2248;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m40">
<mml:mrow>
<mml:mtext>KL</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Thus, <inline-formula id="inf37">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the probabilistic action mean vector and covariance matrix, respectively, while <inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are the mean vector and covariance matrix of the variational posterior distribution over the policy neural network&#x2019;s parameters for the <inline-formula id="inf41">
<mml:math id="m45">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th weight vector, <inline-formula id="inf42">
<mml:math id="m46">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, in the <inline-formula id="inf43">
<mml:math id="m47">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th layer.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Policy variational moments propagation</title>
<p>In the proposed framework, the parameters of the policy network are modeled as probabilistic variables, specifically following Gaussian distributions. To accommodate this formulation, all network layers are redefined such that their computations operate on these probabilistic parameters. The variance values associated with the Gaussian posterior distributions capture the uncertainty in the model parameters. This parameter uncertainty propagates through the network layers, ultimately enabling the estimation of uncertainty in the robot&#x2019;s actions at the output of the policy network. Although the network parameters are assumed to be independent across layers, the output of each layer exhibits non-trivial correlations due to the transformations applied during forward propagation. Thus, the covariance over the output of every layer exists through the mathematical derivation. To quantify the uncertainty at each stage of the network&#x2014;including convolutional layers, multi-layer perceptrons, and non-linear activation functions&#x2014;we derive the output distributions using statistical properties of random variable transformations and the first-order (e.g., Taylor series) approximation.</p>
<p>The convolution and fully connected layers can be expressed as a multiplication between the input matrix <inline-formula id="inf44">
<mml:math id="m48">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the weight matrix <inline-formula id="inf45">
<mml:math id="m49">
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, i.e., <inline-formula id="inf46">
<mml:math id="m50">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The input matrix <inline-formula id="inf47">
<mml:math id="m51">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> has <inline-formula id="inf48">
<mml:math id="m52">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> probabilistic feature vectors (random vectors) as rows <inline-formula id="inf49">
<mml:math id="m53">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf50">
<mml:math id="m54">
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> has <inline-formula id="inf51">
<mml:math id="m55">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> probabilistic weight vectors as columns <inline-formula id="inf52">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The mean matrix of the input feature vectors, where the means of the feature vectors are arranged in the matrix&#x2019;s rows, is given in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>.<disp-formula id="e5">
<mml:math id="m57">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mo>&#x22ee;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the mean of the <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th row vector. The covariance matrix associated with each <inline-formula id="inf55">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, denoted by <inline-formula id="inf56">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is defined as <inline-formula id="inf57">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Similarly, every column of the matrix <inline-formula id="inf58">
<mml:math id="m63">
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is <inline-formula id="inf59">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the mean matrix of the weights, which is the matrix of the mean vectors arranged in its columns, is defined as <inline-formula id="inf60">
<mml:math id="m65">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x22ef;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula id="inf61">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and the covariance matrix <inline-formula id="inf62">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf63">
<mml:math id="m68">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mspace width="0.17em"/>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>To simplify notation for the covariance derivation, we vectorize the output matrix <inline-formula id="inf64">
<mml:math id="m69">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> into a single column vector: <inline-formula id="inf65">
<mml:math id="m70">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mspace width="0.17em"/>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mspace width="0.17em"/>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf66">
<mml:math id="m71">
<mml:mrow>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the row-wise vectorization operation and the <inline-formula id="inf67">
<mml:math id="m72">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>-th element of <inline-formula id="inf68">
<mml:math id="m73">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (row <inline-formula id="inf69">
<mml:math id="m74">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, column <inline-formula id="inf70">
<mml:math id="m75">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) appears in <inline-formula id="inf71">
<mml:math id="m76">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> at position <inline-formula id="inf72">
<mml:math id="m77">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>j</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>This ordering ensures that the indices <inline-formula id="inf73">
<mml:math id="m78">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (input row) and <inline-formula id="inf74">
<mml:math id="m79">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (weight column vector) are explicitly preserved, so that the mean and covariance for each element of <inline-formula id="inf75">
<mml:math id="m80">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can be expressed in terms of the corresponding <inline-formula id="inf76">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf77">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Thus, the mean and covariance of the output vector <inline-formula id="inf78">
<mml:math id="m83">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are derived in <xref ref-type="disp-formula" rid="e6">Equation 6</xref> following the multiplication between two random vectors. <inline-formula id="inf79">
<mml:math id="m84">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the matrix trace.<disp-formula id="e6">
<mml:math id="m85">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mtext>Var</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mtext>Cov</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>q</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>where the index mapping <inline-formula id="inf80">
<mml:math id="m86">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2194;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf81">
<mml:math id="m87">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&#x2194;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> follows the vectorization ordering above. Under the independence assumption between <inline-formula id="inf82">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf83">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the variance and covariance terms are simplified in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>.<disp-formula id="e7">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mtext>Var</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mtext>Cov</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>q</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>where the indices <inline-formula id="inf84">
<mml:math id="m91">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf85">
<mml:math id="m92">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> provide the variance components of the matrix <inline-formula id="inf86">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the indices <inline-formula id="inf87">
<mml:math id="m94">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf88">
<mml:math id="m95">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> provide the covariance components. <xref ref-type="fig" rid="F1">Figure 1</xref> illustrates the vectorization process in the covariance propagation derivation.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Illustration of the vectorization process in the covariance propagation derivation. The input matrix <inline-formula id="inf89">
<mml:math id="m96">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> multiplies the weight matrix <inline-formula id="inf90">
<mml:math id="m97">
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> to yield <inline-formula id="inf91">
<mml:math id="m98">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mi mathvariant="bold">W</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where each entry is <inline-formula id="inf92">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The vectorization <inline-formula id="inf93">
<mml:math id="m100">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> stacks row vectors of <inline-formula id="inf94">
<mml:math id="m101">
<mml:mrow>
<mml:mi mathvariant="bold">Z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, so <inline-formula id="inf95">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> maps to position <inline-formula id="inf96">
<mml:math id="m103">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>j</mml:mi>
</mml:math>
</inline-formula> in <inline-formula id="inf97">
<mml:math id="m104">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This explicit index mapping preserves the correspondence between each scalar and its originating pair <inline-formula id="inf98">
<mml:math id="m105">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, enabling consistent computation of means and covariances.</p>
</caption>
<graphic xlink:href="frobt-12-1652050-g001.tif">
<alt-text content-type="machine-generated">Matrix multiplication visualization involving three matrices. A \(3 \times 3\) blue matrix \(X\) multiplies a \(3 \times 2\) pink matrix \(W\) to yield a \(3 \times 2\) green matrix \(Z\). The elements of \(Z\) are subsequently vectorized into a single column vector labeled \(\text{vec}(Z)\), with elements \(Z_1\) to \(Z_6\).</alt-text>
</graphic>
</fig>
<p>The mean and covariance at the output of the activation function, <inline-formula id="inf99">
<mml:math id="m106">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, are derived using the first-order Taylor approximation as in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>.<disp-formula id="e8">
<mml:math id="m107">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2248;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>;</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2248;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf100">
<mml:math id="m108">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the Jacobian matrix of the activation function <inline-formula id="inf101">
<mml:math id="m109">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> with respect to the input vector <inline-formula id="inf102">
<mml:math id="m110">
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, evaluated at the mean <inline-formula id="inf103">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Value network and reward design</title>
<p>We define the value function as a CNN that takes the state and reward from the environment as inputs and produces the value estimate that penalizes the robot&#x2019;s incorrect actions. The parameters of the value network are <inline-formula id="inf104">
<mml:math id="m112">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf105">
<mml:math id="m113">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the weight matrices for <inline-formula id="inf106">
<mml:math id="m114">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> layers. The critic or penalty value estimates, <inline-formula id="inf107">
<mml:math id="m115">
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, serve as a baseline for the policy network to update its parameters through the policy-gradient approach and back-propagation (<xref ref-type="bibr" rid="B37">Sewak, 2019</xref>). The temporal difference (TD) error, <inline-formula id="inf108">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, between the subsequent state-value estimates is computed using the instantaneous reward and discounted state value of the subsequent state as in (<xref ref-type="disp-formula" rid="e9">Equation 9</xref>), where <inline-formula id="inf109">
<mml:math id="m117">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the discounting factor and <inline-formula id="inf110">
<mml:math id="m118">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the reward. <inline-formula id="inf111">
<mml:math id="m119">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in (<xref ref-type="disp-formula" rid="e9">Equation 9</xref>) represents one-step return updates, which can be expanded to a multi-step update. The value function in (<xref ref-type="disp-formula" rid="e9">Equation 9</xref>), <inline-formula id="inf112">
<mml:math id="m120">
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, is the state-value CNN estimator parametrized by weights or parameters <inline-formula id="inf113">
<mml:math id="m121">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.<disp-formula id="e9">
<mml:math id="m122">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>
<xref ref-type="fig" rid="F2">Figure 2</xref> illustrates the general structure of the proposed Trust-Nav framework, where the policy and value networks form a robot that interacts with the environment. The detailed interaction and optimization procedure is provided in Algorithm 1.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The proposed Trust-Nav framework with the variational policy and value networks forming a robot that interacts with the environment, and quantifies uncertainty in the robot&#x2019;s actions.</p>
</caption>
<graphic xlink:href="frobt-12-1652050-g002.tif">
<alt-text content-type="machine-generated">Diagram of a Trustworthy Navigation system using the ROS Gazebo Environment. It shows a flow of states, rewards, and actions involving a Variational Policy Network, TD Error, and a Value Network. The process involves calculating actions from states, generating rewards, and feedback loops via TD Error to optimize navigation. The system aims for efficient decision-making in navigation tasks.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Reward function</title>
<p>The reward function defined in <xref ref-type="disp-formula" rid="e10">Equation 10</xref> combines a standard non-collision mechanism, which imposes strong penalties on the robot for collisions or other undesirable behaviors, with an exploration bonus grounded in D-optimality from TOED. Collisions or unsafe actions incur a large negative reward, forward motion without collision receives the highest positive base reward, turning incurs a small base penalty, and exploration into unmapped areas receives an additional uncertainty-based reward.</p>
<p>To encourage informative exploration, we employ the D-optimality criterion from the Theory of Optimal Experimental Design (TOED). In TOED, D-optimality maximizes the determinant of the information matrix, which is equivalent to minimizing the volume of the pose-map confidence ellipsoid associated with the estimated parameters. In the context of active SLAM and exploration, this property directly translates to maximizing global information gain about the environment and reducing overall localization and mapping uncertainty. Compared to other TOED measures such as A-optimality (which minimizes the average variance) or E-optimality (which minimizes the maximum or worst-case variance), D-optimality captures global variance across all state dimensions and has been shown in prior work (<xref ref-type="bibr" rid="B5">Carrillo et al., 2012</xref>; <xref ref-type="bibr" rid="B36">Rodr&#xed;guez-Ar&#xe9;valo et al., 2018</xref>) to produce more balanced and efficient exploration trajectories in robotics.</p>
<p>The exploration reward is bounded using the hyperbolic tangent function, <inline-formula id="inf114">
<mml:math id="m123">
<mml:mrow>
<mml:mi>tanh</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, with scaling factor <inline-formula id="inf115">
<mml:math id="m124">
<mml:mrow>
<mml:mi>&#x3b6;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, to prevent extreme exploration values from dominating the fixed forward/turn rewards. This normalization strategy stabilizes learning and is consistent with reward-bounding methods used in reinforcement learning for navigation. This structured reward design encourages safe navigation while explicitly rewarding globally informative exploration, thus promoting efficient and robust policy learning. The reward function is defined in <xref ref-type="disp-formula" rid="e10">Equation 10</xref>.<disp-formula id="e10">
<mml:math id="m125">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;collision</mml:mtext>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b6;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;straight</mml:mtext>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b6;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;turning</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf116">
<mml:math id="m126">
<mml:mrow>
<mml:mi>&#x3b6;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a task-dependent scale factor and <inline-formula id="inf117">
<mml:math id="m127">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the D-optimality criterion. The D-optimal exploration reward is derived from TOED (<xref ref-type="bibr" rid="B33">Placed and Castellanos, 2022</xref>). This design of the reward components follows standard practices in reinforcement learning for navigation tasks, where the goal is to balance safety, efficiency, and exploration.<list list-type="simple">
<list-item>
<p>
<inline-formula id="inf118">
<mml:math id="m128">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Collision penalty (&#x2212;100): A large negative reward is assigned to collisions to strongly discourage unsafe behaviors. This magnitude is consistent with navigation benchmarks, where collisions must be treated as catastrophic outcomes relative to other objectives.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf119">
<mml:math id="m129">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Straight motion reward (&#x2b;1): A positive baseline reward is assigned to forward movement to encourage progress toward the goal and avoid oscillatory or stagnant behaviors.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf120">
<mml:math id="m130">
<mml:mrow>
<mml:mo>&#x2022;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Turning penalty (&#x2212;0.1): A small negative reward is assigned to turning to discourage excessive rotations without progress. The magnitude is modest to allow necessary turns when required, but still biases the policy toward efficient, goal-directed motion.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Learning objective, gradients, and relation to policy-gradient theory</title>
<p>Let <inline-formula id="inf121">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the marginal policy induced by the variational posterior over the policy network weights as in <xref ref-type="disp-formula" rid="e11">Equation 11</xref>.<disp-formula id="e11">
<mml:math id="m132">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x222b;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="bold-script">W</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf122">
<mml:math id="m133">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. With the log-derivative trick, the policy-gradient theorem writes the loss gradient as in <xref ref-type="disp-formula" rid="e12">Equation 12</xref>.<disp-formula id="e12">
<mml:math id="m134">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>J</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf123">
<mml:math id="m135">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is any unbiased advantage estimate. In our implementation, <inline-formula id="inf124">
<mml:math id="m136">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the TD error <inline-formula id="inf125">
<mml:math id="m137">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is a standard, low-variance advantage estimator.</p>
<p>Because the log function is concave, Jensen&#x2019;s inequality gives the lower bound in <xref ref-type="disp-formula" rid="e13">Equation 13</xref>.<disp-formula id="e13">
<mml:math id="m138">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2265;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:mtext>(ELBO&#x2009;on&#x2009;</mml:mtext>
<mml:mi>log</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>)</mml:mtext>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
<p>Maximizing <inline-formula id="inf126">
<mml:math id="m139">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> therefore maximizes a <italic>surrogate</italic> for <inline-formula id="inf127">
<mml:math id="m140">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Replacing <inline-formula id="inf128">
<mml:math id="m141">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with its ELBO inside the actor objective yields the standard advantage-weighted maximum-likelihood surrogate as in <xref ref-type="disp-formula" rid="e14">Equation 14</xref>.<disp-formula id="e14">
<mml:math id="m142">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Actor</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>ELBO&#x2009;</mml:mtext>
<mml:mo>-</mml:mo>
<mml:mtext>&#x2009;likelihood</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:mtext>KL</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>Bayesian&#x2009;Regularizer</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf129">
<mml:math id="m143">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is a regularization weight. This is directly analogous to REINFORCE/actor-critic with (i) an advantage weight <inline-formula id="inf130">
<mml:math id="m144">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and (ii) a Bayesian regularizer (akin to entropy/trust-region regularization).</p>
<p>The critic or value neural network is trained with the standard TD mean-squared error (MSE) as in <xref ref-type="disp-formula" rid="e15">Equation 15</xref>.<disp-formula id="e15">
<mml:math id="m145">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Critic</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>and is optimized independently of the actor&#x2019;s KL/ELBO terms (no gradients from the actor loss flow into <inline-formula id="inf131">
<mml:math id="m146">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>). We use the critic only to supply the advantage estimate <inline-formula id="inf132">
<mml:math id="m147">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for the actor update; as noted above, <inline-formula id="inf133">
<mml:math id="m148">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is detached when forming <inline-formula id="inf134">
<mml:math id="m149">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Actor</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to avoid biasing the critic.</p>
<p>Summary of the learning rule: We denote the expected log-likelihood by<disp-formula id="e16">
<mml:math id="m150">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2248;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf135">
<mml:math id="m151">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are obtained analytically via our moment-propagation (linear layers and first-order treatment of nonlinearities). The actor gradient is defined in <xref ref-type="disp-formula" rid="e17">Equation 17</xref>.<disp-formula id="e17">
<mml:math id="m152">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Actor</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
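<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>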
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>KL</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>The critic gradient is defined in <xref ref-type="disp-formula" rid="e18">Equation 18</xref>.<disp-formula id="e18">
<mml:math id="m153">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Critic</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
<mml:mtext>&#x2009;with&#x2009;&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;&#x2009;detached&#x2009;in&#x2009;actor&#x2009;updates</mml:mtext>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
</p>
<p>The interaction of the policy and value networks with the environment and the optimization through the TD error to maximize the cumulative reward is detailed in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>Trust-Nav: Advantage-weighted variational actor&#x2013;critic with closed-form moment propagation.<list list-type="simple">
<list-item>
<p>1: <bold>Inputs:</bold> Number of episodes <inline-formula id="inf136">
<mml:math id="m154">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, maximum number of steps per episode <inline-formula id="inf137">
<mml:math id="m155">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, learning rate <inline-formula id="inf138">
<mml:math id="m156">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, discount factor <inline-formula id="inf139">
<mml:math id="m157">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, KL weight <inline-formula id="inf140">
<mml:math id="m158">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the initial conditions for the learnable parameters <inline-formula id="inf141">
<mml:math id="m159">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf142">
<mml:math id="m160">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 2: <bold>Init</bold>: Variational policy: <inline-formula id="inf143">
<mml:math id="m161">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the value (critic) network parameters <inline-formula id="inf144">
<mml:math id="m162">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 3: <bold>for</bold> <inline-formula id="inf145">
<mml:math id="m163">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf146">
<mml:math id="m164">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p> 4:&#x2003;&#x2003;&#x2003;&#x2003; Reset the environment to get the initial state <inline-formula id="inf147">
<mml:math id="m165">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p> 5:&#x2003;&#x2003;&#x2003;&#x2003;<bold>for</bold> <inline-formula id="inf148">
<mml:math id="m166">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf149">
<mml:math id="m167">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p> 6:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Policy forward</bold>: propagate posterior means/covariances <inline-formula id="inf150">
<mml:math id="m168">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> through the policy network</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;to get <inline-formula id="inf151">
<mml:math id="m169">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 7:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Action selection</bold>: sample <inline-formula id="inf152">
<mml:math id="m170">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p> 8:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Execute</bold> <inline-formula id="inf153">
<mml:math id="m171">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and observe the reward <inline-formula id="inf154">
<mml:math id="m172">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and the new state <inline-formula id="inf155">
<mml:math id="m173">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p> 9:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Value forward</bold>: forward pass through the value network, and calculate <inline-formula id="inf156">
<mml:math id="m174">
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and critic TD</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;error <inline-formula id="inf157">
<mml:math id="m175">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p> 10:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Critic loss</bold>: <inline-formula id="inf158">
<mml:math id="m176">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Critic</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 11:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Actor loss</bold>: <inline-formula id="inf159">
<mml:math id="m177">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Actor</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<inline-formula id="inf959">
<mml:math id="m977">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-script">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mtext>KL</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 12:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Update the value (critic) network parameters</bold>: <inline-formula id="inf160">
<mml:math id="m178">
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="bold-script">U</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Critic</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-script">U</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 13:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<bold>Update the variational policy network parameters</bold>: <inline-formula id="inf161">
<mml:math id="m179">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>Actor</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p> 14:&#x2003;&#x2003;&#x2003;&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p> 15: <bold>end for</bold>
</p>
</list-item>
</list>
</p>
</statement>
</p>
<sec id="s4-1">
<label>4.1</label>
<title>Experimental set-up</title>
<p>In our experiments, we utilize OpenAI&#x2019;s Gym-Gazebo extension (<xref ref-type="bibr" rid="B47">Zamora et al., 2016</xref>), which leverages the Gazebo robotics simulator to provide a standardized and reproducible interface for reinforcement learning in robotic environments. The Gym-Gazebo extension library facilitates the creation of simulated environments where robotic agents are readily accessible and can be seamlessly integrated with machine learning architectures for both training and evaluation (<xref ref-type="bibr" rid="B47">Zamora et al., 2016</xref>). This simulator enables precise control of environmental conditions and sensor characteristics, which is essential for isolating and quantifying the effects of uncertainty modeling in our framework.</p>
<p>The proposed Trust-Nav model is deployed and evaluated using a simulated <italic>TurtleBot3</italic> robot and pre-configured environments provided by OpenAI&#x2019;s repositories. The action space consists of three discrete actions: move forward, turn left, and turn right, with fixed linear and angular velocities defined in the TurtleBot3 simulation. The state representation comprises processed 2D LiDAR scan data (360&#xb0; range readings) and robot pose estimates from ROS, all normalized to [0,1] for stable learning.</p>
<p>Experimental results are systematically documented and analyzed in comparison to a carefully selected baseline model&#x2014;Det-Nav&#x2014;which represents a deterministic navigation approach. Both Trust-Nav and Det-Nav share the same underlying network architecture; however, Det-Nav does not incorporate variational inference and instead relies on point estimates for action selection, omitting the propagation of uncertainty through the policy network. This comparison allows us to isolate and assess the impact of uncertainty modeling on decision-making, particularly in the presence of environmental noise or corruption, thereby validating the robustness and effectiveness of the proposed Trust-Nav approach. This controlled architectural parity allows us to isolate the contribution of uncertainty modeling to policy performance, avoiding confounding effects from differences in mapping, planning, or control modules.</p>
<p>Both Trust-Nav and Det-Nav models employ identical 10-layer convolutional neural network (CNN) architectures for both the policy and value networks. The architecture begins with three convolutional layers using 32 filters of size <inline-formula id="inf162">
<mml:math id="m180">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, followed by three layers with 64 filters of size <inline-formula id="inf163">
<mml:math id="m181">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. This is succeeded by three additional convolutional layers with 128 filters of size <inline-formula id="inf164">
<mml:math id="m182">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, which capture fine-grained spatial features. The final layer is a fully connected layer that produces the output corresponding to either the policy distribution or the value estimate, depending on the network&#x2019;s role. While both Trust-Nav and Det-Nav share identical network architectures and the same hyperparameter search protocol, the final learning rates differ due to independent tuning for stable convergence in each method. This approach avoids biasing the comparison by forcing identical learning rates despite differing optimization dynamics (variational inference in Trust-Nav vs. point estimates in Det-Nav). All hyperparameter values are provided in <xref ref-type="table" rid="T1">Table 1</xref> for full transparency.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Hyperparameters for the Trust-Nav and Det-Nav models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Hyperparameter</th>
<th align="center">Trust-Nav</th>
<th align="center">Det-Nav</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Learning rate <inline-formula id="inf165">
<mml:math id="m183">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.0002</td>
<td align="center">0.001</td>
</tr>
<tr>
<td align="left">Batch size</td>
<td align="center">16</td>
<td align="center">16</td>
</tr>
<tr>
<td align="left">Discount factor <inline-formula id="inf166">
<mml:math id="m184">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.95</td>
<td align="center">0.95</td>
</tr>
<tr>
<td align="left">Replay memory</td>
<td align="center">100,000</td>
<td align="center">100,000</td>
</tr>
<tr>
<td align="left">Episode size</td>
<td align="center">1,500 steps</td>
<td align="center">1,500 steps</td>
</tr>
<tr>
<td align="left">Total number of episodes</td>
<td align="center">200</td>
<td align="center">200</td>
</tr>
<tr>
<td align="left">Exploration decay rate</td>
<td align="center">0.999</td>
<td align="center">0.999</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Noise and disturbance analysis is conducted using realistically parameterized Gaussian noise and adversarial attacks (<xref ref-type="table" rid="T2">Table 2</xref>), with values chosen to reflect ranges reported for common mobile robot sensors such as LiDAR and RGB-D cameras.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Noise levels for random (Gaussian) noise and adversarial attacks.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Type of noise</th>
<th colspan="7" align="center">Levels of noise</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Random noise (std)</td>
<td align="center">0.0001</td>
<td align="center">0.001</td>
<td align="center">0.1</td>
<td align="center">0.2</td>
<td align="center">0.3</td>
<td align="center">0.4</td>
<td align="center">0.5</td>
</tr>
<tr>
<td align="left">Adversarial noise <inline-formula id="inf167">
<mml:math id="m185">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.0001</td>
<td align="center">0.001</td>
<td align="center">0.01</td>
<td align="center">0.05</td>
<td align="center">0.1</td>
<td align="center">&#x2013;</td>
<td align="center">&#x2013;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental pipeline is implemented using the Robot Operating System (ROS), which runs on a Linux-based system with computation accelerated by four NVIDIA Quadro RTX 6000 GPUs (24 GB memory each). Each policy is evaluated over 200 independent episodes per condition to ensure statistical reliability and consistency of results. To assess learning stability and navigation robustness, we track key performance metrics, including <italic>Moving Average Rewards, Maximum Rewards, and Cumulative Rewards</italic>. These metrics are summarized in <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref> and <xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The cumulative reward of the proposed Trust-Nav (blue curve) and Det-Nav (red curve) in a noise-free test environment (without adding noise).</p>
</caption>
<graphic xlink:href="frobt-12-1652050-g003.tif">
<alt-text content-type="machine-generated">Line graph showing cumulative rewards over episodes for two navigation systems: Trust-Nav (blue) and Det-Nav (red). Trust-Nav's performance stabilizes around a higher reward compared to Det-Nav, which fluctuates more widely. The x-axis represents episodes from 0 to 250, and the y-axis shows cumulative rewards from -2500 to 1000.</alt-text>
</graphic>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>The average reward values of Trust-Nav compared to Det-Nav, validated on Gazebo environments under various levels of Gaussian noise and adversarial attacks. <bold>(a)</bold> Both models are evaluated under Gaussian noise. <bold>(b)</bold> Trust-Nav is tested under Gaussian noise. <bold>(c)</bold> Det-Nav is tested under Gaussian noise. <bold>(d)</bold> Both models are evaluated under adversarial attacks. <bold>(e)</bold> Trust-Nav is tested under adversarial attacks. <bold>(f)</bold> Det-Nav is tested under adversarial attacks.</p>
</caption>
<graphic xlink:href="frobt-12-1652050-g004.tif">
<alt-text content-type="machine-generated">Six line graphs show the average reward over episodes for Trust-Nav and Det-Nav under Gaussian and adversarial conditions. Graphs (a), (b), and (c) present Gaussian results, with varying standard deviations. Graphs (d), (e), and (f) show adversarial results with different epsilon values. Trust-Nav and Det-Nav are distinguished by solid and dashed lines, respectively, with distinct colors representing different noise levels.</alt-text>
</graphic>
</fig>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Maximum and average reward values for the Trust-Nav and Det-Nav models in test environments corrupted with different levels of Gaussian noise (std).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Noise level (std)</th>
<th colspan="2" align="center">Trust-Nav</th>
<th colspan="2" align="center">Det-Nav</th>
</tr>
<tr>
<th align="center">Max reward</th>
<th align="center">Avg reward</th>
<th align="center">Max reward</th>
<th align="center">Avg reward</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">0.0001</td>
<td align="center">916.700 <inline-formula id="inf168">
<mml:math id="m186">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.31</td>
<td align="center">916.316 <inline-formula id="inf169">
<mml:math id="m187">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.23</td>
<td align="center">846.197 <inline-formula id="inf170">
<mml:math id="m188">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 4.56</td>
<td align="center">836.894 <inline-formula id="inf171">
<mml:math id="m189">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 4.98</td>
</tr>
<tr>
<td align="center">0.001</td>
<td align="center">915.423 <inline-formula id="inf172">
<mml:math id="m190">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.65</td>
<td align="center">915.313 <inline-formula id="inf173">
<mml:math id="m191">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.92</td>
<td align="center">838.760 <inline-formula id="inf174">
<mml:math id="m192">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 4.89</td>
<td align="center">832.734 <inline-formula id="inf175">
<mml:math id="m193">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5.87</td>
</tr>
<tr>
<td align="center">0.1</td>
<td align="center">914.288 <inline-formula id="inf176">
<mml:math id="m194">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.28</td>
<td align="center">913.596 <inline-formula id="inf177">
<mml:math id="m195">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.21</td>
<td align="center">832.301 <inline-formula id="inf178">
<mml:math id="m196">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5.69</td>
<td align="center">825.627 <inline-formula id="inf179">
<mml:math id="m197">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5.23</td>
</tr>
<tr>
<td align="center">0.2</td>
<td align="center">913.864 <inline-formula id="inf180">
<mml:math id="m198">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.91</td>
<td align="center">913.379 <inline-formula id="inf181">
<mml:math id="m199">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.29</td>
<td align="center">808.853 <inline-formula id="inf182">
<mml:math id="m200">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.98</td>
<td align="center">812.423 <inline-formula id="inf183">
<mml:math id="m201">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.94</td>
</tr>
<tr>
<td align="center">0.3</td>
<td align="center">912.926 <inline-formula id="inf184">
<mml:math id="m202">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3.17</td>
<td align="center">913.127 <inline-formula id="inf185">
<mml:math id="m203">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.54</td>
<td align="center">771.164 <inline-formula id="inf186">
<mml:math id="m204">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 8.79</td>
<td align="center">774.136 <inline-formula id="inf187">
<mml:math id="m205">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 7.49</td>
</tr>
<tr>
<td align="center">0.4</td>
<td align="center">909.846 <inline-formula id="inf188">
<mml:math id="m206">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.94</td>
<td align="center">912.864 <inline-formula id="inf189">
<mml:math id="m207">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3.62</td>
<td align="center">730.738 <inline-formula id="inf190">
<mml:math id="m208">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 8.96</td>
<td align="center">734.537 <inline-formula id="inf191">
<mml:math id="m209">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 8.89</td>
</tr>
<tr>
<td align="center">0.5</td>
<td align="center">909.011 <inline-formula id="inf192">
<mml:math id="m210">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3.52</td>
<td align="center">911.365 <inline-formula id="inf193">
<mml:math id="m211">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3.48</td>
<td align="center">683.200 <inline-formula id="inf194">
<mml:math id="m212">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 9.25</td>
<td align="center">682.930 <inline-formula id="inf195">
<mml:math id="m213">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 10.61</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Maximum and average reward values for the Trust-Nav and Det-Nav models in test environments corrupted with different levels of adversarial attacks <inline-formula id="inf196">
<mml:math id="m214">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Noise level <inline-formula id="inf197">
<mml:math id="m215">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="2" align="center">Trust-Nav</th>
<th colspan="2" align="center">Det-Nav</th>
</tr>
<tr>
<th align="center">Max reward</th>
<th align="center">Avg reward</th>
<th align="center">Max reward</th>
<th align="center">Avg reward</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">0.0001</td>
<td align="center">921.269 <inline-formula id="inf198">
<mml:math id="m216">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.12</td>
<td align="center">918.213 <inline-formula id="inf199">
<mml:math id="m217">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.97</td>
<td align="center">883.674 <inline-formula id="inf200">
<mml:math id="m218">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5.89</td>
<td align="center">878.989 <inline-formula id="inf201">
<mml:math id="m219">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5.51</td>
</tr>
<tr>
<td align="center">0.001</td>
<td align="center">918.518 <inline-formula id="inf202">
<mml:math id="m220">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.11</td>
<td align="center">917.156 <inline-formula id="inf203">
<mml:math id="m221">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.36</td>
<td align="center">877.642 <inline-formula id="inf204">
<mml:math id="m222">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5.88</td>
<td align="center">868.454 <inline-formula id="inf205">
<mml:math id="m223">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.54</td>
</tr>
<tr>
<td align="center">0.01</td>
<td align="center">917.844 <inline-formula id="inf206">
<mml:math id="m224">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 1.98</td>
<td align="center">916.469 <inline-formula id="inf207">
<mml:math id="m225">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.58</td>
<td align="center">873.430 <inline-formula id="inf208">
<mml:math id="m226">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.97</td>
<td align="center">863.324 <inline-formula id="inf209">
<mml:math id="m227">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.94</td>
</tr>
<tr>
<td align="center">0.05</td>
<td align="center">915.770 <inline-formula id="inf210">
<mml:math id="m228">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.56</td>
<td align="center">913.695 <inline-formula id="inf211">
<mml:math id="m229">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.49</td>
<td align="center">868.769 <inline-formula id="inf212">
<mml:math id="m230">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.82</td>
<td align="center">862.221 <inline-formula id="inf213">
<mml:math id="m231">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 6.99</td>
</tr>
<tr>
<td align="center">0.1</td>
<td align="center">886.264 <inline-formula id="inf214">
<mml:math id="m232">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.96</td>
<td align="center">880.293 <inline-formula id="inf215">
<mml:math id="m233">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3.12</td>
<td align="center">853.136 <inline-formula id="inf216">
<mml:math id="m234">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 9.83</td>
<td align="center">818.373 <inline-formula id="inf217">
<mml:math id="m235">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 8.85</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Robustness analysis under noisy conditions</title>
<p>We evaluate the robustness of the proposed Trust-Nav model against two well-defined disturbance types: additive Gaussian noise and adversarial attacks, comparing it to the Det-Nav model. The post-training robustness analysis is performed after the models are fully trained and validated in a simulated training environment, which ensures that the performance degradation can be attributed purely to test-time perturbations, without affecting the learned policy during training. We design the experiments so that the robot is first trained in a clean, noise-free environment and noise is introduced only afterwards, allowing us to assess the effect of noise on policy performance without influencing the learning process. Then, we incrementally introduce noise complexity in a test environment using random (Gaussian) noise and adversarial attacks to progressively degrade the robot&#x2019;s perception. First, we evaluate the performance of the proposed Trust-Nav model compared to the Det-Nav model in a clean test environment (without noise). Then, we gradually add various levels of Gaussian or adversarial noise to the test environment states to evaluate the performance of each model.</p>
<p>Gaussian noise is introduced in seven levels of increasing severity, defined by the standard deviation (std) parameter, which is chosen to align with empirical sensor noise characteristics documented in robotics literature. <xref ref-type="fig" rid="F5">Figure 5</xref> demonstrates the depth camera observations for robot navigation under increasing Gaussian noise levels, where higher standard deviations progressively degrade the visual quality of the input. As shown in the figure, higher noise levels progressively corrupt the sensor observations, making navigation more challenging. This experiment demonstrates how Trust-Nav adapts to noisy depth measurements by leveraging uncertainty propagation in its policy.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Depth camera observations under varying input noise levels used for robot navigation. The standard deviation (std) values (0.0, 0.1, 0.2, 0.3) represent increasing amounts of Gaussian perturbation added to the depth measurements. As noise grows, the sensor data becomes progressively more corrupted, highlighting the challenge of robust policy learning under uncertain perception.</p>
</caption>
<graphic xlink:href="frobt-12-1652050-g005.tif">
<alt-text content-type="machine-generated">Four heatmaps display increasing levels of noise from left to right, labeled with standard deviations of 0.0, 0.1, 0.2, and 0.3. The top of each map shifts from green to yellow, while the lower areas vary from dark blue to purple, reflecting the noise level's impact on data clarity.</alt-text>
</graphic>
</fig>
<p>Adversarial examples are generated using the Fast Gradient Sign Method (FGSM) (<xref ref-type="bibr" rid="B11">Goodfellow et al., 2015</xref>), with five attack levels controlled by <inline-formula id="inf218">
<mml:math id="m236">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as in <xref ref-type="disp-formula" rid="e19">Equation 19</xref>. Both disturbance types are applied in the test environment only, preserving a clean training phase for fair assessment.<disp-formula id="e19">
<mml:math id="m237">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>adv</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>sign</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mtext>where&#x2009;</mml:mtext>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>adv</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf219">
<mml:math id="m238">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the ELBO likelihood of the policy network defined in <xref ref-type="disp-formula" rid="e16">Equation 16</xref>, and the networks&#x2019; parameters are frozen during attack generation. The normalization constraint ensures that adversarial states remain in the valid input range <inline-formula id="inf220">
<mml:math id="m239">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, i.e., <inline-formula id="inf221">
<mml:math id="m240">
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:mtext>&#x2009;sign</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. <xref ref-type="table" rid="T2">Table 2</xref> provides <inline-formula id="inf222">
<mml:math id="m241">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> values for the five levels of attacks applied to the test environment. Both Trust-Nav and Det-Nav models are validated under all noise levels, with each level evaluated over 200 episodes per run and the results averaged to ensure consistency. The complete noise/attack specifications are given in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Robot uncertainty vs. signal-to-noise ratio</title>
<p>The proposed Trust-Nav framework develops a robot that produces actions and uncertainty information simultaneously in the form of the actions&#x2019; distribution mean and variance-covariance matrix. The analysis of uncertainty under noisy conditions (when the environment is corrupted by Gaussian noise or adversarial attacks) provides insights into the navigation performance after deployment and possible detection of the robot&#x2019;s failure due to environment complexity. We analyze the predictive variance of actions at various levels of Gaussian noise and adversarial attacks. The amount of noise at each level is measured using the signal-to-noise ratio (SNR). For adversarial attacks, the signal is the clean input state <inline-formula id="inf223">
<mml:math id="m242">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the noise is the perturbation vector <inline-formula id="inf224">
<mml:math id="m243">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>sign</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> applied by FGSM. The SNR is typically defined in decibels (dB) as in <xref ref-type="disp-formula" rid="e20">Equation 20</xref>.<disp-formula id="e20">
<mml:math id="m244">
<mml:mrow>
<mml:mtext>SNR</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
<mml:mo>&#x2062;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>log</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>sign</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2113;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>
</p>
<p>The average action variance is calculated for all the test frames at each noise level. We scale the action variance curves from zero by subtracting the variance at the baseline (clean test environment states without noise) at each level. The resulting average action variance is plotted against the respective SNR values to produce <italic>variance-vs-SNR</italic> curves (<xref ref-type="fig" rid="F6">Figure 6</xref>), which are interpreted from right to left. The variance values at the extreme right side of the graph correspond to very high SNR (low noise levels). The addition of noise results in a decrease in the SNR values, progressing from right to left. The extreme left point represents the average variance at the lowest SNR (i.e., the highest levels of noise).</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Relationship between signal-to-noise ratio (SNR) and <bold>(a)</bold> average action variance, <bold>(b)</bold> maximum episode reward, and <bold>(c)</bold> average episode reward for Trust-Nav and Det-Nav under Gaussian noise and adversarial perturbations. Blue curves correspond to Trust-Nav and red curves to Det-Nav; solid lines indicate adversarial attacks, and dashed lines indicate Gaussian noise. Higher action variance at low SNR reflects increased navigation uncertainty, with Trust-Nav showing a statistically significant increase in variance compared to noise-free conditions (Wilcoxon signed-rank test, <inline-formula id="inf225">
<mml:math id="m245">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Trust-Nav consistently maintains higher maximum and average rewards across all noise conditions, demonstrating robustness to both Gaussian and adversarial perturbations as compared to the Det-Nav baseline. The star marker denotes the point of statistically significant variance increase.</p>
</caption>
<graphic xlink:href="frobt-12-1652050-g006.tif">
<alt-text content-type="machine-generated">Three line graphs compare the performance of different navigation strategies. (a) Shows action variance versus SNR, demonstrating a decrease in variance as SNR increases. (b) Displays maximum reward versus SNR, where maximum reward stabilizes with higher SNR for all strategies. (c) Depicts average reward versus SNR, indicating an upward trend with increasing SNR. Blue lines represent Trust-Nav strategies, while red lines represent Det-Nav strategies, each with solid for adversarial and dashed for Gaussian conditions.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="results|discussion" id="s5">
<label>5</label>
<title>Results and discussion</title>
<sec id="s5-1">
<label>5.1</label>
<title>Performance analysis and robustness</title>
<p>This section discusses the performance evaluation and the robustness behavior of the proposed Trust-Nav model compared to the baseline Det-Nav model. The average, cumulative, and maximum rewards serve as the performance metrics of the models in the simulated test environment. <xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the cumulative reward obtained by the proposed Trust-Nav (blue curve) and Det-Nav (red curve) in a noise-free test environment (without adding noise). Initially, both models yield low reward values; however, as training progresses over multiple episodes, the rewards steadily increase, indicating effective policy learning and successful maximization of the reward function.</p>
<p>
<xref ref-type="fig" rid="F4">Figure 4</xref> presents the average reward values for Trust-Nav and Det-Nav models in a simulated test environment under varying levels of Gaussian noise and adversarial attacks. Each curve represents the average reward obtained in a separate experiment corresponding to a specific noise level. <xref ref-type="fig" rid="F4">Figures 4b,c</xref> show the average rewards of Trust-Nav and Det-Nav, respectively, across Gaussian noise levels ranging from 0.0001 to 0.5. In <xref ref-type="fig" rid="F4">Figure 4a</xref>, the average rewards of both models are plotted together for direct comparison, with solid lines representing Trust-Nav and dashed lines representing Det-Nav. As expected, increasing the standard deviation of the injected Gaussian noise has a negative impact on performance for both models. However, Trust-Nav demonstrates greater robustness by consistently achieving higher average rewards&#x2014;approximately 900&#x2014;compared to Det-Nav, whose performance drops to around 675 under high noise conditions.</p>
<p>
<xref ref-type="fig" rid="F4">Figures 4e,f</xref> demonstrate the rewards achieved by Trust-Nav and Det-Nav models, respectively, when adversarial perturbations are introduced into the environment&#x2019;s state observations at varying levels of attack severity (<inline-formula id="inf226">
<mml:math id="m246">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0001</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf227">
<mml:math id="m247">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Each curve represents an experiment with a distinct attack severity. <xref ref-type="fig" rid="F4">Figure 4d</xref> provides a comparative view, plotting the reward trajectories of both models under all five levels of adversarial attacks. In this figure, solid lines represent Trust-Nav, while dashed lines represent Det-Nav. To ensure visual consistency and facilitate comparative analysis, the same color scheme is used across all subplots to indicate equivalent noise or attack severity levels.</p>
<p>As expected, the introduction of adversarial examples negatively impacts both models, with increasing attack strength leading to greater reward degradation. Nevertheless, Trust-Nav exhibits significantly more robust behavior under adversarial conditions. Its average reward remains relatively stable across all but the highest attack level, decreasing only slightly from approximately 920 to 880 when <inline-formula id="inf228">
<mml:math id="m248">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. In contrast, Det-Nav exhibits a more pronounced decline, with reward values decreasing from approximately 900 to 850 under the same conditions. These results highlight the enhanced robustness and reliability of Trust-Nav to adversarial perturbations in comparison to its deterministic counterpart.</p>
</sec>
<sec id="s5-2">
<label>5.2</label>
<title>Robot uncertainty analysis and self-assessment</title>
<p>We employ the action variance at the output of the variational policy network in the Trust-Nav model as a quantitative metric to evaluate the robot&#x2019;s navigation confidence (or uncertainty) without requiring any additional sensing, data processing or computational overhead. This property enables what we refer to as <italic>self-assessment</italic>, whereby the model internally gauges the trustworthiness of its own actions based on the magnitude of the output variance. Intuitively, higher action variance reflects increased uncertainty in navigation decisions, signaling low confidence in the robot&#x2019;s actions under challenging or degraded sensing conditions.</p>
<p>
<xref ref-type="fig" rid="F6">Figure 6</xref> illustrates the relationship between signal-to-noise ratio (SNR) and (a) average action variance, (b) maximum episode reward, and (c) average episode reward for both Trust-Nav and the deterministic baseline Det-Nav. Blue curves represent Trust-Nav and red curves represent Det-Nav, with solid lines denoting adversarial perturbations and dashed lines denoting Gaussian noise. The plots read from right to left, as lower SNR values correspond to higher noise levels.</p>
<p>Across all noise levels, Trust-Nav consistently outperforms Det-Nav in both maximum and average rewards. While both models experience declining performance at low SNR, Trust-Nav maintains significantly higher rewards, particularly under Gaussian noise, where the average reward decreases by only (<inline-formula id="inf229">
<mml:math id="m249">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.5%) compared to (<inline-formula id="inf230">
<mml:math id="m250">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 14%) for Det-Nav. Under adversarial perturbations, Trust-Nav experiences a larger drop (<inline-formula id="inf231">
<mml:math id="m251">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 8%) but still remains superior to Det-Nav&#x2019;s (<inline-formula id="inf232">
<mml:math id="m252">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 18%) reduction. Importantly, under low SNR (e.g., SNR <inline-formula id="inf233">
<mml:math id="m253">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> dB), the action variance of Trust-Nav increases sharply, indicating heightened uncertainty that correlates with performance degradation. This relationship is statistically significant according to a Wilcoxon signed-rank test <inline-formula id="inf234">
<mml:math id="m254">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> when comparing action variance at high versus low SNR, validating variance as a meaningful uncertainty indicator. We mark the point of statistically significant variance increase with a star in <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<p>The increase in variance concurrent with declining reward demonstrates that Trust-Nav is self-aware of deteriorating navigation performance. This self-assessment capability is a key step toward safe and reliable deployment in real-world robotics, where the ability to detect and respond to uncertain decision states is essential for preventing unsafe actions.</p>
</sec>
<sec id="s5-3">
<label>5.3</label>
<title>Discussion</title>
<p>This paper introduces a new deep reinforcement learning navigation (Trust-Nav) framework that propagates variational moments through the policy neural network and estimates the uncertainty in the robot&#x2019;s actions and localization. The variational policy network propagates the first two moments (mean and covariance) of the variational posterior distribution of the network&#x2019;s parameters and estimates the uncertainty in the robot&#x2019;s actions via the variance of the policy distribution. We conduct a comprehensive analysis using the Gazebo simulated environment under various noisy conditions. The performance of the Trust-Nav model is compared with the state-of-the-art DRL navigation networks under multiple levels of Gaussian noise and adversarial attacks, i.e., FGSM.</p>
<p>Our analysis reveals that the Trust-Nav model maintains its reward values and outperforms the corresponding deterministic DRL navigation model when the environment is subject to Gaussian noise or adversarial attacks. Furthermore, the robot&#x2019;s action variance significantly increases when the adversarial noise is high and the model&#x2019;s reward values start to decrease. The moments of the policy variational distribution transmit vital state features from the environment through the policy network to the action predictions. The second moment (i.e., the variance) of the variational distribution over the policy parameters filters the state features according to their importance. This policy filtering mechanism of the environmental dynamic features via the variance of the variational distribution forces the robot&#x2019;s action variance to increase when these features are corrupted with noise or adversarial attacks.</p>
<p>In addition to the quantitative results, we also observe qualitative behavioral patterns that reinforce the role of action variance as a self-assessment signal. For instance, under high-uncertainty zones corresponding to low-SNR adversarial conditions, the robot exhibits noticeably cautious navigation&#x2014;slowing down, hesitating before turns, and occasionally failing to commit to decisive maneuvers. These behaviors coincide with spikes in action variance, highlighting the model&#x2019;s internal recognition of unreliable decision states. Conversely, when operating in higher-SNR conditions, the variance remains low, and the robot navigates confidently, with smoother trajectories and fewer hesitations. This qualitative evidence illustrates how Trust-Nav&#x2019;s uncertainty-aware design enables the robot to adaptively signal and respond to reliability degradation, offering an interpretable connection between statistical variance and observable robot performance.</p>
</sec>
<sec id="s5-4">
<label>5.4</label>
<title>Deployment perspective and real-world applicability</title>
<p>Although our evaluation is conducted in simulation, the Trust-Nav framework is designed with deployment feasibility in mind. By explicitly propagating both the mean and variance of the variational posterior through the policy network, the approach enables the robot to self-assess the reliability of its actions in real time, without introducing additional computational burden or requiring external supervision. This self-assessment capability is particularly advantageous for physical deployment, as it allows the robot to identify low-confidence states and adapt its behavior accordingly, thus enhancing safety in uncertain or adversarial environments. Importantly, because the proposed method operates directly on the learned policy outputs, it is agnostic to the underlying robot platform and sensing configuration, which facilitates seamless transfer from simulation to hardware. This positions Trust-Nav as a practical framework for bridging robust uncertainty-aware navigation with real-world autonomous systems.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<label>6</label>
<title>Conclusion</title>
<p>We propose Trust-Nav, a deep reinforcement learning framework that incorporates uncertainty estimation via a variational policy network. The proposed Trust-Nav is built on fundamental principles of Bayesian density propagation in dynamical systems. By propagating moments of the variational policy network, Trust-Nav enables robust decision-making and provides a built-in measure of action confidence (or equivalently uncertainty). Experiments in simulated environments demonstrate that the Trust-Nav model consistently outperforms baseline models and remains robust under Gaussian noise and adversarial attacks. Trust-Nav models not only maintain higher rewards but also demonstrate reduced sensitivity to input corruption. When the reward values decrease due to the high level of adversarial attacks, the uncertainty associated with the robot&#x2019;s actions increases significantly to warn the robot of uncertain actions. This integration of uncertainty into the policy network promotes safer and more reliable navigation, especially in complex or unpredictable environments. Trust-Nav offers a step toward deployable, self-aware robotic systems capable of recognizing and responding to their own limitations.</p>
</sec>
<sec id="s7">
<label>7</label>
<title>Future work</title>
<p>While the present study introduces closed-form variational moment propagation within DRL policy networks&#x2014;offering a tractable and sampling-free approach to uncertainty estimation&#x2014;several extensions are envisioned to further enhance the framework&#x2019;s accuracy and applicability. First, the current formulation adopts an independence assumption for network parameters across and within layers to ensure scalability and real-time feasibility. In future work, we plan to investigate structured covariance approximations, such as Kronecker-factored or low-rank representations, to capture inter-parameter correlations while preserving computational efficiency. Second, our method currently employs a first-order Taylor approximation for nonlinear activation functions. Although this enables a closed-form, low-latency uncertainty propagation, we will explore the use of unscented transformations, which can approximate nonlinear mappings up to second-order accuracy, thereby reducing approximation error without resorting to Monte Carlo sampling. Finally, future studies will expand the evaluation to include real-world robotic platforms, additional noise models derived from real sensor data, and comparisons with other uncertainty-aware DRL approaches, further validating the robustness and generalizability of the proposed framework.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The datasets analyzed for this study are available in the gym-gazebo repository at <ext-link ext-link-type="uri" xlink:href="https://github.com/erlerobot/gym-gazebo">https://github.com/erlerobot/gym-gazebo</ext-link>. All source code, experiment configurations, and instructions to reproduce the results are publicly available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/dimahdera/Robust-Uncertainty-Estimation-Framework-in-Deep-Reinforcement-Learning-for-Active-SLAM.git">https://github.com/dimahdera/Robust-Uncertainty-Estimation-Framework-in-Deep-Reinforcement-Learning-for-Active-SLAM.git</ext-link>.
</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>KB: Data curation, Formal Analysis, Investigation, Software, Validation, Writing &#x2013; review and editing. LE: Data curation, Investigation, Software, Validation, Writing &#x2013; review and editing. RN: Data curation, Writing &#x2013; review and editing. BP: Conceptualization, Data curation, Formal Analysis, Investigation, Methodology, Software, Validation, Visualization, Writing &#x2013; review and editing. DD: Conceptualization, Data curation, Formal Analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s12">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2339658/overview">Xiaocong Li</ext-link>, Agency for Science, Technology and Research (A&#x2a;STAR), Singapore</p>
</fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1142145/overview">Mazin Al-saedi</ext-link>, Middle Technical University, Iraq</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3046118/overview">Fabio Valerio Buonomo</ext-link>, Sapienza University of Rome, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3077173/overview">Yinghao Jia</ext-link>, Harbin Institute of Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3141572/overview">Abedalmuhdi Almomany</ext-link>, Gulf University for Science and Technology, Kuwait</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmed</surname>
<given-names>M. F.</given-names>
</name>
<name>
<surname>Masood</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Fremont</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Fantoni</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Active SLAM: a review on last decade</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>8097</fpage>. <pub-id pub-id-type="doi">10.3390/s23198097</pub-id>
<pub-id pub-id-type="pmid">37836928</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alatise</surname>
<given-names>M. B.</given-names>
</name>
<name>
<surname>Hancke</surname>
<given-names>G. P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A review on challenges of autonomous mobile robot and sensor fusion methods</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>39830</fpage>&#x2013;<lpage>39846</lpage>. <pub-id pub-id-type="doi">10.1109/access.2020.2975643</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Bellemare</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Dabney</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Rowland</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <source>Distributional reinforcement learning</source>. <publisher-name>MIT Press</publisher-name>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Blei</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Kucukelbir</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>McAuliffe</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Variational inference: a review for statisticians</article-title>. <source>J. Am. Stat. Assoc.</source> <volume>112</volume>, <fpage>859</fpage>&#x2013;<lpage>877</lpage>. <pub-id pub-id-type="doi">10.1080/01621459.2017.1285773</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Carrillo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Reid</surname>
<given-names>I. D.</given-names>
</name>
<name>
<surname>Castellanos</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>On the comparison of uncertainty criteria for active SLAM</article-title>,&#x201d; in <conf-name>2012 IEEE International Conference on Robotics and Automation</conf-name>, <fpage>2080</fpage>&#x2013;<lpage>2087</lpage>. <pub-id pub-id-type="doi">10.1109/icra.2012.6224890</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carter-Templeton</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Frazier</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wyatt</surname>
<given-names>T. H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Robotics in nursing: a bibliometric analysis</article-title>. <source>J. Nurs. Scholarsh.</source> <volume>50</volume>, <fpage>582</fpage>&#x2013;<lpage>589</lpage>. <pub-id pub-id-type="doi">10.1111/jnu.12399</pub-id>
<pub-id pub-id-type="pmid">29920944</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fitch</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Active SLAM for mobile robots with area coverage and obstacle avoidance</article-title>. <source>IEEE/ASME Trans. Mechatronics</source> <volume>25</volume>, <fpage>1182</fpage>&#x2013;<lpage>1192</lpage>. <pub-id pub-id-type="doi">10.1109/tmech.2019.2963439</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dera</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bouaynaya</surname>
<given-names>N. C.</given-names>
</name>
<name>
<surname>Rasool</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shterenberg</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Fathallah-Shaykh</surname>
<given-names>H. M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>PremiUm-CNN: propagating uncertainty towards robust convolutional neural networks</article-title>. <source>IEEE Trans. Signal Process.</source> <volume>69</volume>, <fpage>4669</fpage>&#x2013;<lpage>4684</lpage>. <pub-id pub-id-type="doi">10.1109/TSP.2021.3096804</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Durner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>M&#xe1;rton</surname>
<given-names>Z.-C.</given-names>
</name>
<name>
<surname>B&#xe1;lint-Bencz&#xe9;di</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Triebel</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Introspective robot perception using smoothed predictions from Bayesian neural networks</article-title>,&#x201d; in <source>The international symposium of robotics research</source> (<publisher-name>Springer</publisher-name>), <fpage>660</fpage>&#x2013;<lpage>675</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gal</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ghahramani</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Bayesian convolutional neural networks with Bernoulli approximate variational inference</article-title>,&#x201d; in <source>Proceedings of 4th international conference on learning representations, (ICLR) workshop track</source>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Goodfellow</surname>
<given-names>I. J.</given-names>
</name>
<name>
<surname>Shlens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Explaining and harnessing adversarial examples</article-title>,&#x201d; in <source>Proceedings of 3rd international conference on learning representations, (ICLR)</source>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grigsby</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>J. Y.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Towards automatic actor-critic solutions to continuous control</article-title>. <source>arXiv Prepr. arXiv:2106.08918</source>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grondman</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Busoniu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lopes</surname>
<given-names>G. A.</given-names>
</name>
<name>
<surname>Babuska</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>A survey of actor-critic reinforcement learning: standard and natural policy gradients</article-title>. <source>IEEE Trans. Syst. Man, Cybern. Part C Appl. Rev.</source> <volume>42</volume>, <fpage>1291</fpage>&#x2013;<lpage>1307</lpage>. <pub-id pub-id-type="doi">10.1109/tsmcc.2012.2218595</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Holly</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Lillicrap</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Deep reinforcement learning for robotic manipulation with asynchronous off-policy updates</article-title>,&#x201d; in <source>IEEE international conference on robotics and automation (ICRA)</source>, <fpage>3389</fpage>&#x2013;<lpage>3396</lpage>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fernando</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Simultaneous localization and mapping (SLAM) and data fusion in unmanned aerial vehicles: recent advances and challenges</article-title>. <source>Drones</source> <volume>6</volume>, <fpage>85</fpage>. <pub-id pub-id-type="doi">10.3390/drones6040085</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Harerimana</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J. W.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Q-learning algorithms: a comprehensive classification and applications</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>133653</fpage>&#x2013;<lpage>133667</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2941229</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kendall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gal</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>What uncertainties do we need in Bayesian deep learning for computer vision?</article-title>,&#x201d; in <conf-name>Proceedings of the 31st International Conference on Neural Information Processing Systems</conf-name>, <conf-loc>Long Beach, CA</conf-loc>, <conf-date>December 4&#x2013;9, 2017</conf-date>. (<publisher-loc>Red Hook, NY, United States</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>), <fpage>5580</fpage>&#x2013;<lpage>5590</lpage>.</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Leung</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dissanayake</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>Active SLAM in structured environments</article-title>,&#x201d; in <source>IEEE international conference on robotics and automation</source>, <fpage>1898</fpage>&#x2013;<lpage>1903</lpage>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liaqat</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hutabarat</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Tiwari</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Tinkler</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Harra</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Morgan</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Autonomous mobile robots in manufacturing: highway code development, simulation, and testing</article-title>. <source>Int. J. Adv. Manuf. Technol.</source> <volume>104</volume>, <fpage>4617</fpage>&#x2013;<lpage>4628</lpage>. <pub-id pub-id-type="doi">10.1007/s00170-019-04257-1</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lv</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021a</year>). &#x201c;<article-title>Exploration via distributional reinforcement learning with epistemic and aleatoric uncertainty estimation</article-title>,&#x201d; in <source>2021 IEEE 17th international conference on automation science and engineering (CASE)</source>, <fpage>2256</fpage>&#x2013;<lpage>2261</lpage>. <pub-id pub-id-type="doi">10.1109/CASE49439.2021.9551544</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Nageotte</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zanne</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>de Mathelin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dresp-Langley</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Deep reinforcement learning for the control of robotic manipulation: a focussed mini-review</article-title>. <source>Robotics</source> <volume>10</volume>, <fpage>22</fpage>. <pub-id pub-id-type="doi">10.3390/robotics10010022</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Macenski</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Foote</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gerkey</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lalancette</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Woodall</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Robot operating system 2: design, architecture, and uses in the wild</article-title>. <source>Sci. Robotics</source> <volume>7</volume>, <fpage>eabm6074</fpage>. <pub-id pub-id-type="doi">10.1126/scirobotics.abm6074</pub-id>
<pub-id pub-id-type="pmid">35544605</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mih&#xe1;lik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Malobick&#xfd;</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Peniak</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Vestenick&#xfd;</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>The new method of active SLAM for mapping using LiDAR</article-title>. <source>Electronics</source> <volume>11</volume>, <fpage>1082</fpage>. <pub-id pub-id-type="doi">10.3390/electronics11071082</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mnih</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kavukcuoglu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Veness</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bellemare</surname>
<given-names>M. G.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Human-level control through deep reinforcement learning</article-title>. <source>Nature</source> <volume>518</volume>, <fpage>529</fpage>&#x2013;<lpage>533</lpage>. <pub-id pub-id-type="doi">10.1038/nature14236</pub-id>
<pub-id pub-id-type="pmid">25719670</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Mnih</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Badia</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Mirza</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Graves</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lillicrap</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Harley</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). &#x201c;<article-title>Asynchronous methods for deep reinforcement learning</article-title>,&#x201d; in <source>International conference on machine learning</source>, <fpage>1928</fpage>&#x2013;<lpage>1937</lpage>.</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morales</surname>
<given-names>E. F.</given-names>
</name>
<name>
<surname>Murrieta-Cid</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Becerra</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Esquivel-Basaldua</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A survey on deep learning and deep reinforcement learning in robotics with a tutorial on deep reinforcement learning</article-title>. <source>Intell. Serv. Robot.</source> <volume>14</volume>, <fpage>773</fpage>&#x2013;<lpage>805</lpage>. <pub-id pub-id-type="doi">10.1007/s11370-021-00398-z</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Nam</surname>
<given-names>D. V.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>G.-W.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Solid-state LiDAR based-SLAM: a concise review and application</article-title>,&#x201d; in <source>IEEE international conference on big data and smart computing (BigComp)</source>, <fpage>302</fpage>&#x2013;<lpage>305</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niloy</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Shama</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chakrabortty</surname>
<given-names>R. K.</given-names>
</name>
<name>
<surname>Ryan</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Badal</surname>
<given-names>F. R.</given-names>
</name>
<name>
<surname>Tasneem</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Critical design and control issues of indoor autonomous mobile robots: a review</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>35338</fpage>&#x2013;<lpage>35370</lpage>. <pub-id pub-id-type="doi">10.1109/access.2021.3062557</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Palomeras</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Carreras</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Andrade-Cetto</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Active SLAM for autonomous underwater exploration</article-title>. <source>Remote Sens.</source> <volume>11</volume>, <fpage>2827</fpage>. <pub-id pub-id-type="doi">10.3390/rs11232827</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Pinto</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Davidson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sukthankar</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Robust adversarial reinforcement learning</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-loc>Sydney, NSW, Australia</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>2817</fpage>&#x2013;<lpage>2826</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Plaat</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <source>Deep reinforcement learning</source>. <publisher-name>Springer Nature Singapore</publisher-name>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Placed</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Castellanos</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A deep reinforcement learning approach for active SLAM</article-title>. <source>Appl. Sci.</source> <volume>10</volume>, <fpage>8386</fpage>. <pub-id pub-id-type="doi">10.3390/app10238386</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Placed</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Castellanos</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A general relationship between optimality criteria and connectivity indices for active graph-SLAM</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>8</volume>, <fpage>816</fpage>&#x2013;<lpage>823</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2022.3233230</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Placed</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Strader</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Carrillo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Atanasov</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Indelman</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Carlone</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>A survey on active simultaneous localization and mapping: state of the art and new frontiers</article-title>. <source>IEEE Trans. Robotics</source> <volume>39</volume>, <fpage>1686</fpage>&#x2013;<lpage>1705</lpage>. <pub-id pub-id-type="doi">10.1109/tro.2023.3248510</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Pukelsheim</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2006</year>). <source>Optimal design of experiments</source>. <publisher-loc>Philadelphia, PA, United States</publisher-loc>: <publisher-name>Society for Industrial and Applied Mathematics</publisher-name>. <pub-id pub-id-type="doi">10.1137/1.9780898719109</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodr&#xed;guez-Ar&#xe9;valo</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Neira</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Castellanos</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>On the importance of uncertainty representation in active SLAM</article-title>. <source>IEEE Trans. Robotics</source> <volume>34</volume>, <fpage>829</fpage>&#x2013;<lpage>834</lpage>. <pub-id pub-id-type="doi">10.1109/tro.2018.2808902</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Sewak</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Deep reinforcement learning</source>. <publisher-name>Springer</publisher-name>.</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tobin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ray</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Domain randomization for transferring deep neural networks from simulation to the real world</article-title>,&#x201d; in <conf-name>2017 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <fpage>23</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1109/iros.2017.8202133</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Trivun</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>&#x160;alaka</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Osmankovi&#x107;</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Velagi&#x107;</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Osmi&#x107;</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Active SLAM-based algorithm for autonomous exploration with a mobile robot</article-title>,&#x201d; in <source>IEEE international conference on industrial technology (ICIT)</source>, <fpage>74</fpage>&#x2013;<lpage>79</lpage>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van Hasselt</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Guez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Deep reinforcement learning with double Q-learning</article-title>. <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>30</volume>, <fpage>2094</fpage>&#x2013;<lpage>2100</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v30i1.10295</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Schaul</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hessel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Van Hasselt</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lanctot</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>De Freitas</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Dueling network architectures for deep reinforcement learning</article-title>,&#x201d; in <source>International conference on machine learning</source>, <fpage>1995</fpage>&#x2013;<lpage>2003</lpage>.</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Probabilistic deep learning based on Bayes by backprop for remaining useful life prognostics of consumer electronics</article-title>. <source>IEEE Trans. Consumer Electron.</source> <volume>71</volume>, <fpage>839</fpage>&#x2013;<lpage>848</lpage>. <pub-id pub-id-type="doi">10.1109/TCE.2024.3507006</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wijayathunga</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Rassau</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chai</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Challenges and solutions for autonomous ground robot scene understanding and navigation in unstructured outdoor environments: a review</article-title>. <source>Appl. Sci.</source> <volume>13</volume>, <fpage>9877</fpage>. <pub-id pub-id-type="doi">10.3390/app13179877</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wong</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>X.-T.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Autonomous robots for harsh environments: a holistic overview of current solutions and ongoing challenges</article-title>. <source>Syst. Sci. Control Eng.</source> <volume>6</volume>, <fpage>213</fpage>&#x2013;<lpage>219</lpage>. <pub-id pub-id-type="doi">10.1080/21642583.2018.1477634</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Feature extraction and uncorrelated discriminant analysis for high-dimensional data</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>20</volume>, <fpage>601</fpage>&#x2013;<lpage>614</lpage>. <pub-id pub-id-type="doi">10.1109/tkde.2007.190720</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zambaldi</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Raposo</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Santoro</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bapst</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Babuschkin</surname>
<given-names>I.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Relational deep reinforcement learning</article-title>. <source>arXiv Prepr. arXiv:1806.01830</source>.</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zamora</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Lopez</surname>
<given-names>N. G.</given-names>
</name>
<name>
<surname>Vilches</surname>
<given-names>V. M.</given-names>
</name>
<name>
<surname>Cordero</surname>
<given-names>A. H.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Extending the OpenAI Gym for robotics: a toolkit for reinforcement learning using ROS and Gazebo</article-title>. <source>arXiv Prepr. arXiv:1608.05742</source>.</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>1998</year>). <source>Essentials of robust control</source>, <volume>104</volume>. <publisher-loc>Upper Saddle River, NJ</publisher-loc>: <publisher-name>Prentice Hall</publisher-name>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>