<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1734564</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2025.1734564</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Adaptive querying for reward learning from human feedback</article-title>
<alt-title alt-title-type="left-running-head">Anand et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2025.1734564">10.3389/frobt.2025.1734564</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Anand</surname>
<given-names>Yashwanthi</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3110311"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Nwagwu</surname>
<given-names>Nnamdi</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/2532683"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sabbe</surname>
<given-names>Kevin</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3342431"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fitter</surname>
<given-names>Naomi T.</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/471491"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Saisubramanian</surname>
<given-names>Sandhya</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3340745"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>Oregon State University</institution>, <city>Corvallis</city>, <state>OR</state>, <country country="US">United States</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Sandhya Saisubramanian, <email xlink:href="mailto:sandhya.sai@oregonstate.edu">sandhya.sai@oregonstate.edu</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-12">
<day>12</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1734564</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>04</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Anand, Nwagwu, Sabbe, Fitter and Saisubramanian.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Anand, Nwagwu, Sabbe, Fitter and Saisubramanian</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-12">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Learning from human feedback is a popular approach to train robots to adapt to user preferences and improve safety. Existing approaches typically consider a single querying (interaction) format when seeking human feedback and do not leverage multiple modes of user interaction with a robot. We examine how to learn a penalty function associated with unsafe behaviors using <italic>multiple</italic> forms of human feedback, by optimizing both the <italic>query state</italic> and <italic>feedback format</italic>. Our proposed <italic>adaptive feedback selection</italic> is an iterative, two-phase approach which first selects critical states for querying, and then uses information gain to select a feedback format for querying across the sampled critical states. The feedback format selection also accounts for the cost and probability of receiving feedback in a certain format. Our experiments in simulation demonstrate the sample efficiency of our approach in learning to avoid undesirable behaviors. The results of our user study with a physical robot highlight the practicality and effectiveness of adaptive feedback selection in seeking informative, user-aligned feedback that accelerates learning. Experiment videos, code and supplementary materials are found on our website: <ext-link ext-link-type="uri" xlink:href="https://tinyurl.com/AFS-learning">https://tinyurl.com/AFS-learning</ext-link>.</p>
</abstract>
<kwd-group>
<kwd>information gain</kwd>
<kwd>interactive imitation learning</kwd>
<kwd>learning from human feedback</kwd>
<kwd>learning from multiple formats</kwd>
<kwd>robot learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported in part by National Science Foundation grant number 2416459.</funding-statement>
</funding-group>
<counts>
<fig-count count="11"/>
<table-count count="1"/>
<equation-count count="4"/>
<ref-count count="51"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Intelligence in Robotics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>A key factor affecting an autonomous agent&#x2019;s behavior is its reward function. Due to the complexity of real-world environments and the practical challenges in reward design, agents often operate with incomplete reward functions corresponding to underspecified objectives, which can lead to unintended and undesirable behaviors such as negative side effects (NSEs) (<xref ref-type="bibr" rid="B2">Amodei et al., 2016</xref>; <xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>; <xref ref-type="bibr" rid="B43">Srivastava et al., 2023</xref>). For example, a robot optimizing the distance to transport an object to the goal, may damage items along the way if its reward function does not model the undesirability of colliding into other objects in the way (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>An illustration of adaptive feedback selection. The robot arm learns to move the blue object to the white bin, without colliding with other objects in the way, by querying the human in different formats across the state space.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g001.tif">
<alt-text content-type="machine-generated">A robotic arm is positioned in a workspace with a green background. It&#x2019;s shown making a movement over boxes and objects. A red dashed line indicates the trajectory before learning, while a purple line shows the trajectory after learning. Annotations with correction and rank information are placed near the lines.</alt-text>
</graphic>
</fig>
<p>Human feedback offers a natural way to provide the missing knowledge, and several prior works have examined learning from various forms of human feedback to improve robot performance, including avoiding side effects (<xref ref-type="bibr" rid="B11">Cui and Niekum, 2018</xref>; <xref ref-type="bibr" rid="B13">Cui et al., 2021b</xref>; <xref ref-type="bibr" rid="B28">Lakkaraju et al., 2017</xref>; <xref ref-type="bibr" rid="B31">Ng and Russell, 2000</xref>; <xref ref-type="bibr" rid="B39">Saran et al., 2021</xref>; <xref ref-type="bibr" rid="B51">Zhang et al., 2020</xref>). In many real-world settings, the human can provide feedback in many forms, ranging from binary signals indicating action approval to correcting robot actions, each varying in the granularity of information revealed to the robot and the human effort required to provide it. For instance, a person supervising a household robot may occasionally be willing to provide detailed corrections when the robot encounters a fragile vase but may only want to give quick binary approvals during a routine motion. Ignoring this variability either limits what the robot can learn or burdens the user. To efficiently balance the <italic>trade-off</italic> between seeking feedback in a format that accelerates robot learning and reducing human effort involved, it is beneficial to seek detailed feedback sparingly in certain states and complement it with feedback types that require less human effort in other states. Such an approach could also reduce the sampling biases associated with learning from any one format, thereby improving learning performance (<xref ref-type="bibr" rid="B38">Saisubramanian et al., 2022</xref>). In fact, a recent study indicates that users are generally willing to engage with the robot in more than one feedback format (<xref ref-type="bibr" rid="B37">Saisubramanian et al., 2021b</xref>). 
However, existing approaches rarely exploit this flexibility, and <italic>do not support</italic> gathering feedback in different formats in different regions of the state space (<xref ref-type="bibr" rid="B12">Cui et al., 2021a</xref>; <xref ref-type="bibr" rid="B41">Settles, 1995</xref>).</p>
<p>These practical considerations motivate the core question of this paper: &#x201c;How can a robot identify <italic>when to query</italic> and in <italic>what format</italic>, while accounting for the cost and availability of different forms of feedback?&#x201d; We present a framework for <italic>adaptive feedback selection</italic> (AFS) that enables a robot to seek feedback in multiple formats in its learning phase, such that its information gain is maximized. Rather than treating all states and feedback formats uniformly, AFS prioritizes human feedback in states where feedback is most valuable and chooses feedback types based on their expected cost and information gain. This design reduces user effort, accommodates different levels of feedback granularity, and focuses on states where learning improves safety. In the interest of clarity, the rest of this paper grounds the discussion of AFS as an approach for robots to learn to avoid negative side effects (NSEs) of their actions. The NSEs refer to unintended and undesirable outcomes that arise as the agent performs its assigned task. In the object delivery example in <xref ref-type="fig" rid="F1">Figure 1</xref>, the robot may inadvertently collide with other objects on the table, producing NSEs. Focusing on NSEs provides a well-defined and measurable setting&#x2013;quantified by the number of NSE occurrences&#x2013;to evaluate how AFS improves an agent&#x2019;s learning efficiency and safety. However, note that AFS is a general technique that can be applied broadly to learn about various forms of undesirable behavior.</p>
<p>Minimizing NSEs using AFS involves four iterative steps (<xref ref-type="fig" rid="F4">Figure 4</xref>): (1) states are partitioned into clusters, with a cluster weight proportional to the number of NSEs discovered in it; (2) a set of critical states&#x2014;states where human feedback is crucial for learning an association of state features and NSEs, i.e., a predictive model of NSE severity&#x2014;is formed by sampling from each cluster based on its weight; (3) a feedback format that maximizes the information gain in critical states is identified, while accounting for the cost and uncertainty in receiving a feedback, using the human feedback preference model; and (4) cluster weights and information gain are updated, and a new set of critical states are sampled to learn about NSEs, until the querying budget expires. The learned NSE information is mapped to a penalty function and augmented to the robot&#x2019;s model to compute an NSE-minimizing policy to complete its task.</p>
<p>We evaluate AFS in both simulation and using a user study where participants interact with a robot arm. First, we evaluate the approach in three simulated proof-of-concept settings with simulated human feedback. Second, we conduct a pilot study where 12 human participants interact with and provide feedback to the agent in a simulated gridworld domain. Finally, we evaluate using a Kinova Gen3 7DoF arm and 30 human participants. Besides the performance and sample efficiency, our experiments also provide insights into how the querying process can influence user trust. Together, these complementary studies demonstrate both the practicality and effectiveness of AFS.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Background and related work</title>
<sec id="s2-1">
<label>2.1</label>
<title>Markov Decision Processes (MDPs)</title>
<p>MDPs are a popular framework to model sequential decision-making problems. An MDP is defined by the tuple <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the set of states, <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the set of actions, <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the probability of reaching state <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> after taking an action <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> from a state <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the reward for taking action <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in state <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. An optimal deterministic policy <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
<mml:mo>&#x2a;</mml:mo>
<mml:mo>:</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is one that maximizes the expected reward. When the objective or reward function is incomplete, even an optimal policy can produce unsafe behaviors such as side effects. Negative Side Effects (NSEs) are immediate, undesired, unmodeled effects of an agent&#x2019;s actions on the environment (<xref ref-type="bibr" rid="B26">Krakovna et al., 2018</xref>; <xref ref-type="bibr" rid="B35">Saisubramanian and Zilberstein, 2021</xref>; <xref ref-type="bibr" rid="B43">Srivastava et al., 2023</xref>). We focus on NSEs arising due to an incomplete reward function (<xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>), which we mitigate by learning a penalty function using human feedback.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Learning from human feedback</title>
<p>Learning from human feedback is a popular approach to train agents when reward functions are unavailable or incomplete (<xref ref-type="bibr" rid="B1">Abbeel and Ng, 2004</xref>; <xref ref-type="bibr" rid="B31">Ng and Russell, 2000</xref>; <xref ref-type="bibr" rid="B34">Ross et al., 2011</xref>; <xref ref-type="bibr" rid="B30">Najar and Chetouani, 2021</xref>), including to improve safety (<xref ref-type="bibr" rid="B8">Brown et al., 2020b</xref>; <xref ref-type="bibr" rid="B6">2018</xref>; <xref ref-type="bibr" rid="B17">Hadfield-Menell et al., 2017</xref>; <xref ref-type="bibr" rid="B33">Ramakrishnan et al., 2020</xref>; <xref ref-type="bibr" rid="B51">Zhang et al., 2020</xref>; <xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>; <xref ref-type="bibr" rid="B19">Hassan et al., 2025</xref>). Feedback can take various forms such as <italic>demonstrations</italic> (<xref ref-type="bibr" rid="B32">Ramachandran and Amir, 2007</xref>; <xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>; <xref ref-type="bibr" rid="B40">Seo and Unhelkar, 2024</xref>; <xref ref-type="bibr" rid="B50">Zha et al., 2024</xref>), <italic>corrections</italic> (<xref ref-type="bibr" rid="B14">Cui et al., 2023</xref>; <xref ref-type="bibr" rid="B3">B&#xe4;rmann et al., 2024</xref>), <italic>critiques</italic> (<xref ref-type="bibr" rid="B11">Cui and Niekum, 2018</xref>; <xref ref-type="bibr" rid="B45">Tarakli et al., 2024</xref>), <italic>ranking</italic> trajectories (<xref ref-type="bibr" rid="B7">Brown et al., 2020a</xref>; <xref ref-type="bibr" rid="B47">Xue et al., 2024</xref>; <xref ref-type="bibr" rid="B15">Feng et al., 2025</xref>), natural language instructions (<xref ref-type="bibr" rid="B29">Lou et al., 2024</xref>; <xref ref-type="bibr" rid="B48">Yang Y. 
et al., 2024</xref>; <xref ref-type="bibr" rid="B19">Hassan et al., 2025</xref>) or may be <italic>implicit</italic> in the form of facial expressions and gestures (<xref ref-type="bibr" rid="B13">Cui et al., 2021b</xref>; <xref ref-type="bibr" rid="B44">Strokina et al., 2022</xref>; <xref ref-type="bibr" rid="B9">Candon et al., 2023</xref>).</p>
<p>While the existing approaches for learning from feedback have shown success, they typically assume that a single feedback type is used to teach the agent. This assumption limits learning efficiency and adaptability. Some efforts combine demonstrations with preferences (<xref ref-type="bibr" rid="B5">B&#x131;y&#x131;k et al., 2022</xref>; <xref ref-type="bibr" rid="B21">Ibarz et al., 2018</xref>), showing that utilizing more than one format accelerates learning. Extending this idea, recent works integrate richer modalities such as language and vision with demonstrations. <xref ref-type="bibr" rid="B49">Yang Z. et al. (2024)</xref> learn reward function from comparative language feedback, while <xref ref-type="bibr" rid="B42">Sontakke et al. (2023)</xref> show that a single demonstration or natural language description can help define a proxy reward when used along with a vision-language model (VLM) that is pretrained on a large amount of out-of-domain video demonstrations and language pairs. <xref ref-type="bibr" rid="B24">Kim et al. (2023)</xref> use multimodal embeddings of visual observations and natural language descriptions to compute alignment-based rewards. A recent study even emphasizes that combining multiple feedback modalities can further enhance learning outcomes (<xref ref-type="bibr" rid="B4">Beierling et al., 2025</xref>). Together, these works highlight that combining complementary feedback formats helps advance reward learning beyond using a fixed feedback format. Building on this insight, our approach uses multiple forms of human feedback for learning.</p>
<p>Learning from human feedback has also been used for modeling variations in human behavior. <xref ref-type="bibr" rid="B20">Huang et al. (2024)</xref> model the heterogeneous behaviors of humans, capturing differences in feedback frequency, delay, strictness, and bias to improve the robustness during the learning process, as optimal behaviors vary across users. Along the same line, the reward learning approach proposed by <xref ref-type="bibr" rid="B16">Ghosal et al. (2023)</xref>, selects a single feedback format based on the user&#x2019;s ability to provide feedback in that format, resulting in an interaction that is tailored to a user&#x2019;s skill level. Collectively, these works reveal a shift towards adaptive and user-aware querying mechanisms that improve reward inference and learning efficiency, motivating our approach to dynamically select both when to query and in what feedback format.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Problem formulation</title>
<p>
<bold>Setting:</bold> Consider a robot operating in a discrete environment modeled as a Markov Decision Process (MDP), using its acquired model <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The robot optimizes the completion of its assigned task, which is its primary objective described by reward <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. A <italic>primary policy</italic>, <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, is an optimal policy for the robot&#x2019;s primary objective.</p>
<p>
<bold>Assumption 1.</bold> Similar to (<xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>), we assume that the agent&#x2019;s model <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> has all the necessary information for the robot to successfully complete its assigned task but lacks other superfluous details that are unrelated to the task.</p>
<p>Since the model is incomplete in ways unrelated to the primary objective, executing the primary policy produces negative side effects (NSEs) that are difficult to identify at design time. Following (<xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>), we define NSEs as immediate, undesired, unmodeled effects of a robot&#x2019;s actions on the environment. We focus on settings where the robot has <italic>no prior knowledge</italic> about the NSEs of its actions or the underlying true NSE penalty function <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. It learns to avoid NSEs by learning a penalty function <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from human feedback that is consistent with <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>We target settings where the human can provide feedback in multiple ways and the robot can seek feedback in a <italic>specific</italic> format such as approval or corrections. This represents a significant shift from traditional active learning methods, which typically gather feedback only in a single format (<xref ref-type="bibr" rid="B33">Ramakrishnan et al., 2020</xref>; <xref ref-type="bibr" rid="B36">Saisubramanian et al., 2021a</xref>; <xref ref-type="bibr" rid="B39">Saran et al., 2021</xref>). Using the learned <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the robot computes an NSE-minimizing policy to complete its task by optimizing: <inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are fixed, tunable weights denoting priority over objectives.</p>
<p>
<bold>Running Example:</bold> We illustrate the problem using a simple object delivery task using a Kinova Gen3 7DoF arm shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The robot optimizes delivering the blue block to the white bin, by taking the shortest path. However, passing through states with a cardboard box or a glass bowl constitutes an NSE. Since the robot has no prior knowledge about NSEs of its actions, it may inadvertently navigate through these states causing NSEs.</p>
<p>
<bold>Human&#x2019;s Feedback Preference Model:</bold> The feedback format selection must account for the cost and human preferences in providing feedback in a certain format. The user&#x2019;s <italic>feedback preference model</italic> is denoted by <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> where,<list list-type="bullet">
<list-item>
<p>
<inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a predefined set of feedback formats the human can provide, such as demonstrations and corrections;</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf25">
<mml:math id="m25">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
<mml:mo>:</mml:mo>
<mml:mo>&#x2009;&#x2009;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the probability of receiving feedback in a format <inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, denoted as <inline-formula id="inf27">
<mml:math id="m27">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>; and</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf28">
<mml:math id="m28">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>:</mml:mo>
<mml:mo>&#x2009;&#x2009;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a cost function that assigns a cost to each feedback format <inline-formula id="inf29">
<mml:math id="m29">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, representing the human&#x2019;s time or cognitive effort required to provide that feedback.</p>
</list-item>
</list>
</p>
<p>This work assumes the robot has access to the user&#x2019;s feedback preference model <inline-formula id="inf30">
<mml:math id="m30">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2014;either handcrafted by an expert or learned from user interactions prior to robot querying, as in our user study experiments. Abstracting user feedback preferences into probabilities and costs enables generalizing the preferences across similar tasks. We take the pragmatic stance that <inline-formula id="inf31">
<mml:math id="m31">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is independent of time and state, denoting the user&#x2019;s preference about a format, such as not preferring formats that require constant supervision of robot performance. While this can be relaxed and the approach can be extended to account for state-dependent preferences, obtaining an accurate state-dependent <inline-formula id="inf32">
<mml:math id="m32">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> could be challenging in practice.</p>
<p>
<bold>Assumption 2.</bold> Human feedback is immediate and accurate, when available.</p>
<p>Below, we describe the various feedback formats considered in this paper, and how the data from these formats are mapped to NSE severity labels.</p>
<sec id="s3-1">
<label>3.1</label>
<title>Feedback formats studied</title>
<p>The agent learns an association between state-action pairs and NSE severity, based on the human feedback provided in response to agent queries. The NSE categories we consider in this work are <inline-formula id="inf33">
<mml:math id="m33">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mtext>No&#x2009;NSE</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Mild&#x2009;NSE</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Severe&#x2009;NSE</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. We focus on the following commonly used feedback types, each differing in the level of information conveyed to the agent and the human effort required to provide them.</p>
<p>
<bold>Approval (App):</bold> The robot randomly selects <inline-formula id="inf34">
<mml:math id="m34">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> state-action pairs from all possible actions in critical states and queries the human for approval or disapproval. Approved actions are labeled as acceptable, while disapproved actions are labeled as unacceptable.</p>
<p>
<bold>Annotated Approval (Ann. App):</bold> An extension of Approval, where the human specifies the <italic>NSE severity</italic> (or category) for each disapproved action in the critical states.</p>
<p>
<bold>Corrections (Corr):</bold> The robot performs a trajectory of its primary policy in the critical states, under human supervision. If the robot&#x2019;s action is unacceptable, then the human intervenes with an acceptable action in these states. If all actions in a state lead to NSE, the human specifies an action with the least NSE. When interrupted, the robot assumes all actions except the correction are unacceptable in that state.</p>
<p>
<bold>Annotated Corrections (Ann. Corr):</bold> An extension of Corrections, where the human specifies the severity of NSEs caused by the robot&#x2019;s unacceptable action in critical states.</p>
<p>
<bold>Rank:</bold> The robot randomly selects <inline-formula id="inf35">
<mml:math id="m35">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> ranking queries of the form <inline-formula id="inf36">
<mml:math id="m36">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mtext mathvariant="italic">state</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext mathvariant="italic">action</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext mathvariant="italic">action</mml:mtext>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, by sampling two actions for each critical state. The human selects the safer action among the two options. If both are safe or unsafe, one of them is selected at random. The selected action is marked as acceptable and the other is treated as unacceptable.</p>
<p>
<bold>Demo-Action Mismatch (DAM):</bold> The human demonstrates a safe action in each critical state, which the robot compares with its policy. All mismatched robot&#x2019;s actions are labeled as unacceptable. Matched actions are labeled as acceptable.</p>
<p>
<bold>Mapping feedback data to NSE severity labels:</bold> We use <inline-formula id="inf37">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf38">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf39">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to denote labels corresponding to no, mild and severe NSEs, respectively. An acceptable action in a state is mapped to <inline-formula id="inf40">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, i.e., <inline-formula id="inf41">
<mml:math id="m41">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, while an unacceptable action is mapped to <inline-formula id="inf42">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. When the severity of NSEs for unacceptable actions is known, actions producing mild NSEs are mapped to <inline-formula id="inf43">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and those producing severe NSEs to <inline-formula id="inf44">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Mapping feedback to this common label set provides a consistent representation of NSE severity across diverse feedback types. The granularity of information and the sampling biases of the different feedback types affect the learned reward. <xref ref-type="fig" rid="F2">Figure 2</xref> illustrates this with the learned NSE penalty for the running example of moving an object to the bin (<xref ref-type="fig" rid="F1">Figure 1</xref>), motivating the need for an adaptive approach that can learn from more than one feedback format. In the running example, the robot arm colliding with cardboard boxes is a mild NSE, and colliding with a glass bowl is a severe NSE.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Visualization of reward learned using different feedback types. (Row 1) Black arrows indicate queries, and feedback is in speech bubbles. <inline-graphic xlink:href="frobt-12-1734564-fx3.tif">
<alt-text content-type="machine-generated">A small red square on a black background.</alt-text>
</inline-graphic>, <inline-graphic xlink:href="frobt-12-1734564-fx2.tif">
<alt-text content-type="machine-generated">Red square in a small image format.</alt-text>
</inline-graphic>, <inline-graphic xlink:href="frobt-12-1734564-fx1.tif">
<alt-text content-type="machine-generated">Solid yellow square with a thin black border.</alt-text>
</inline-graphic> indicate high, mild, and zero penalty, respectively. The outer box shows the true reward, and the inner box shows the learned reward. Mismatches between the outer and inner box colors indicate an incorrectly learned model.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g002.tif">
<alt-text content-type="machine-generated">Flowchart illustrating different feedback and reward types in a grid environment, including &#x22;Approval&#x22;, &#x22;Annotated Approval&#x22;, &#x22;Correction&#x22;, &#x22;Annotated Correction&#x22;, &#x22;Ranking&#x22;, and &#x22;Demo-Action Mismatch&#x22;. The top row depicts different item positions, movements, and feedback indicators like checks and crosses. The bottom row shows corresponding reward levels using color-coded heat maps representing true and learned rewards, with variations in intensity and color.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Adaptive feedback selection</title>
<p>Given an agent&#x2019;s decision making model <inline-formula id="inf45">
<mml:math id="m45">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the human&#x2019;s feedback preference model <inline-formula id="inf46">
<mml:math id="m46">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, AFS enables the agent to query for feedback in critical states in a format that maximizes its information gain. We first formalize the NSE model learning process and then describe in detail how AFS selects critical states and the query format.</p>
<p>
<bold>Formalizing NSE Model Learning:</bold> Let <inline-formula id="inf47">
<mml:math id="m47">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2a;</mml:mo>
<mml:mo>:</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the <italic>true</italic> NSE severity label for each state-action pair, which is unknown to the agent but known to the human. The label <inline-formula id="inf48">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> corresponds to <italic>no NSE</italic>, <inline-formula id="inf49">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes <italic>mild NSE</italic>, <inline-formula id="inf50">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes <italic>severe NSE</italic>. Let <inline-formula id="inf51">
<mml:math id="m51">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> be a sampled approximation of <inline-formula id="inf52">
<mml:math id="m52">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf53">
<mml:math id="m53">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, denoting the dataset of NSE labels collected via human feedback in response to the <inline-formula id="inf54">
<mml:math id="m54">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> pairs queried. That is, <inline-formula id="inf55">
<mml:math id="m55">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the data collected from human feedback until iteration <inline-formula id="inf56">
<mml:math id="m56">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf57">
<mml:math id="m57">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the categorical NSE severity label assigned to the state-action pair <inline-formula id="inf58">
<mml:math id="m58">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Let <inline-formula id="inf59">
<mml:math id="m59">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the labels predicted by the learned NSE model&#x2014;learned using a supervised classifier with <inline-formula id="inf60">
<mml:math id="m60">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as the training data. In this paper, we use a Random Forest (RF) classifier, though any classifier can be used in practice. Hyperparameters are optimized through randomized search with three-fold cross validation, and the configuration yielding the lowest mean-squared error is selected for training.</p>
<p>
<xref ref-type="fig" rid="F3">Figure 3</xref> shows an example of <inline-formula id="inf61">
<mml:math id="m61">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf62">
<mml:math id="m62">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for the object delivery task. We encode NSE categories as <inline-formula id="inf63">
<mml:math id="m63">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>0,1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> corresponding to <inline-formula id="inf64">
<mml:math id="m64">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mtext>&#x2009;no&#x2009;NSE,&#x2009;mild&#x2009;NSE,&#x2009;severe&#x2009;NSE</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> respectively. Each state has four possible actions <inline-formula id="inf65">
<mml:math id="m65">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the vector <inline-formula id="inf66">
<mml:math id="m66">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> (and similarly <inline-formula id="inf67">
<mml:math id="m67">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>) encodes the categorical NSE labels for <inline-formula id="inf68">
<mml:math id="m68">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> in that order. Since the human&#x2019;s categorization of NSE is initially unknown, <inline-formula id="inf69">
<mml:math id="m69">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is sampled from a uniform prior over the labels, and <inline-formula id="inf70">
<mml:math id="m70">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is initialized to [0,0,0,0] (all actions are assumed to be safe) across all states.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Illustration of <inline-formula id="inf71">
<mml:math id="m71">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (accumulated feedback) and <inline-formula id="inf72">
<mml:math id="m72">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (generalized NSE labels) for the object delivery task. <inline-formula id="inf73">
<mml:math id="m73">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> indicates the feedback formats selected until iteration <inline-formula id="inf74">
<mml:math id="m74">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-graphic xlink:href="frobt-12-1734564-fx1.tif">
<alt-text content-type="machine-generated">Solid yellow square with a thin black border.</alt-text>
</inline-graphic> indicates no NSE; <inline-graphic xlink:href="frobt-12-1734564-fx2.tif">
<alt-text content-type="machine-generated">Red square in a small image format.</alt-text>
</inline-graphic> indicates mild NSE; <inline-graphic xlink:href="frobt-12-1734564-fx3.tif">
<alt-text content-type="machine-generated">A small red square on a black background.</alt-text>
</inline-graphic> indicates severe NSE. Queried states in each iteration are highlighted in blue.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g003.tif">
<alt-text content-type="machine-generated">Two grid diagrams showing iterations \(t-1\) and \(t\) for sequences \(p\) and \(q\). At \(t-1\), sequence \(p\) has an orange highlighted cell in the first column, and sequence \(q\) has a blue highlighted cell in the first column, both with set \(\{Ann. App\}\). At \(t\), sequence \(p\) has a red highlighted cell in the second column, and sequence \(q\) has a red highlighted cell in the fourth column, both with set \(\{Ann. App, App\}\).</alt-text>
</graphic>
</fig>
<p>At <inline-formula id="inf75">
<mml:math id="m75">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf76">
<mml:math id="m76">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> reflects a single labeled state from the feedback received, while <inline-formula id="inf77">
<mml:math id="m77">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> reflects the NSE label for the state after learning from <inline-formula id="inf78">
<mml:math id="m78">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. For example, in iteration <inline-formula id="inf79">
<mml:math id="m79">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, an action <inline-formula id="inf80">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in state <inline-formula id="inf81">
<mml:math id="m81">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is randomly selected for querying using the <italic>Annotated Approval</italic> feedback format. The human labels it as mild NSE, so <inline-formula id="inf82">
<mml:math id="m82">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and consequently <inline-formula id="inf83">
<mml:math id="m83">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,0,1,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. After training on <inline-formula id="inf84">
<mml:math id="m84">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, the classifier may sometimes incorrectly predict <inline-formula id="inf85">
<mml:math id="m85">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, especially in early iterations when there is less data. At the next iteration <inline-formula id="inf86">
<mml:math id="m86">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the agent queries in a similar state using the <italic>Approval</italic> format, where the action <inline-formula id="inf87">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is randomly selected. Because the NSE severity level (i.e., mild/severe) cannot be indicated through the Approval format, <inline-formula id="inf88">
<mml:math id="m88">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is updated as <inline-formula id="inf89">
<mml:math id="m89">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>2,0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and training now yields a prediction <inline-formula id="inf90">
<mml:math id="m90">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>2,0,1,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> (i.e., the NSE model predicts a severe NSE outcome on <inline-formula id="inf91">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and a mild NSE outcome on <inline-formula id="inf92">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). This illustrates that <inline-formula id="inf93">
<mml:math id="m93">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> may initially disagree with <inline-formula id="inf94">
<mml:math id="m94">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, but as feedback accumulates on related states, the generalization of <inline-formula id="inf95">
<mml:math id="m95">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> across actions begins to align with <inline-formula id="inf96">
<mml:math id="m96">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Each predicted label is then mapped to a penalty value to form the learned penalty function, <inline-formula id="inf97">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, with penalties for <inline-formula id="inf98">
<mml:math id="m98">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf99">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> set to <inline-formula id="inf100">
<mml:math id="m100">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf101">
<mml:math id="m101">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, in our experiments. This penalty function is integrated into the agent&#x2019;s reward model to compute an updated policy that minimizes NSEs while completing the primary task.</p>
<p>In this learning setup, minimizing NSEs using AFS involves four iterative steps (<xref ref-type="fig" rid="F4">Figure 4</xref>). In each learning iteration, AFS identifies (1) which states are most critical for querying (<xref ref-type="sec" rid="s4-1">Section 4.1</xref>), and (2) which feedback format maximizes the expected information gain at the critical states, while accounting for user feedback preferences and effort involved (<xref ref-type="sec" rid="s4-2">Section 4.2</xref>). The information gain associated with feedback quantifies the effect of that feedback in improving the agent&#x2019;s understanding of the underlying reward function, and is measured using Kullback-Leibler (KL) Divergence (<xref ref-type="bibr" rid="B16">Ghosal et al., 2023</xref>; <xref ref-type="bibr" rid="B46">Tien et al., 2023</xref>). At the end of each iteration, the cluster weights and information gain are updated, and a new set of critical states is sampled to learn about NSEs, until the querying budget expires or the KL-divergence is below a problem-specific, pre-defined threshold.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Solution approach overview. The critical states <inline-formula id="inf102">
<mml:math id="m102">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for querying are selected by clustering the states. A feedback format <inline-formula id="inf103">
<mml:math id="m103">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> that maximizes information gain is selected for querying the user across <inline-formula id="inf104">
<mml:math id="m104">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The NSE model is iteratively refined based on feedback. An updated policy is calculated using a penalty function <inline-formula id="inf105">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, derived from the learned NSE model.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g004.tif">
<alt-text content-type="machine-generated">Flowchart depicting a process for feedback-driven learning in critical states. It illustrates steps for selecting optimal feedback format, updating information gain, and learning the NSE severity prediction model. The chart includes visual representations of feedback methods such as demonstration, approval, and correction. It involves computing policies, updating budgets, and executing actions through supervised learning, with mathematical formulas for optimization. Various components like critical states, feedback reception, budget updates, and policy computation are intertwined in the learning process. The chart is structured for clarity with arrows denoting process flow and decision points.</alt-text>
</graphic>
</fig>
<sec id="s4-1">
<label>4.1</label>
<title>Critical states selection</title>
<p>When the budget for querying a human is limited, it is useful to query in states with a high <italic>learning gap</italic> measured as the KL-divergence between the agent&#x2019;s knowledge of NSE severity and the true NSE severity given the feedback data collected so far. States with a high learning gap are called <italic>critical states</italic> <inline-formula id="inf106">
<mml:math id="m106">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and querying in these states can reduce the learning gap.</p>
<p>Since <inline-formula id="inf107">
<mml:math id="m107">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf108">
<mml:math id="m108">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> contain categorical values rather than probabilities, their corresponding empirical probability mass functions (PMFs) are computed over the three NSE categories (no NSE, mild NSE, and severe NSE), yielding <inline-formula id="inf109">
<mml:math id="m109">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf110">
<mml:math id="m110">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. In this case, <inline-formula id="inf111">
<mml:math id="m111">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf112">
<mml:math id="m112">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> will be vectors of length three, since we consider three NSE categories.</p>
<p>In order to select critical states for querying, we compute the KL divergence between <inline-formula id="inf113">
<mml:math id="m113">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf114">
<mml:math id="m114">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf115">
<mml:math id="m115">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Although <inline-formula id="inf116">
<mml:math id="m116">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> may appear as a reasonable criterion to guide critical states selection, it only measures how well the agent learns from the feedback at <inline-formula id="inf117">
<mml:math id="m117">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. It does not reveal states where the agent&#x2019;s predictions were incorrect. For the example shown in <xref ref-type="fig" rid="F3">Figure 3</xref> with <inline-formula id="inf118">
<mml:math id="m118">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf119">
<mml:math id="m119">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>2,0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf120">
<mml:math id="m120">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf121">
<mml:math id="m121">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are calculated as the average occurrence of each NSE category (no NSE, mild NSE, severe NSE) across the four actions. That is, for <inline-formula id="inf122">
<mml:math id="m122">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, the frequency is <inline-formula id="inf123">
<mml:math id="m123">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, resulting in <inline-formula id="inf124">
<mml:math id="m124">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1.0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. For <inline-formula id="inf125">
<mml:math id="m125">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>2,0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, the frequency is <inline-formula id="inf126">
<mml:math id="m126">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, yielding <inline-formula id="inf127">
<mml:math id="m127">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0.75</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.25</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Calculating the divergence between <inline-formula id="inf128">
<mml:math id="m128">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf129">
<mml:math id="m129">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> reveals that the prediction was incorrect at <inline-formula id="inf130">
<mml:math id="m130">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and therefore more data is required to align the learned model, and hence <inline-formula id="inf131">
<mml:math id="m131">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> or similar states should be selected for querying. Therefore, the sampling weight of the cluster containing <inline-formula id="inf132">
<mml:math id="m132">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is increased (the region where the NSE model is still uncertain). In the following iteration, critical states are drawn from the reweighted clusters. <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref> outlines our approach for selecting critical states at each learning iteration, with the following three key steps.</p>
<p>
<italic>1. <underline>Clustering states</underline>
</italic>: Since NSEs are typically correlated with specific state features and do not occur at random, we cluster the states <inline-formula id="inf133">
<mml:math id="m133">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> into <inline-formula id="inf134">
<mml:math id="m134">
<mml:mrow>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> clusters so as to group states with similar NSE severity (<xref ref-type="bibr" rid="B28">Lakkaraju et al., 2017</xref>). In our experiments, we use the KMeans clustering algorithm with Jaccard distance to measure the distance between states based on their features. In practice, any clustering algorithm can be used, including manual clustering. The goal is to create meaningful partitions of the state space to guide critical states selection for querying the user.</p>
<p>
<italic>2. <underline>Estimating information gain</underline>
</italic>: We define the information gain of sampling from a cluster <inline-formula id="inf135">
<mml:math id="m135">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, based on the learning gap, as follows:<disp-formula id="e1">
<mml:math id="m136">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>G</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m137">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf136">
<mml:math id="m138">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the set of states sampled for querying from cluster <inline-formula id="inf137">
<mml:math id="m139">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> at iteration <inline-formula id="inf138">
<mml:math id="m140">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf139">
<mml:math id="m141">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf140">
<mml:math id="m142">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the probability of observing NSE category <inline-formula id="inf141">
<mml:math id="m143">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> in state <inline-formula id="inf142">
<mml:math id="m144">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, derived from <inline-formula id="inf143">
<mml:math id="m145">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf144">
<mml:math id="m146">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. This formulation quantifies how much the predicted NSE distribution diverges from the feedback received for each state, providing a principled measure of the expected information gain from querying in a cluster, <inline-formula id="inf145">
<mml:math id="m147">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, as defined in <xref ref-type="disp-formula" rid="e1">Equation 1</xref>.</p>
<p>
<italic>3. <underline>Sampling critical states:</underline>
</italic> At each learning iteration <inline-formula id="inf146">
<mml:math id="m148">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the agent assigns a weight <inline-formula id="inf147">
<mml:math id="m149">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to each cluster <inline-formula id="inf148">
<mml:math id="m150">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, proportional to the new information on NSEs revealed by the most informative feedback format identified at <inline-formula id="inf149">
<mml:math id="m151">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, using <xref ref-type="disp-formula" rid="e2">Equation 2</xref>. Clusters are given equal weights when there is no prior feedback (Line 4). Let <inline-formula id="inf150">
<mml:math id="m152">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the number of critical states to be sampled in every iteration. We sample critical states in batches but they can also be sampled sequentially. When sampling in batches of <inline-formula id="inf151">
<mml:math id="m153">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> states, the number of states <inline-formula id="inf152">
<mml:math id="m154">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to be sampled from each cluster is determined by its assigned weight. At least one state is sampled from each cluster to ensure sufficient information for calculating the information gain for every cluster (Line 5). The agent randomly samples <inline-formula id="inf153">
<mml:math id="m155">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> states from the corresponding cluster and adds them to a set of critical states <inline-formula id="inf154">
<mml:math id="m156">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (Lines 6, 7). If the total number of critical states sampled is less than <inline-formula id="inf155">
<mml:math id="m157">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> due to rounding, then the remaining <inline-formula id="inf156">
<mml:math id="m158">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> states are sampled from the cluster with the highest weight and added to <inline-formula id="inf157">
<mml:math id="m159">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (Lines 9&#x2013;12).</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>Critical States Selection.<list list-type="simple">
<list-item>
<p>
<bold>Require:</bold> <inline-formula id="inf158">
<mml:math id="m160">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: &#x23;critical states; <inline-formula id="inf159">
<mml:math id="m161">
<mml:mrow>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: &#x23;clusters;</p>
</list-item>
<list-item>
<p>&#x2003;1: <inline-formula id="inf160">
<mml:math id="m162">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>&#x2205;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;2: Cluster states into <inline-formula id="inf161">
<mml:math id="m163">
<mml:mrow>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> clusters, <inline-formula id="inf162">
<mml:math id="m164">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;3: <bold>for</bold> each cluster <inline-formula id="inf163">
<mml:math id="m165">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;4: &#x2003;&#x2003;<inline-formula id="inf356">
<mml:math id="m166">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;if&#x2009;no&#x2009;feedback&#x2009;received&#x2009;in&#x2009;any&#x2009;iteration</mml:mtext>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mfrac>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>G</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>G</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;if&#x2009;feedback&#x2009;received</mml:mtext>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;5: &#x2003;&#x2003;<inline-formula id="inf165">
<mml:math id="m167">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo>&#x230b;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;6: &#x2003;&#x2003;Sample <inline-formula id="inf166">
<mml:math id="m168">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> states at random, <inline-formula id="inf167">
<mml:math id="m169">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mtext>Sample</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;7: &#x2003;&#x2003;<inline-formula id="inf168">
<mml:math id="m170">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;8: <bold>end for</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;9: <inline-formula id="inf169">
<mml:math id="m171">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;10: <bold>if</bold> <inline-formula id="inf170">
<mml:math id="m172">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> <bold>then</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;11: &#x2003;&#x2003;<inline-formula id="inf171">
<mml:math id="m173">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>arg</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;12: &#x2003;&#x2003;<inline-formula id="inf172">
<mml:math id="m174">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:mtext>Sample</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;13: <bold>end if</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;14: <bold>return</bold> Set of selected critical states <inline-formula id="inf173">
<mml:math id="m175">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
</list>
</p>
</statement>
</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Feedback format selection</title>
<p>To query in the critical states, <inline-formula id="inf174">
<mml:math id="m176">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, it is important to select a feedback format that not only maximizes the expected information gain about NSEs but also accounts for likelihood and cost of the feedback. The <italic>information gain</italic> of a feedback format <inline-formula id="inf175">
<mml:math id="m177">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> at iteration <inline-formula id="inf176">
<mml:math id="m178">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, for <inline-formula id="inf177">
<mml:math id="m179">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> critical states, is computed as the KL divergence between the observed and predicted NSE severity distributions, <inline-formula id="inf178">
<mml:math id="m180">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf179">
<mml:math id="m181">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e3">
<mml:math id="m182">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="double-struck">I</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="double-struck">I</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where, <inline-formula id="inf180">
<mml:math id="m183">
<mml:mrow>
<mml:mi mathvariant="double-struck">I</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is an indicator function that checks whether the format provided by the human, <inline-formula id="inf181">
<mml:math id="m184">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, matches the requested format <inline-formula id="inf182">
<mml:math id="m185">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. If no feedback is received, the information gain for that format remains unchanged. The following equation is used to select the feedback format <inline-formula id="inf183">
<mml:math id="m186">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, accounting for feedback cost and user preferences:<disp-formula id="e4">
<mml:math id="m187">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:munder>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>C</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>Feedback&#x2009;utility&#x2009;of&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf184">
<mml:math id="m188">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the probability of receiving a feedback in format <inline-formula id="inf185">
<mml:math id="m189">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf186">
<mml:math id="m190">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the feedback cost, determined using the human preference model <inline-formula id="inf187">
<mml:math id="m191">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf188">
<mml:math id="m192">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the current learning iteration, <inline-formula id="inf189">
<mml:math id="m193">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of times feedback in format <inline-formula id="inf190">
<mml:math id="m194">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> was received, and <inline-formula id="inf191">
<mml:math id="m195">
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a small constant for numerical stability. The selected format <inline-formula id="inf192">
<mml:math id="m196">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the most informative feedback format given the agent&#x2019;s current knowledge, balancing exploration (less frequently used formats) and exploitation (formats known to provide high information gain).</p>
<p>
<statement content-type="algorithm" id="Algorithm_2">
<label>Algorithm 2</label>
<p>Feedback Selection for NSE Learning.<list list-type="simple">
<list-item>
<p>
<bold>Require:</bold> B, Querying budget; <inline-formula id="inf193">
<mml:math id="m197">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, Human preference model; <inline-formula id="inf194">
<mml:math id="m198">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: KL divergence threshold</p>
</list-item>
<list-item>
<p>&#x2003;1:<inline-formula id="inf195">
<mml:math id="m199">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf196">
<mml:math id="m200">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf197">
<mml:math id="m201">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;2:Initialize actions in <inline-formula id="inf198">
<mml:math id="m202">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to a random label l &#x2208; <inline-formula id="inf2204"> <mml:math id="m2208"> <mml:mrow> <mml:mrow> <mml:mrow> <mml:mo stretchy="false">{</mml:mo> <mml:mrow> <mml:msub> <mml:mrow> <mml:mi>l</mml:mi> </mml:mrow> <mml:mrow> <mml:mi>a</mml:mi> </mml:mrow> </mml:msub> <mml:mo>,</mml:mo> <mml:msub> <mml:mrow> <mml:mi>l</mml:mi> </mml:mrow> <mml:mrow> <mml:mi>m</mml:mi> </mml:mrow> </mml:msub> <mml:mo>,</mml:mo> <mml:msub> <mml:mrow> <mml:mi>l</mml:mi> </mml:mrow> <mml:mrow> <mml:mi>h</mml:mi> </mml:mrow> </mml:msub> </mml:mrow> <mml:mo stretchy="false">}</mml:mo> </mml:mrow> </mml:mrow> </mml:mrow> </mml:math> </inline-formula> and in <inline-formula id="inf199">
<mml:math id="m203">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to the acceptable label l<sub>a</sub> <inline-formula id="inf202">
<mml:math id="m206">
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>,<inline-formula id="inf203">
<mml:math id="m207">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> RandomNSELabel<inline-formula id="inf204">
<mml:math id="m208">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf205">
<mml:math id="m209">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;3:<bold>while</bold> <inline-formula id="inf206">
<mml:math id="m210">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf207">
<mml:math id="m211">
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;4:&#x2003;Sample critical states using <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>
</p>
</list-item>
<list-item>
<p>&#x2003;5:&#x2003;Query user with feedback format <inline-formula id="inf208">
<mml:math id="m212">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, selected using <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, across sampled <inline-formula id="inf209">
<mml:math id="m213">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;6:&#x2003;<bold>if</bold> feedback received in format <inline-formula id="inf210">
<mml:math id="m214">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> <bold>then</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;7:&#x2003;&#x2003;<inline-formula id="inf211">
<mml:math id="m215">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> Update distribution based on the feedback received in format <inline-formula id="inf212">
<mml:math id="m216">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;8:&#x2003;&#x2003;<inline-formula id="inf213">
<mml:math id="m217">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mo>&#x2190;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> TrainClassifier<inline-formula id="inf214">
<mml:math id="m218">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;9:&#x2003;&#x2003;<inline-formula id="inf215">
<mml:math id="m219">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;10:&#x2003; Update <inline-formula id="inf216">
<mml:math id="m220">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, using <xref ref-type="disp-formula" rid="e3">Equation 3</xref>
</p>
</list-item>
<list-item>
<p>&#x2003;11:&#x2003; <inline-formula id="inf217">
<mml:math id="m221">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;12: <bold>end if</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;13: <inline-formula id="inf218">
<mml:math id="m222">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf219">
<mml:math id="m223">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;14:<bold>end while</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;15:<bold>return</bold> NSE classifier model, <inline-formula id="inf220">
<mml:math id="m224">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>
<xref ref-type="statement" rid="Algorithm_2">Algorithm 2</xref> outlines our feedback format selection approach. Since the agent has no prior knowledge of how the human categorizes NSEs for each state-action pair, the labeling function <inline-formula id="inf221">
<mml:math id="m225">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is instantiated by sampling from a uniform prior over the three NSE labels <inline-formula id="inf222">
<mml:math id="m226">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> for every <inline-formula id="inf223">
<mml:math id="m227">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, while q is initialized assuming all actions are safe <inline-formula id="inf224">
<mml:math id="m228">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> (Line 2). These initial labels are progressively refined as human feedback is received. At each iteration, the agent samples <inline-formula id="inf225">
<mml:math id="m229">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> critical states using <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref> (Line 4), and selects a feedback format <inline-formula id="inf226">
<mml:math id="m230">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> using <xref ref-type="disp-formula" rid="e4">Equation 4</xref>. The agent queries the human for feedback in <inline-formula id="inf227">
<mml:math id="m231">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (Line 5). If the feedback is received (with probability <inline-formula id="inf228">
<mml:math id="m232">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>), the observed NSE labels <inline-formula id="inf229">
<mml:math id="m233">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are updated and an NSE prediction model <inline-formula id="inf230">
<mml:math id="m234">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is trained (Lines 6&#x2013;8). The classifier <inline-formula id="inf231">
<mml:math id="m235">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> predicts the labels for the sampled critical states <inline-formula id="inf232">
<mml:math id="m236">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, yielding <inline-formula id="inf233">
<mml:math id="m237">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. We restrict the prediction to <inline-formula id="inf234">
<mml:math id="m238">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> since these states indicate regions of high uncertainty and contribute to reducing the divergence between the true and learned NSE distributions. Further, restricting predictions to <inline-formula id="inf235">
<mml:math id="m239">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> also reduces computational overhead during iterative querying. <inline-formula id="inf236">
<mml:math id="m240">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is recomputed using <xref ref-type="disp-formula" rid="e3">Equation 3</xref>, and <inline-formula id="inf237">
<mml:math id="m241">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is incremented (Lines 9&#x2013;11). This repeats until either the querying budget is exhausted or the KL divergence between <inline-formula id="inf238">
<mml:math id="m242">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf239">
<mml:math id="m243">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> over all states is within a problem-specific threshold <inline-formula id="inf240">
<mml:math id="m244">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>
<xref ref-type="fig" rid="F5">Figure 5</xref> illustrates the critical states and the most informative feedback formats selected at each iteration in the object delivery task using AFS, demonstrating that feedback utility changes over time, based on the robot&#x2019;s current knowledge.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Feedback utility of each format across iterations. Numbers mark when a state was identified as critical, and circle colors denote the chosen feedback format.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g005.tif">
<alt-text content-type="machine-generated">Bar graph shows feedback utility scores across five learning iterations, comparing six feedback methods: Annotated Approval, Approval, Ranking, Annotated Correction, Correction, and DAM. A robotic arm is positioned over colored squares labeled one to five on a checkered tabletop, illustrating a task setup.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Stopping criteria</title>
<p>Besides guiding the selection of critical states and feedback format, the KL-divergence also serves as an indicator of when to stop querying. The querying phase can be terminated when <inline-formula id="inf241">
<mml:math id="m245">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf242">
<mml:math id="m246">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a problem-specific threshold. When <inline-formula id="inf243">
<mml:math id="m247">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, it indicates that the learned model is a reasonable approximation of the underlying NSE distribution, and therefore the querying can be terminated even if the allotted budget <inline-formula id="inf244">
<mml:math id="m248">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> has not been exhausted. The choice of <inline-formula id="inf245">
<mml:math id="m249">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> provides a trade-off between thorough learning and human effort, and can be tuned based on domain-specific safety requirements.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experiments in simulation</title>
<p>We first evaluate AFS on three simulated domains (<xref ref-type="fig" rid="F6">Figure 6</xref>). Human feedback is simulated by modeling an oracle that selects safer actions with higher probability using a softmax action selection (<xref ref-type="bibr" rid="B16">Ghosal et al., 2023</xref>; <xref ref-type="bibr" rid="B22">Jeon et al., 2020</xref>): the probability of choosing an action <inline-formula id="inf246">
<mml:math id="m250">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> from a set of all safe actions <inline-formula id="inf247">
<mml:math id="m251">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> in state <inline-formula id="inf248">
<mml:math id="m252">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is, <inline-formula id="inf249">
<mml:math id="m253">
<mml:mrow>
<mml:mi>Pr</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Illustrations of evaluation domains. Red box denotes the agent and the goal location is in green. <bold>(a)</bold> Navigation: Unavoidable NSE. <bold>(b)</bold> Vase: Unavoidable NSE. <bold>(c)</bold> Safety-gym Push.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g006.tif">
<alt-text content-type="machine-generated">Panel (a) illustrates a navigation scenario with a vehicle on a track featuring green paths and water puddles. Panel (b) shows a grid with vases obstructing a path where a robot icon is placed at the top. Panel (c) presents a grid with purple circles and an orange puzzle piece, labeled &#x22;Safety-gym Push.&#x22;</alt-text>
</graphic>
</fig>
<p>
<bold>Baselines</bold> (i) <italic>Naive Agent</italic>: The agent naively executes its primary policy without learning about NSEs, providing an upper bound on the NSE penalty incurred. (ii) <italic>Oracle</italic>: The agent has complete knowledge about <inline-formula id="inf250">
<mml:math id="m254">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf251">
<mml:math id="m255">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, providing a lower bound on the NSE penalty incurred. (iii) <italic>Reward Inference with</italic> <inline-formula id="inf252">
<mml:math id="m256">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <italic>Modeling (RI)</italic> (<xref ref-type="bibr" rid="B16">Ghosal et al., 2023</xref>): The agent selects a feedback format that maximizes information gain according to the human&#x2019;s inferred rationality, <inline-formula id="inf253">
<mml:math id="m257">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. (iv) <italic>Cost-Sensitive Approach</italic>: The agent selects a feedback method with the least cost, according to the preference model <inline-formula id="inf254">
<mml:math id="m258">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. (v) <italic>Most-Probable Feedback</italic>: The agent selects a feedback format that the human is most likely to provide, based on <inline-formula id="inf255">
<mml:math id="m259">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. (vi) <italic>Random Critical States</italic>: The agent uses our AFS framework to learn about NSEs, except the critical states are sampled randomly from the entire state space. We use <inline-formula id="inf256">
<mml:math id="m260">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf257">
<mml:math id="m261">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for all our experiments. AFS uses learned <inline-formula id="inf258">
<mml:math id="m262">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>
<bold>Domains, Metrics and Feedback Formats:</bold> We evaluate the performance of various techniques on three domains in simulation (<xref ref-type="fig" rid="F6">Figure 6</xref>): outdoor navigation, vase and safety-gym&#x2019;s push. We optimize costs (negations of rewards) and compare techniques using average NSE penalty and average cost to goal, averaged over 100 trials. For navigation, vase and push, we simulate human feedback. The cost for <inline-formula id="inf259">
<mml:math id="m263">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf260">
<mml:math id="m264">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf261">
<mml:math id="m265">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are 0, <inline-formula id="inf262">
<mml:math id="m266">
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf263">
<mml:math id="m267">
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> respectively.</p>
<p>
<bold>Navigation:</bold> In this ROS-based city environment, the robot optimizes the shortest path to the goal location. A state is represented as <inline-formula id="inf264">
<mml:math id="m268">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, where, <inline-formula id="inf265">
<mml:math id="m269">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf266">
<mml:math id="m270">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are robot coordinates, <inline-formula id="inf267">
<mml:math id="m271">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the surface type (concrete or grass), and <inline-formula id="inf268">
<mml:math id="m272">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates the presence of a puddle. The robot can move in all four directions and each costs <inline-formula id="inf269">
<mml:math id="m273">
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Actions succeed with probability 0.8. Navigating over grass damages the grass and is a mild NSE. Navigating over grass with puddles is a severe NSE. Features used for training are <inline-formula id="inf270">
<mml:math id="m274">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Here, NSEs are unavoidable.</p>
<p>
<bold>Vase:</bold> In this domain, the robot must quickly reach the goal, while avoiding breaking a vase as a side effect (<xref ref-type="bibr" rid="B27">Krakovna et al., 2020</xref>). A state is represented as <inline-formula id="inf271">
<mml:math id="m275">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> where, <inline-formula id="inf272">
<mml:math id="m276">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf273">
<mml:math id="m277">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are robot&#x2019;s coordinates. <inline-formula id="inf274">
<mml:math id="m278">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates the presence of a vase and <inline-formula id="inf275">
<mml:math id="m279">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates if the floor is carpeted. The robot moves in all four directions and each costs <inline-formula id="inf276">
<mml:math id="m280">
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Actions succeed with probability 0.8. Breaking a vase placed on a carpet is a mild NSE and breaking a vase on the hard surface is a severe NSE. <inline-formula id="inf277">
<mml:math id="m281">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are used for training. All instances have unavoidable NSEs.</p>
<p>
<bold>Push:</bold> In this <monospace>safety-gymnasium</monospace> domain, the robot aims to push a box quickly to a goal state (<xref ref-type="bibr" rid="B23">Ji et al., 2023</xref>). Pushing a box on a hazard zone (blue circles) produces NSEs. We modify the domain such that in addition to the existing actions, the agent can also <italic>wrap</italic> the box that costs <inline-formula id="inf278">
<mml:math id="m282">
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Every move action succeeds with probability 0.8, and the wrap action succeeds with probability 1.0. The NSEs can be avoided by pushing a wrapped box. A state is represented as <inline-formula id="inf279">
<mml:math id="m283">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> where, <inline-formula id="inf280">
<mml:math id="m284">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the robot&#x2019;s coordinates, <inline-formula id="inf281">
<mml:math id="m285">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates carrying a box, <inline-formula id="inf282">
<mml:math id="m286">
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates if box is wrapped and <inline-formula id="inf283">
<mml:math id="m287">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes if it is a hazard area. <inline-formula id="inf284">
<mml:math id="m288">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are used for training.</p>
<sec id="s5-1">
<label>5.1</label>
<title>Results and discussion</title>
<p>
<bold>Effect of learning using AFS:</bold> We first examine the benefit of querying using AFS, by comparing the resulting average NSE penalties and the cost for task completion, across domains and query budget. <xref ref-type="fig" rid="F7">Figure 7</xref> shows the average NSE penalties when operating based on an NSE model learned using different querying approaches. Clusters for critical state selection were generated using the KMeans clustering algorithm with <inline-formula id="inf285">
<mml:math id="m289">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for navigation, vase and safety-gym&#x2019;s push domains (<xref ref-type="fig" rid="F7">Figures 7a&#x2013;c</xref>). The results show that our approach consistently performs similar to or better than the baselines.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Average penalty incurred when querying with different feedback selection techniques. <bold>(a)</bold> Navigation: Unavoidable NSE. <bold>(b)</bold> Vase: Unavoidable NSE. <bold>(c)</bold> Safety-gym Push.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g007.tif">
<alt-text content-type="machine-generated">Three line graphs depict the average penalty versus budget for different agents across three tasks: Navigation, Vase, and Safety-gym Push. Each graph shows performance of various methods, including Naive Agent, Most Probable Feedback, Cost-Sensitive Approach, Random Critical States, RI, Oracle, and AFS, with AFS and Oracle achieving the lowest penalties. The graphs have a budget range of 400 to 2500 and penalty values varying according to the task.</alt-text>
</graphic>
</fig>
<p>There is a trade-off between optimizing task completion and mitigating NSEs, especially when NSEs are unavoidable. While some techniques are better at mitigating NSEs, they significantly impact task performance. <xref ref-type="table" rid="T1">Table 1</xref> shows the average cost for task completion at <inline-formula id="inf286">
<mml:math id="m290">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>400</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. <italic>Lower</italic> values are better for both NSEs and task completion cost. While the Naive Agent has a lower cost for task completion, it incurs the highest NSE penalty as it has no knowledge of <inline-formula id="inf287">
<mml:math id="m291">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. RI causes more NSEs, especially when they are unavoidable, as its reward function does not fully model the penalties for mild and severe NSEs. Overall, the results show that our approach consistently mitigates avoidable and unavoidable NSEs, without affecting the task performance substantially.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Average cost and standard error at task completion.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">Navigation: Unavoidable NSE</th>
<th align="center">Vase: Unavoidable NSE</th>
<th align="center">Safety-gym push: Avoidable NSE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Oracle</td>
<td align="center">
<inline-formula id="inf288">
<mml:math id="m292">
<mml:mrow>
<mml:mn>51.37</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2.69</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf289">
<mml:math id="m293">
<mml:mrow>
<mml:mn>54.46</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>6.70</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf290">
<mml:math id="m294">
<mml:mrow>
<mml:mn>44.62</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>9.97</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Naive</td>
<td align="center">
<inline-formula id="inf291">
<mml:math id="m295">
<mml:mrow>
<mml:mn>36.11</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.39</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf292">
<mml:math id="m296">
<mml:mrow>
<mml:mn>36.0</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2.89</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf293">
<mml:math id="m297">
<mml:mrow>
<mml:mn>39.82</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>5.44</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">RI</td>
<td align="center">
<inline-formula id="inf294">
<mml:math id="m298">
<mml:mrow>
<mml:mn>40.10</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.69</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf295">
<mml:math id="m299">
<mml:mrow>
<mml:mn>37.42</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf296">
<mml:math id="m300">
<mml:mrow>
<mml:mn>42.15</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2.44</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">AFS</td>
<td align="center">
<inline-formula id="inf297">
<mml:math id="m301">
<mml:mrow>
<mml:mn>64.8</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2.3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf298">
<mml:math id="m302">
<mml:mrow>
<mml:mn>52.68</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>7.87</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf299">
<mml:math id="m303">
<mml:mrow>
<mml:mn>48.32</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>4.42</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> shows the average penalty when AFS uses KL-divergence (KLD) as the stopping criterion, compared to querying with budget <inline-formula id="inf300">
<mml:math id="m304">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>400</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. For comparison, we also annotate in the plot the querying budget used by AFS with KLD stopping at the time of termination. The results show that despite terminating earlier and using fewer queries, AFS with KLD stopping achieves comparable performance to that of AFS with query budget <inline-formula id="inf301">
<mml:math id="m305">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>400</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, demonstrating the usefulness of KLD as a stopping criterion.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Average penalty incurred when learning with AFS using querying budget <inline-formula id="inf302">
<mml:math id="m306">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>400</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and KL divergence (KLD) as the stopping criterion. The budget utilized by AFS with KLD stopping is annotated in the plot.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g008.tif">
<alt-text content-type="machine-generated">Bar chart showing average penalty comparisons across three tasks: Navigation, Vase, and Safety-gym Push, each with a specific B value. Bars represent &#x22;AFS without KLD Stopping,&#x22; &#x22;AFS with KLD Stopping,&#x22; and &#x22;Oracle.&#x22; Error bars indicate variability.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s6">
<label>6</label>
<title>In-person user study with a physical robot arm</title>
<p>We conducted an in-person study with a Kinova Gen3 7DoF arm (<xref ref-type="bibr" rid="B25">Kinova, 2025</xref>) tasked with delivering two objects&#x2014;an orange toy and a white box&#x2014;across a workspace containing items of varying fragility (<xref ref-type="fig" rid="F9">Figure 9</xref>). This setup involves users providing both interface-based and kinesthetic feedback to the robot. The study was approved by Oregon State University IRB. Participants were compensated with a <inline-formula id="inf303">
<mml:math id="m307">
<mml:mrow>
<mml:mi>$</mml:mi>
<mml:mn>15</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Amazon gift card for their participation in the study.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Task setup for the human subject study. <bold>(a)</bold> Physical setup of the task for human subjects study; <bold>(b)</bold> Replication of the physical setup using PyBullet. A dialog box corresponding to the current feedback format is shown for every query.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g009.tif">
<alt-text content-type="machine-generated">Side-by-side comparison showing a physical robotic arm setup with assorted objects for a user study on the left, and a simulated version of the same workspace featuring colored grid tiles, direction labels, and a digital robotic arm on the right.</alt-text>
</graphic>
</fig>
<p>This user study had three goals: (1) to measure our approach&#x2019;s effectiveness in reducing NSEs for a real-world task, (2) to understand how users perceive the adaptivity, workload and competence of the robot operating in the AFS framework, and (3) to evaluate the extent to which AFS captures user preferences in practice, while ensuring maximum information gain during the learning process.</p>
<sec id="s6-1">
<label>6.1</label>
<title>Methods</title>
<sec id="s6-1-1">
<label>6.1.1</label>
<title>Participants</title>
<p>We conducted a pilot study in simulation to inform our overall design, the details of which are discussed under Section 2 in the <xref ref-type="sec" rid="s16">Supplementary Material</xref>. We conducted another pilot study with <inline-formula id="inf304">
<mml:math id="m308">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> participants to evaluate the study setup with the Kinova arm. In particular, this pilot study assessed the clarity of instructions, survey wording, and feasibility of the task design in the object delivery task of the Kinova arm. Based on the participant feedback, we simplified the survey questions and included example trajectories that demonstrated safe and NSE-causing behaviors. For the main study, we recruited <inline-formula id="inf305">
<mml:math id="m309">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>30</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> participants with basic computer literacy from the <italic>general population</italic> through university mailing lists and public forums. Participants were aged 18&#x2013;72 years <inline-formula id="inf306">
<mml:math id="m310">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>32.10</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>13.11</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, with <inline-formula id="inf307">
<mml:math id="m311">
<mml:mrow>
<mml:mn>53.3</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> men and <inline-formula id="inf308">
<mml:math id="m312">
<mml:mrow>
<mml:mn>46.7</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> women. Participants reported varied prior experience with robots: <inline-formula id="inf309">
<mml:math id="m313">
<mml:mrow>
<mml:mn>73.3</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> had general awareness of similar robot products, <inline-formula id="inf310">
<mml:math id="m314">
<mml:mrow>
<mml:mn>6.7</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> had researched or investigated robots, <inline-formula id="inf311">
<mml:math id="m315">
<mml:mrow>
<mml:mn>3.3</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> had interacted through product demos, and <inline-formula id="inf312">
<mml:math id="m316">
<mml:mrow>
<mml:mn>13.3</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> had no prior awareness of similar products.</p>
</sec>
<sec id="s6-1-2">
<label>6.1.2</label>
<title>Robotic system setup</title>
<p>The Kinova Gen3 arm was equipped with a joint space compliant controller which allowed participants to physically move the joints of the arm through space with gravity compensation when needed. Additionally, a task-space planner allowed for navigation to discrete grid positions for both feedback queries and policy execution (<xref ref-type="bibr" rid="B25">Kinova, 2025</xref>). <xref ref-type="fig" rid="F9">Figure 9a</xref> shows the physical workspace and the two delivery objects, while <xref ref-type="fig" rid="F9">Figure 9b</xref> shows the corresponding PyBullet simulation used for visualization during GUI-based feedback. A dialog box was displayed to prompt the participant whenever feedback was queried<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>.</p>
</sec>
<sec id="s6-1-3">
<label>6.1.3</label>
<title>Interaction premise</title>
<p>The interaction simulated an assistive robot delivering objects to their designated bins. Specifically, the task required the Kinova arm to deliver an orange plush toy and a rigid white box to their respective bins while avoiding collision with surrounding obstacles of different fragility. Collisions with fragile obstacles (e.g., a glass vase) during delivery of the plush toy were considered a mild NSE. Collisions involving the white rigid box were severe NSEs if with a fragile object and were mild NSEs if with a non-fragile object. All other scenarios were considered safe. The workspace was discretized into a grid of cells marked with tape on the tabletop and mirrored in the GUI. Each cell represented a state corresponding to a possible end-effector position.</p>
</sec>
<sec id="s6-1-4">
<label>6.1.4</label>
<title>Study design</title>
<p>The robot&#x2019;s state space was discretized and represented as <inline-formula id="inf313">
<mml:math id="m317">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf314">
<mml:math id="m318">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denote the end-effector position, <inline-formula id="inf315">
<mml:math id="m319">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf316">
<mml:math id="m320">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> indicate the presence of either orange plush toy or white rigid box in the end effector, <inline-formula id="inf317">
<mml:math id="m321">
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates the presence of an obstacle, <inline-formula id="inf318">
<mml:math id="m322">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates obstacle fragility, and <inline-formula id="inf319">
<mml:math id="m323">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf320">
<mml:math id="m324">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> indicate whether the objects were delivered to their corresponding goal locations (i.e., the orange plush toy in the white bin and the white box in the wicker bin).</p>
<p>Participants interacted with the robot through <italic>four</italic> feedback formats, <inline-formula id="inf321">
<mml:math id="m325">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mtext>App,&#x2009;Corr,&#x2009;Rank,&#x2009;DAM</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, during both the training and main experience phases. Depending on the feedback format, the Kinova arm executed the queried action in the physical workspace or displayed a simulation of the action in the graphical user interface (GUI). Interaction across the four feedback formats is described below.<list list-type="order">
<list-item>
<p>
<bold>Approval:</bold> The robot executed a single action in simulation, and participants indicated whether it was safe by selecting &#x201c;yes&#x201d; or &#x201c;no&#x201d; in the GUI.</p>
</list-item>
<list-item>
<p>
<bold>Correction:</bold> The robot first executed the action prescribed by its policy in simulation. If the action in simulation was deemed unsafe by the participant, the robot in the physical setup moved to the queried location. Participants then corrected the robot by physically moving the robot arm to demonstrate a safe alternative action.</p>
</list-item>
<list-item>
<p>
<bold>Demo-Action Mismatch:</bold> The robot first physically moved its arm to a specific end-effector position in the workspace. Participants then provided feedback by guiding the arm to a safe position, thereby demonstrating the safe action. The robot compared the action given by its policy to the demonstrated action. If the robot&#x2019;s action and the demonstrated action did not match, then the robot&#x2019;s action was considered unsafe.</p>
</list-item>
<list-item>
<p>
<bold>Ranking:</bold> Simulation clips of two actions selected at random in a given state were presented in GUI. Participants compared the two candidate actions and selected which was safer. If both actions were judged equally safe or unsafe, either option could be chosen.</p>
</list-item>
</list>
</p>
<p>Each participant experienced four learning conditions in a within-subjects, counterbalanced design:<list list-type="order">
<list-item>
<p>The baseline RI approach proposed in <xref ref-type="bibr" rid="B16">Ghosal et al. (2023)</xref>,</p>
</list-item>
<list-item>
<p>AFS with random <inline-formula id="inf322">
<mml:math id="m326">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where critical states are randomly selected,</p>
</list-item>
<list-item>
<p>AFS with a fixed feedback format (DAM) for querying, consistent with prior works that rely primarily on demonstrations, and</p>
</list-item>
<list-item>
<p>The proposed AFS approach, where both the feedback format and the critical states are selected to maximize information gain.</p>
</list-item>
</list>
</p>
<p>Each condition is a distinct feedback query selection strategy controlling how the robot queried participants during learning. These conditions are the independent variables. The dependent measures include NSE occurrences, their severity, perceived workload, trust, competence and user alignment.</p>
</sec>
<sec id="s6-1-5">
<label>6.1.5</label>
<title>Hypotheses</title>
<p>We test the following hypotheses in the in-person study. These hypotheses were derived from trends observed in the experiments and human subjects study in simulation (Section 5 and Section 2 in the <xref ref-type="sec" rid="s16">Supplementary Material</xref>).</p>
<p>
<bold>H1:</bold> <italic>Robots learning using AFS will have fewer NSEs in comparison to the baselines.</italic>
</p>
<p>This hypothesis is derived from the results of our experiments on simulated domains (<xref ref-type="fig" rid="F7">Figure 7</xref>) where AFS consistently reduced NSEs while completing the assigned task. We hypothesize that this trend extends to physical human-robot interactions.</p>
<p>
<bold>H2:</bold> <italic>AFS will achieve comparable or better performance compared to the baselines, with a lower perceived workload for the users.</italic>
</p>
<p>The results on simulated domains (<xref ref-type="fig" rid="F8">Figure 8</xref>) show that AFS achieved better or comparable performance to the baselines, using fewer feedback queries. While the in-person user study requires relatively greater physical and cognitive effort, we expect the advantage of the sample efficiency to persist and investigate whether it translates to reduced perceived workload.</p>
<p>
<bold>H3:</bold> <italic>Participants will report AFS as more trustworthy, competent, and aligned with user expectations, in comparison to the baselines.</italic>
</p>
<p>In the human subjects simulation study (<xref ref-type="sec" rid="s16">Supplementary Table S2</xref>), participants reported that AFS selected intelligent queries, targeted critical states, and improved the agent&#x2019;s performance, reflecting indicators of trust, competence and user alignment. We hypothesize that this trend extends to physical settings as well.</p>
<p>Hypotheses <bold>H1</bold> and <bold>H2</bold> explore trends identified in simulation and are therefore confirmatory. Hypothesis <bold>H3</bold> builds on the perception measures used in the human subjects study in simulation, and is hence treated as an extended confirmatory hypothesis.</p>
</sec>
<sec id="s6-1-6">
<label>6.1.6</label>
<title>Procedure</title>
<p>Each study session lasted approximately 1 hour and followed three phases.</p>
<sec id="s6-1-6-1">
<label>6.1.6.1</label>
<title>Training</title>
<p>Participants were first introduced to the task objective, workspace, and the four feedback formats. For each format, they provided feedback on four sample queries to practice both GUI-based and kinesthetic interactions. After completing each format, the participants rated the following: (i) probability of responding to a query in that format, <inline-formula id="inf323">
<mml:math id="m327">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, (ii) perceived cost or effort required to provide feedback, <inline-formula id="inf324">
<mml:math id="m328">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and (iii) the overall task workload. This phase helped establish measures like feedback likelihood, perceived effort, and workload.</p>
</sec>
<sec id="s6-1-6-2">
<label>6.1.6.2</label>
<title>Main experience</title>
<p>Following training, participants completed the four learning conditions corresponding to different approaches under evaluation. In each condition, the participants provided feedback to train the robot to avoid collision while performing the object-delivery task. Depending on the feedback format selected by the querying strategy, participants either evaluated short simulation clips on the GUI or physically guided the robotic arm. At the end of each condition, the robot executed its learned policy based on its learning under that condition. The participants then observed its performance and completed a brief post-condition questionnaire assessing workload, trust, perceived competence, and user-alignment.</p>
</sec>
<sec id="s6-1-6-3">
<label>6.1.6.3</label>
<title>Closing</title>
<p>At the end of the study, participants compared the four learning approaches in terms of trade-offs between learning speed and safety. Participants reported their preferences on providing feedback through multiple formats versus relying on a single feedback format. These responses offered qualitative insight into AFS&#x2019;s practicality and user acceptance.</p>
</sec>
</sec>
<sec id="s6-1-7">
<label>6.1.7</label>
<title>Measures</title>
<p>We collected both quantitative and qualitative measures. The quantitative measure captured task-level performance through the frequency and the severity of NSEs (mild and severe). Qualitative measures captured participants&#x2019; perceptions of the following.<list list-type="order">
<list-item>
<p>
<bold>Workload:</bold> Participants&#x2019; perceived workload across the feedback formats and learning conditions were measured using the NASA Task Load Index (NASA TLX) (<xref ref-type="bibr" rid="B18">Hart and Staveland, 1988</xref>). The questionnaire scales were transformed to seven-point subscales ranging from &#x201c;Very Low&#x201d; (1) to &#x201c;Very High&#x201d; (7). Responses were collected during the training phase and after each condition in the main experience phase.</p>
</list-item>
<list-item>
<p>
<bold>Robot Attributes:</bold> Perceived robot attributes, like competence, warmth and discomfort, were measured using the nine-point Robotic Social Attributes Scale (RoSAS) (<xref ref-type="bibr" rid="B10">Carpinella et al., 2017</xref>), ranging from &#x201c;Strongly Disagree&#x201d; (1) to &#x201c;Strongly Agree&#x201d; (9). Participants completed this questionnaire after each learning condition.</p>
</list-item>
<list-item>
<p>
<bold>Trust:</bold> A custom 10-point trust scale <inline-formula id="inf325">
<mml:math id="m329">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mi>%</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> was used to measure participants&#x2019; confidence in the robot&#x2019;s ability to act safely under each learning condition. Participants rated their trust both before and after the robot&#x2019;s training phase to capture changes in its learning performance.</p>
</list-item>
<list-item>
<p>
<bold>User Alignment:</bold> Participants&#x2019; perception of user alignment was assessed using a custom seven-point Likert scale ranging from &#x201c;Strongly Disagree&#x201d; (1) to &#x201c;Strongly Agree&#x201d; (7). Participants rated (i) how well the critical states queried by the robot aligned with their own assessment of which states were important for learning, and (ii) how well the feedback formats chosen across conditions matched their personal feedback preferences. Higher ratings indicated stronger perceived alignment between the robot&#x2019;s querying strategy and the participants&#x2019; expectations.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s6-1-8">
<label>6.1.8</label>
<title>Analysis</title>
<p>Survey responses were compiled into cumulative RoSAS (competence, warmth, discomfort) and NASA-TLX workload scores. A repeated-measures ANOVA (rANOVA) tested for significant differences across learning conditions; we report the <inline-formula id="inf326">
<mml:math id="m330">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-statistic, <inline-formula id="inf327">
<mml:math id="m331">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-value and effect size as generalized eta-squared <inline-formula id="inf328">
<mml:math id="m332">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. When effects were significant, Tukey&#x2019;s post-hoc tests identified pairwise differences. All results are reported with means (M), standard errors (SE), and <inline-formula id="inf329">
<mml:math id="m333">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-values.</p>
</sec>
</sec>
<sec id="s6-2">
<label>6.2</label>
<title>Results</title>
<p>We evaluate hypotheses <bold>H1-H3</bold> using both objective and subjective measures. Data from all 30 participants were included in the analysis, as all sessions were completed successfully.</p>
<sec id="s6-2-1">
<label>6.2.1</label>
<title>Effectiveness of AFS in mitigating NSEs (H1)</title>
<p>
<xref ref-type="fig" rid="F10">Figure 10a</xref> shows the average penalty incurred under each condition. The AFS approach incurred the least NSE penalty <inline-formula id="inf330">
<mml:math id="m334">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.83</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.21</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, substantially lower than AFS with random <inline-formula id="inf331">
<mml:math id="m335">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf332">
<mml:math id="m336">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>11.55</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.57</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and AFS with a fixed feedback format <inline-formula id="inf333">
<mml:math id="m337">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10.50</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.37</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. The RI baseline incurred higher penalties <inline-formula id="inf334">
<mml:math id="m338">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5.00</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.00</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> compared to AFS. These results confirm hypothesis <bold>H1</bold> and demonstrate that adaptively selecting both critical states and feedback formats reduced unsafe behaviors more effectively than random or fixed querying strategies.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Results from the user study on the Kinova 7DoF arm. <bold>(a)</bold> Average penalty incurred across methods in the human subjects study. <bold>(b)</bold> NASA-TLX workload across the four conditions.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g010.tif">
<alt-text content-type="machine-generated">(a) Bar chart showing average penalty for four methods: RI, AFS with Random &#x3A9;, AFS with Fixed Format, and AFS (Ours). Penalties vary, with AFS (Ours) having the lowest. (b) Box plot depicting NASA-TLX workload ratings across the same methods. A significant difference exists between RI and AFS with Random &#x3A9;, indicated by p &#x3C; 0.05.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s6-2-2">
<label>6.2.2</label>
<title>Learning efficiency and workload (H2)</title>
<p>We first compare the perceived workload across different feedback formats, followed by the results across learning conditions. Demonstration is the most widely used feedback format in existing works but was perceived as the most demanding (<xref ref-type="fig" rid="F11">Figure 11c</xref>). While corrections offered a corrective action in addition to disapproving the agent&#x2019;s action, they also imposed substantial effort on the users. Approval required the least workload but conveyed limited information. A repeated-measures ANOVA revealed a significant effect of feedback format on perceived workload, <inline-formula id="inf335">
<mml:math id="m339">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,87</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.33</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.023</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.046</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Post hoc comparisons indicated that Approval <inline-formula id="inf336">
<mml:math id="m340">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.11</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.12</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> imposed significantly lower workload <inline-formula id="inf337">
<mml:math id="m341">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.026</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> than Demo-Action Mismatch <inline-formula id="inf338">
<mml:math id="m342">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.62</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.19</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, while no other pairwise differences reached significance. This trade-off underscores the need for an adaptive selection strategy to balance informativeness with user effort.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>User study results. <bold>(a,b)</bold> RoSAS competence and NASA Task-Load across the four conditions in the main study; <bold>(c)</bold> NASA Task-Load across feedback formats.</p>
</caption>
<graphic xlink:href="frobt-12-1734564-g011.tif">
<alt-text content-type="machine-generated">Three box plots show different data analyses. (a) RoSAS competence across four conditions with similar ratings mostly above five. (b) Participants' trust perception before and after learning; trust decreases post-learning across the four conditions. (c) NASA-TLX scores across feedback formats; Rank and Corr formats show lower ratings with statistically significant differences noted.</alt-text>
</graphic>
</fig>
<p>The rANOVA analysis across the four learning conditions further revealed a significant effect in the NASA-TLX workload ratings <inline-formula id="inf339">
<mml:math id="m343">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,87</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.73</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.014</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.030</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Among the four conditions, AFS achieved one of the lowest perceived workload ratings <inline-formula id="inf340">
<mml:math id="m344">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.34</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.12</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, comparable to AFS with random <inline-formula id="inf341">
<mml:math id="m345">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf342">
<mml:math id="m346">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.26</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.15</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and lower than both AFS with fixed format <inline-formula id="inf343">
<mml:math id="m347">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.56</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.19</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and RI <inline-formula id="inf344">
<mml:math id="m348">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.64</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.19</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Tukey post-hoc tests showed that AFS with random <inline-formula id="inf345">
<mml:math id="m349">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> imposed a significantly lower workload than RI <inline-formula id="inf346">
<mml:math id="m350">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.033</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Overall, these results support <bold>H2</bold>, indicating that adaptively selecting queries helps reduce perceived workload relative to the baselines (<xref ref-type="fig" rid="F10">Figure 10b</xref>).</p>
</sec>
<sec id="s6-2-3">
<label>6.2.3</label>
<title>Trust, competence, and preference alignment (H3)</title>
<p>Participants&#x2019; ratings of the robot&#x2019;s ability to act safely increased after learning with AFS, as shown in <xref ref-type="fig" rid="F11">Figure 11b</xref>. A significant effect was also found for perceived robot competence <inline-formula id="inf347">
<mml:math id="m351">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,87</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10.6</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.001</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.082</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="fig" rid="F11">Figure 11a</xref>). AFS was rated highest <inline-formula id="inf348">
<mml:math id="m352">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>7.04</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.32</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, significantly greater than AFS with random <inline-formula id="inf349">
<mml:math id="m353">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf350">
<mml:math id="m354">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5.88</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.32</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.002</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and AFS with fixed format <inline-formula id="inf351">
<mml:math id="m355">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5.88</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.30</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.001</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, while comparable to RI <inline-formula id="inf352">
<mml:math id="m356">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6.68</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.32</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. These results support <bold>H3</bold>&#x2014;AFS was perceived as more competent and trustworthy compared to the baselines.</p>
<p>Descriptive analyses of user alignment on state criticality and feedback alignment ratings indicated consistent trends across participants. While differences between conditions were not statistically significant <inline-formula id="inf353">
<mml:math id="m357">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, AFS consistently received higher ratings for feedback alignment <inline-formula id="inf354">
<mml:math id="m358">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.79</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.42</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> relative to state criticality <inline-formula id="inf355">
<mml:math id="m359">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.14</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.40</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, suggesting that participants found AFS&#x2019;s query selections relevant and aligned with their preferences. Participants (both those aware and unaware of similar robotic systems) perceived AFS&#x2019;s queries as critical for learning and well-aligned with their feedback preferences. Participants with prior research experience rated state criticality and format alignment comparably, indicating confidence in the adaptivity of AFS&#x2019;s querying process.</p>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s7">
<label>7</label>
<title>Discussion</title>
<p>Our experiments followed an increasingly realistic progression in design. In the experiments in simulation with both avoidable and unavoidable NSEs, AFS incurred lower penalties and overall costs compared to the baselines, demonstrating its ability to balance task performance with safety. The results of our pilot study, where users interacted with a simulated agent, showed that AFS effectively learns the participant&#x2019;s feedback preference model and uses it to select formats aligned with user expectations. Finally, the in-person user study with the Kinova arm showed the practicality of using AFS in real-world settings, achieving favorable ratings on trust, workload, and user-preference alignment. These findings support our three hypotheses regarding the performance of AFS: (H1) it reduces unsafe behaviors more effectively than the baselines, (H2) it improves learning efficiency while reducing user workload, and (H3) it is perceived as more trustworthy and competent. The results collectively highlight that adaptively selecting both the query format and the states to pose the queries to the user enhances learning efficiency and reduces user effort.</p>
<p>Beyond confirming these hypotheses, the findings provide important design implications for human-in-the-loop learning systems. By modeling the trade-off between informativeness and effort, AFS offers a framework to balance user workload with the need for high-quality feedback. The learned feedback preference model allows the agent to adaptively select querying formats while minimizing human effort. Using KL-divergence as a stopping criterion further enables adaptive termination of the querying process. This overcomes the problem of determining the &#x201c;right&#x201d; querying budget for a problem, and shows that AFS enables efficient learning while minimizing redundant human feedback. These design principles can inform the development of interactive systems that adapt query format and frequency based on the agent&#x2019;s current knowledge and user feedback preferences. Overall, the results show that AFS (1) consistently outperforms the baselines across different evaluation settings, and (2) can be effectively deployed in real-world human-robot interaction scenarios.</p>
<p>A key strength of this work lies in its extensive evaluation, from simulation to real robot studies, supporting AFS&#x2019;s robustness and practicality. One limitation, however, is that the current evaluation focuses on discrete environments. Extending AFS to continuous domains introduces challenges such as identifying critical states and estimating divergence-based information gain in high-dimensional spaces. While gathering feedback at the trajectory-level is relatively easier in continuous settings, gathering state-level feedback, which is the focus of this work, is challenging. These challenges stem from the need for scalable state representations and efficient sampling strategies, which will be a focus for future work.</p>
</sec>
<sec id="s8">
<label>8</label>
<title>Conclusion and future work</title>
<p>The proposed Adaptive Feedback Selection (AFS) facilitates querying a human in different formats in different regions of the state space, to effectively learn a reward function. Our approach uses information gain to identify critical states for querying, and the most informative feedback format to query in these states, while accounting for the cost and uncertainty of receiving feedback in each format. Our empirical evaluations using four domains in simulation and a human subjects study in simulation demonstrate the effectiveness and sample efficiency of our approach in mitigating avoidable and unavoidable negative side effects (NSEs). The subsequent in-person user study with a Kinova Gen3 7DoF arm further validates these findings, showing that AFS not only improves NSE avoidance but also enhances user trust, competence perception, and user-alignment. While AFS assumes that human feedback reflects a true underlying notion of safety, biased feedback can misguide the robot and lead to unintended NSEs. Understanding when such biases arise and how to correct for them remains an open challenge. Extending AFS with bias-aware inference mechanisms is a promising future direction. Future work will also focus on extending AFS to continuous state and action spaces, strengthening AFS&#x2019;s applicability to complex, safety-critical domains where user-aware interaction is essential.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s9">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="s10">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Human Research Protection Program and Institutional Review Board, Oregon State University. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p>
</sec>
<sec sec-type="author-contributions" id="s11">
<title>Author contributions</title>
<p>YA: Writing &#x2013; review and editing, Investigation, Data curation, Methodology, Conceptualization, Writing &#x2013; original draft, Visualization. NN: Writing &#x2013; review and editing, Writing &#x2013; original draft, Data curation. KS: Writing &#x2013; review and editing, Data curation. NF: Resources, Writing &#x2013; review and editing, Supervision. SS: Supervision, Funding acquisition, Resources, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="COI-statement" id="s13">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s14">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s15">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s16">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2025.1734564/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2025.1734564/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1350293/overview">Chao Zeng</ext-link>, University of Liverpool, United Kingdom</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/444456/overview">Pasqualino Sirignano</ext-link>, Sapienza University of Rome, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2947013/overview">Chuanfei Hu</ext-link>, Southeast University, China</p>
</fn>
</fn-group>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>See Section 3.1 in the <xref ref-type="sec" rid="s16">Supplementary Materials</xref> for details on the dialog box and examples for each feedback format</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>A. Y.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Apprenticeship learning <italic>via</italic> inverse reinforcement learning</article-title>,&#x201d; in <source>Proceedings of the twenty-first international conference on machine learning (ICML)</source>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Amodei</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Olah</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Steinhardt</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Christiano</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Man&#xe9;</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Concrete problems in AI safety</article-title>. <source>arXiv Preprint arXiv:1606.06565</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1606.06565</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>B&#xe4;rmann</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kartmann</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Peller-Konrad</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Niehues</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Waibel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Asfour</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Incremental learning of humanoid robot behavior from natural interaction and large language models</article-title>. <source>Front. Robotics AI</source> <volume>11</volume>, <fpage>1455375</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2024.1455375</pub-id>
<pub-id pub-id-type="pmid">39449715</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beierling</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beierling</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Vollmer</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>The power of combined modalities in interactive robot learning</article-title>. <source>Front. Robotics AI</source> <volume>12</volume>, <fpage>1598968</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2025.1598968</pub-id>
<pub-id pub-id-type="pmid">40747445</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>B&#x131;y&#x131;k</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Losey</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Palan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Landolfi</surname>
<given-names>N. C.</given-names>
</name>
<name>
<surname>Shevchuk</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sadigh</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Learning reward functions from diverse sources of human feedback: optimally integrating demonstrations and preferences</article-title>. <source>Int. J. Robotics Res. (IJRR)</source> <volume>41</volume>, <fpage>45</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1177/02783649211041652</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Brown</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Risk-aware active inverse reinforcement learning</article-title>,&#x201d; <volume>87</volume>. <publisher-name>Conference on Robot Learning</publisher-name>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Brown</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Coleman</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020a</year>). &#x201c;<article-title>Safe imitation learning <italic>via</italic> fast Bayesian reward inference from preferences</article-title>,&#x201d; in <source>
<italic>International conference on machine learning (ICML)</italic> (PMLR)</source>.</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brown</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Petrik</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020b</year>). <article-title>Bayesian robust optimization for imitation learning</article-title>. <source>Adv. Neural Inf. Process. Syst. (NeurIPS)</source>. <pub-id pub-id-type="doi">10.5555/3495724.3495933</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Candon</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tsoi</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>V&#xe1;zquez</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Nonverbal human signals can help autonomous agents infer human preferences for their behavior</article-title>,&#x201d; in <source>Proceedings of the international conference on autonomous agents and multiagent systems</source>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Carpinella</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Wyman</surname>
<given-names>A. B.</given-names>
</name>
<name>
<surname>Perez</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Stroessner</surname>
<given-names>S. J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>The robotic social attributes scale (rosas): development and validation</article-title>,&#x201d; in <source>12th ACM/IEEE international conference on human robot interaction (HRI)</source>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Active reward learning from critiques</article-title>,&#x201d; in <source>IEEE international conference on robotics and automation (ICRA)</source>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Koppol</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Admoni</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Simmons</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Steinfeld</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021a</year>). &#x201c;<article-title>Understanding the relationship between interactions and outcomes in human-in-the-loop machine learning</article-title>,&#x201d; in <source>International joint conference on artificial intelligence (IJCAI)</source>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Knox</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Allievi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021b</year>). &#x201c;<article-title>The empathic framework for task learning from implicit human feedback</article-title>,&#x201d; in <source>Conference on robot learning (CoRL)</source>.</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Karamcheti</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Palleti</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shivakumar</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Sadigh</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>&#x2018;No, to the right&#x2019; &#x2013; online language corrections for robotic manipulation <italic>via</italic> shared autonomy</article-title>,&#x201d; in <source>Proceedings of ACM/IEEE conference on human robot interaction (HRI)</source>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Kaufmann</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>H&#xfc;llermeier</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). &#x201c;<article-title>Duo: diverse, uncertain, on-policy query generation and selection for reinforcement learning from human feedback</article-title>,&#x201d; in <source>Proceedings of the AAAI conference on artificial intelligence (AAAI)</source>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ghosal</surname>
<given-names>G. R.</given-names>
</name>
<name>
<surname>Zurek</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brown</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A. D.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>The effect of modeling human rationality level on learning rewards from multiple feedback types</article-title>,&#x201d; in <source>Proceedings of the AAAI conference on artificial intelligence (AAAI)</source>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hadfield Menell</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Milli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Inverse reward design</article-title>. <source>Adv. Neural Inf. Process. Syst. (NeurIPS)</source>. <pub-id pub-id-type="doi">10.5555/3295222.3295421</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hart</surname>
<given-names>S. G.</given-names>
</name>
<name>
<surname>Staveland</surname>
<given-names>L. E.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>Development of NASA-TLX (task load index): results of empirical and theoretical research</article-title>. <source>Adv. Psychology</source>. <pub-id pub-id-type="doi">10.1016/j.ecns.2024.101607</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Hassan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chung</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Alikhani</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>Coherence-driven multimodal safety dialogue with active learning for embodied agents</article-title>,&#x201d; in <source>Proceedings of the 24th international conference on autonomous agents and multiagent systems (AAMAS)</source>.</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Aronson</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Short</surname>
<given-names>E. S.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Modeling variation in human feedback with user inputs: an exploratory methodology</article-title>,&#x201d; in <source>Proceedings of ACM/IEEE international conference on human robot interaction (HRI)</source>.</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ibarz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Leike</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pohlen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Irving</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Legg</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Amodei</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Reward learning from human preferences and demonstrations in atari</article-title>. <source>Adv. Neural Inf. Process. Syst. (NeurIPS)</source>. <pub-id pub-id-type="doi">10.5555/3327757.3327897</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jeon</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Milli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Reward rational (implicit) choice: a unifying formalism for reward learning</article-title>. <source>Adv. Neural Inf. Process. Syst. (NeurIPS)</source>. <pub-id pub-id-type="doi">10.5555/3495724.3496095</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ji</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Safety gymnasium: a unified safe reinforcement learning benchmark</article-title>,&#x201d; in <source>Thirty-seventh conference on neural information processing systems datasets and benchmarks track (NeurIPS)</source>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Seo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Guide your agent with adaptive multimodal rewards</article-title>,&#x201d; in <source>Thirty-seventh conference on neural information processing systems</source>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<collab>Kinova</collab> (<year>2025</year>). <article-title>Kinova gen3 ultra lightweight robot</article-title>.</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krakovna</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Orseau</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Martic</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Legg</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Measuring and avoiding side effects using relative reachability</article-title>. <source>arXiv Preprint arXiv:1806.01186</source>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krakovna</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Orseau</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ngo</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Martic</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Legg</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Avoiding side effects by considering future tasks</article-title>. <source>Adv. Neural Inf. Process. Syst. (NeurIPS)</source>. <pub-id pub-id-type="doi">10.5555/3495724.3497324</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lakkaraju</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kamar</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Caruana</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Horvitz</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Identifying unknown unknowns in the open world: representations and policies for guided exploration</article-title>,&#x201d; in <source>Proceedings of the AAAI conference on artificial intelligence (AAAI)</source>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Safe reinforcement learning with free-form natural language constraints and pre-trained language models</article-title>,&#x201d; in <source>The 23rd international conference on Autonomous Agents and Multi-Agent Systems (AAMAS)</source>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Najar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chetouani</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Reinforcement learning with human advice: a survey</article-title>. <source>Front. Robotics AI</source> <volume>8</volume>, <fpage>8</fpage>&#x2013;<lpage>2021</lpage>. <pub-id pub-id-type="doi">10.3389/frobt.2021.584075</pub-id>
<pub-id pub-id-type="pmid">34141726</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ng</surname>
<given-names>A. Y.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2000</year>). &#x201c;<article-title>Algorithms for inverse reinforcement learning</article-title>,&#x201d; in <source>Proceedings of the seventeenth international conference on machine learning (ICML)</source>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ramachandran</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Amir</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2007</year>). &#x201c;<article-title>Bayesian inverse reinforcement learning</article-title>,&#x201d; in <source>Proceedings of the 20th international joint conference on artificial intelligence (IJCAI)</source>.</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramakrishnan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kamar</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Dey</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Horvitz</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Blind spot detection for safe sim-to-real transfer</article-title>. <source>J. Artif. Intell. Res. (JAIR)</source> <volume>67</volume>, <fpage>191</fpage>&#x2013;<lpage>234</lpage>. <pub-id pub-id-type="doi">10.1613/jair.1.11436</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gordon</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Bagnell</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>A reduction of imitation learning and structured prediction to no-regret online learning</article-title>,&#x201d; in <source>Proceedings of the fourteenth international conference on artificial intelligence and statistics, (AISTATS)</source>.</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Saisubramanian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zilberstein</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Mitigating negative side effects <italic>via</italic> environment shaping</article-title>,&#x201d; in <source>International conference on Autonomous Agents and Multiagent Systems (AAMAS)</source>.</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Saisubramanian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kamar</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zilberstein</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021a</year>). &#x201c;<article-title>A multiobjective approach to mitigate negative side effects</article-title>,&#x201d; in <source>Proceedings of the twenty-ninth international joint conference on artificial intelligence</source> (<publisher-name>International Joint Conferences on Artificial Intelligence Organization</publisher-name>).</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Saisubramanian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Roberts</surname>
<given-names>S. C.</given-names>
</name>
<name>
<surname>Zilberstein</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021b</year>). &#x201c;<article-title>Understanding user attitudes towards negative side effects of AI systems</article-title>,&#x201d; in <source>Extended abstracts of the 2021 conference on human factors in computing systems (CHI)</source>.</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saisubramanian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kamar</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zilberstein</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Avoiding negative side effects of autonomous systems in the open world</article-title>. <source>J. Artif. Intell. Res. (JAIR)</source> <volume>74</volume>, <fpage>143</fpage>&#x2013;<lpage>177</lpage>. <pub-id pub-id-type="doi">10.1613/jair.1.13581</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Saran</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Short</surname>
<given-names>E. S.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Efficiently guiding imitation learning agents with human gaze</article-title>,&#x201d; in <source>International conference on Autonomous Agents and Multiagent Systems (AAMAS)</source>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Seo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Unhelkar</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Idil: imitation learning of intent-driven expert behavior</article-title>,&#x201d; in <source>Proceedings of the 23rd international conference on Autonomous Agents and Multiagent Systems (AAMAS)</source>.</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Settles</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Active learning literature survey</article-title>. <source>Computer sciences technical report 1648, University of Wisconsin&#x2013;Madison</source>.</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Sontakke</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Arnold</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pertsch</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Biyik</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Sadigh</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>RoboCLIP: one demonstration is enough to learn robot policies</article-title>,&#x201d; in <source>Thirty-seventh conference on neural information processing systems (NeurIPS)</source>.</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Srivastava</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Saisubramanian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Paruchuri</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zilberstein</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Planning and learning for non-Markovian negative side effects using finite state controllers</article-title>,&#x201d; in <source>Proceedings of the AAAI conference on artificial intelligence (AAAI)</source>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Strokina</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Pajarinen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Serbenyuk</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>K&#xe4;m&#xe4;r&#xe4;inen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ghabcheloo</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Visual rewards from observation for sequential tasks: autonomous pile loading</article-title>. <source>Front. Robotics AI</source> <volume>9</volume>, <elocation-id>838059</elocation-id>. <pub-id pub-id-type="doi">10.3389/frobt.2022.838059</pub-id>
<pub-id pub-id-type="pmid">35712549</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tarakli</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Vinanzi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Di Nuovo</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Interactive reinforcement learning from natural language feedback</article-title>,&#x201d; in <source>IEEE/RSJ international conference on intelligent robots and systems (IROS)</source>.</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tien</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>J. Z.</given-names>
</name>
<name>
<surname>Erickson</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Brown</surname>
<given-names>D. S.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Causal confusion and reward misidentification in preference-based reward learning</article-title>,&#x201d; in <source>The eleventh international conference on learning representations (ICLR)</source>.</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Xue</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>An</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Reinforcement learning from diverse human preferences</article-title>,&#x201d; in <source>Proceedings of the thirty-third international joint conference on artificial intelligence, IJCAI</source> (<publisher-name>International Joint Conferences on Artificial Intelligence Organization</publisher-name>).</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Neary</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Topcu</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2024a</year>). &#x201c;<article-title>Multimodal pretrained models for verifiable sequential decision-making: planning, grounding, and perception</article-title>,&#x201d; in <source>Proceedings of the 23rd international conference on autonomous agents and multiagent systems (AAMAS)</source>.</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tien</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Biyik</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2024b</year>). &#x201c;<article-title>Trajectory improvement and reward learning from comparative language feedback</article-title>,&#x201d; in <source>Conference on robot learning (CoRL)</source>.</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zha</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kambhampati</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Learning from ambiguous demonstrations with self-explanation guided reinforcement learning</article-title>,&#x201d; in <source>Proceedings of the AAAI conference on artificial intelligence</source>.</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Durfee</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Querying to find a safe policy under uncertain safety constraints in Markov decision processes</article-title>,&#x201d; in <source>Proceedings of the AAAI conference on artificial intelligence (AAAI)</source>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>
