<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1598968</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2025.1598968</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Robotics and AI</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>The power of combined modalities in interactive robot learning</article-title>
<alt-title alt-title-type="left-running-head">Beierling et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2025.1598968">10.3389/frobt.2025.1598968</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Beierling</surname>
<given-names>Helen</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3011767/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Beierling</surname>
<given-names>Robin</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/3122863/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Vollmer</surname>
<given-names>Anna-Lisa</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/123218/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff>
<institution>Interactive Robotics in Medicine and Care</institution>, <institution>Medical School OWL</institution>, <institution>Bielefeld University</institution>, <addr-line>Bielefeld</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/78087/overview">Silvia Rossi</ext-link>, University of Naples Federico II, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/666080/overview">Suresh Kumaar Jayaraman</ext-link>, Carnegie Mellon University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1866074/overview">Leo Zeyang Liu</ext-link>, University of California, Los Angeles, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Helen Beierling, <email>helen.beierling@uni-bielefeld.de</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>17</day>
<month>07</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1598968</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>03</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>06</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Beierling, Beierling and Vollmer.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Beierling, Beierling and Vollmer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>With the continuous advancement of artificial intelligence (AI), robots as embodied intelligent systems are increasingly becoming more present in daily life like households or in elderly care. As a result, lay users are required to interact with these systems more frequently and teach them to meet individual needs. Human-in-the-loop reinforcement learning (HIL-RL) offers an effective way to realize this teaching. Studies show that various feedback modalities, such as preference, guidance, or demonstration, can significantly enhance learning success, though their suitability varies with users&#x2019; expertise in robotics. Research also indicates that users apply different scaffolding strategies when teaching a robot, such as motivating it to explore actions that promise success. Thus, providing a collection of different feedback modalities allows users to choose the method that best suits their teaching strategy, and allows the system to individually support the user based on their interaction behavior. However, most state-of-the-art approaches provide users with only one feedback modality at a time. Investigating combined feedback modalities in interactive robot learning remains an open challenge. To address this, we conducted a study that combined common feedback modalities. Our research questions focused on whether these combinations improve learning outcomes, reveal user preferences, show differences in perceived effectiveness, and identify which modalities influence learning the most. The results show that combining the feedback modalities improves learning, with users perceiving the effectiveness of the modalities in various ways, and certain modalities directly impacting learning success. The study demonstrates that combining feedback modalities can support learning even in a simplified setting and suggests the potential for broader applicability, especially in robot learning scenarios with a focus on user interaction. 
Thus, this paper aims to motivate the use of combined feedback modalities in interactive imitation learning.</p>
</abstract>
<kwd-group>
<kwd>human-robot interaction</kwd>
<kwd>human-in-the-loop learning</kwd>
<kwd>reinforcement learning</kwd>
<kwd>interactive robot learning</kwd>
<kwd>multi-modal feedback</kwd>
<kwd>learning from demonstration</kwd>
<kwd>preference-based learning</kwd>
<kwd>scaffolding in robot learning</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Human-Robot Interaction</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The growing sophistication of artificial intelligence (AI) has expanded the market for autonomous systems, such as autonomous cars and vacuum robots. There is a growing demand for intelligent agents that can simplify everyday tasks, particularly in home environments or care. These areas of application are characterized by diverse individual preferences and tasks. To address this, users, especially lay users, need to be empowered to teach robots new actions according to their preferences. Interactive imitation learning, including human-in-the-loop reinforcement learning (HIL-RL), with active user feedback, is a common method to realize this teaching <xref ref-type="bibr" rid="B15">Celemin et al. (2022)</xref>. Human feedback in HIL-RL scenarios is valuable, as reward functions normally designed to provide customized learning feedback for specific situations may underperform in different contexts <xref ref-type="bibr" rid="B25">Kaufmann et al. (2023)</xref>. Furthermore, it can be challenging to rationally define such functions via reward engineering, especially for tasks based on personal preferences (<xref ref-type="bibr" rid="B25">Kaufmann et al., 2023</xref>; <xref ref-type="bibr" rid="B26">Knox et al., 2023</xref>). There is a variety of literature spanning from older approaches (<xref ref-type="bibr" rid="B22">Isbell et al., 2001</xref>; <xref ref-type="bibr" rid="B27">Knox and Stone, 2008</xref>) to more recent ones (<xref ref-type="bibr" rid="B21">Hindemith et al., 2022</xref>; <xref ref-type="bibr" rid="B19">Ding et al., 2023</xref>) regarding the use of human feedback that were able to show positive results with various human feedback modalities. Thus, incorporating human feedback in HIL-RL is current and relevant, particularly in the field of robotics in everyday life scenarios.</p>
<p>However, when robots are taught by users through direct feedback, users naturally want to scaffold the learning robot. Originating in developmental psychology (<xref ref-type="bibr" rid="B49">Wood et al., 1976</xref>; <xref ref-type="bibr" rid="B38">Stone, 1998a</xref>), scaffolding involves adjustments to the learning environment and providing guidance, constantly adapting to the progression of learning (<xref ref-type="bibr" rid="B48">Vollmer et al., 2016</xref>; <xref ref-type="bibr" rid="B37">Saunders et al., 2006</xref>; <xref ref-type="bibr" rid="B24">Jumaat and Tasir, 2014</xref>; <xref ref-type="bibr" rid="B39">Stone, 1998b</xref>; <xref ref-type="bibr" rid="B9">Breazeal, 1998</xref>). This scaffolding behavior has also shown the potential to be valuable in robot teaching (<xref ref-type="bibr" rid="B2">Angeli and Valanides, 2020</xref>; <xref ref-type="bibr" rid="B9">Breazeal, 1998</xref>; <xref ref-type="bibr" rid="B10">Breazeal et al., 2006</xref>). Nevertheless, users tend to give, for example, overly positive ratings to push learning behavior in that direction due to their scaffolding intent, unknowingly distorting feedback and hindering learning (<xref ref-type="bibr" rid="B43">Thomaz and Breazeal, 2007</xref>; <xref ref-type="bibr" rid="B44">Thomaz and Cakmak, 2009</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>). This can be mitigated, for example, by providing guidance as a separate form of feedback modality. 
Thus, the literature advocates modalities that allow users to express this supplementary feedback to enhance feedback quality and guide actions effectively (<xref ref-type="bibr" rid="B42">Thomaz and Breazeal, 2006</xref>; <xref ref-type="bibr" rid="B30">Liu et al., 2023</xref>; <xref ref-type="bibr" rid="B29">Li et al., 2021</xref>; <xref ref-type="bibr" rid="B6">Bajcsy et al., 2018</xref>; <xref ref-type="bibr" rid="B14">Celemin and Kober, 2023</xref>; <xref ref-type="bibr" rid="B35">Ravichandar et al., 2020</xref>; <xref ref-type="bibr" rid="B5">Arzate Cruz and Igarashi, 2020</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>). This allows users, for instance, to mark actions as guidance, which are then functioning as a point of orientation for the ongoing learning process.</p>
<p>Although the literature presents various such beneficial modalities, they have largely only been tested in isolation or as a switching option between two modalities (<xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>; <xref ref-type="bibr" rid="B13">Casper et al., 2023</xref>), such as switching between demonstration and preference as a feedback modality <xref ref-type="bibr" rid="B8">B&#x131;y&#x131;k et al. (2022)</xref>. However, the combination of different feedback modalities, the selection of multiple modalities at once, and their simultaneous use remain open challenges (<xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>; <xref ref-type="bibr" rid="B5">Arzate Cruz and Igarashi, 2020</xref>; <xref ref-type="bibr" rid="B28">Li et al., 2019</xref>). Therefore, we conducted a study to explore the interaction behavior of lay users in a real robot teaching scenario when a variety of feedback modalities is available to them at once. We were especially interested in the questions whether all modalities are utilized and perceived equally or whether there are usage and perception differences. This is particularly relevant because this knowledge would enable the identification of existing patterns from which the user&#x2019;s mental model could be inferred, allowing the system to provide targeted explanations and support the user effectively. In addition, our goal was to determine whether their combined use offers a learning benefit to the system. This led us to the following four hypotheses for our study:<list list-type="simple">
<list-item>
<p>H1: Enhanced Learning through Multiple Modalities The learning success in a human-in-the-loop reinforcement learning framework can be enhanced by incorporating multiple modalities of direct user feedback.</p>
</list-item>
<list-item>
<p>H2: User Preferences and Modality Utilization Users do not utilize all modalities equally, but tend to show a preference for certain ones.</p>
</list-item>
<list-item>
<p>H3: Different Perception of Modalities Users subjectively perceive some to be more beneficial compared to others in contributing to learning success.</p>
</list-item>
<list-item>
<p>H4: Enhanced Learning through Individual Modalities The use of individual modalities is positively correlated with the success of the learning algorithm when they are employed.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>In this section, we aim to provide an overview of approaches in Human-in-the-Loop robot training that are based solely on user evaluations. From this overview, we will derive a baseline modality. This baseline modality can then be extended with supplementary modalities, allowing us to compare the added value of combined modalities against the provision of the use of individual baseline modalities or simply switching between them. Additionally, we will present the rationale behind the selection of the supplementary modalities used.</p>
<p>There is a rich body of literature on the use of human feedback in reinforcement learning, spanning from early approaches (<xref ref-type="bibr" rid="B22">Isbell et al., 2001</xref>; <xref ref-type="bibr" rid="B27">Knox and Stone, 2008</xref>) to more recent advances (<xref ref-type="bibr" rid="B21">Hindemith et al., 2022</xref>; <xref ref-type="bibr" rid="B19">Ding et al., 2023</xref>). These studies demonstrate the effectiveness of human feedback across various modalities and application domains. As a result, incorporating human feedback into reinforcement learning remains an active and relevant research direction, particularly in the field of robotics.</p>
<p>For our baseline modality, we focus on the most widely used feedback modalities. <xref ref-type="bibr" rid="B25">Kaufmann et al. (2023)</xref> provide a comprehensive survey of reinforcement learning approaches that integrate direct user feedback, categorizing them into preference-based and reward-based learning methods. In both cases, users offer numerical or binary evaluations of an agent&#x2019;s actions, constituting a form of evaluative feedback. Additionally, alternative modalities such as emotions (<xref ref-type="bibr" rid="B50">Yu and Tapus, 2019</xref>; <xref ref-type="bibr" rid="B41">Su et al., 2023</xref>), gestures (<xref ref-type="bibr" rid="B41">Su et al., 2023</xref>), or speech (<xref ref-type="bibr" rid="B16">Chen et al., 2022</xref>) can be utilized individually or in combination. However, this study specifically focuses on direct evaluative feedback methods.</p>
<p>
<xref ref-type="bibr" rid="B15">Celemin et al. (2022)</xref> further distinguish imitation learning feedback modalities in robotics by classifying them into evaluative feedback modalities (assessing how well the robot performs) and transition-based feedback modalities (advising how the robot should act). Both types can be given in either a direct or indirect manner. Their findings indicate that relative evaluative feedback modalities are particularly well suited for lay users, as they align with expressing preferences over a robot&#x2019;s actions (see Figure 3.5 in <xref ref-type="bibr" rid="B15">Celemin et al. (2022)</xref>).</p>
<p>This notion was also shown by <xref ref-type="bibr" rid="B21">Hindemith et al. (2022)</xref>, who compared preference-based and scalar feedback modalities within a reinforcement learning framework in a human-robot interaction setup. Their study, which used user feedback as a direct reward in a cup-and-ball task performed by a Pepper robot, revealed that users found it easier to compare two trajectories rather than to provide scalar ratings, which required maintaining consistency with prior evaluations. Based on these insights, we adopt a preference-based feedback modality as the baseline modality for our study.</p>
<p>In addition to this, we identified six supplementary modalities from the literature that extend and complement this baseline:</p>
<sec id="s2-1">
<title>2.1 Guidance</title>
<p>An earlier study by <xref ref-type="bibr" rid="B42">Thomaz and Breazeal (2006)</xref> highlights the importance of providing users with the option to provide information not only for present actions, but also for future actions, providing guidance to the robot. This method not only prevents users from misusing feedback modalities for guidance but also improves learning outcomes (<xref ref-type="bibr" rid="B43">Thomaz and Breazeal, 2007</xref>; <xref ref-type="bibr" rid="B44">Thomaz and Cakmak, 2009</xref>; <xref ref-type="bibr" rid="B7">Bignold et al., 2023</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>). Regarding the classification of <xref ref-type="bibr" rid="B15">Celemin et al. (2022)</xref>, we refer to <italic>guidance</italic> as a relative evaluative feedback modality that allows users to mark favored actions as guiding for future learning.</p>
</sec>
<sec id="s2-2">
<title>2.2 Correction</title>
<p>Correction is the widely used opposite of guidance (<xref ref-type="bibr" rid="B30">Liu et al., 2023</xref>; <xref ref-type="bibr" rid="B17">Chernova and Veloso, 2009</xref>; <xref ref-type="bibr" rid="B32">Meri&#xe7;li et al., 2011</xref>; <xref ref-type="bibr" rid="B13">Casper et al., 2023</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>). The literature suggests that using corrections for human-in-the-loop robot learning also benefits learning (<xref ref-type="bibr" rid="B30">Liu et al., 2023</xref>; <xref ref-type="bibr" rid="B29">Li et al., 2021</xref>; <xref ref-type="bibr" rid="B6">Bajcsy et al., 2018</xref>; <xref ref-type="bibr" rid="B14">Celemin and Kober, 2023</xref>). Here we relate to corrections classified as relative corrections in state-action-space, not absolute corrections, referring to <xref ref-type="bibr" rid="B15">Celemin et al. (2022)</xref>, meaning that users are able to mark actions as to be avoided instead of giving a direct example of the action.</p>
</sec>
<sec id="s2-3">
<title>2.3 Demonstration</title>
<p>An alternative to relative corrections is the option of demonstrating an action to the robot to assist or lead the learning process, which would be a direct correction (<xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>). This method is widely employed in robotics as in learning from demonstration approaches. One possible popular approach is kinesthetic teaching, in which the user directly manipulates the robot to demonstrate the action (<xref ref-type="bibr" rid="B35">Ravichandar et al., 2020</xref>; <xref ref-type="bibr" rid="B31">Meri&#xe7;li et al., 2010</xref>; <xref ref-type="bibr" rid="B13">Casper et al., 2023</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>; <xref ref-type="bibr" rid="B14">Celemin and Kober, 2023</xref>).</p>
</sec>
<sec id="s2-4">
<title>2.4 Exploration</title>
<p>Exploration in learning algorithms refers to the process in which the algorithm tries different actions to discover which yield the best results (i.e., the highest reward). While the exploration versus exploitation dilemma is a long-standing challenge in machine learning, it becomes particularly pronounced in scenarios where the algorithm receives sparse feedback, such as human feedback. This dilemma involves deciding whether to try new approaches to discover better options or to stick with known strategies that yield the best results. This dilemma can be addressed by allowing users to directly control the exploration process by lowering or increasing the exploration rate (<xref ref-type="bibr" rid="B3">Arakawa et al., 2018</xref>; <xref ref-type="bibr" rid="B5">Arzate Cruz and Igarashi, 2020</xref>; <xref ref-type="bibr" rid="B34">Raffin et al., 2022</xref>; <xref ref-type="bibr" rid="B46">Torne et al., 2023</xref>; <xref ref-type="bibr" rid="B44">Thomaz and Cakmak, 2009</xref>).</p>
</sec>
<sec id="s2-5">
<title>2.5 Speed</title>
<p>A modality, closely aligned with the action advice, involves providing guidance on specific attributes of an action (<xref ref-type="bibr" rid="B5">Arzate Cruz and Igarashi, 2020</xref>; <xref ref-type="bibr" rid="B4">Argall et al., 2011</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>). Unlike providing absolute feedback in the state-action space based on an entire demonstration, this approach focuses on pinpointing individual attributes of an action perceived as suboptimal by the user. We selected execution speed as a relevant natural attribute for user advice.</p>
</sec>
<sec id="s2-6">
<title>2.6 Fallback</title>
<p>Fallback represents the option to revert to a previous state of action and resume learning from there. This modality is grounded in the concept of <italic>Correction</italic> or negative <italic>Guidance</italic>, as outlined by <xref ref-type="bibr" rid="B42">Thomaz and Breazeal (2006)</xref>. In instances where the agent explores the action space and converges to undesired behavior, a <italic>Fallback</italic> modality allows the user to retrace the explored path and to restart from the best action achieved thus far. This differs from <italic>Correction</italic>, where learning is only influenced according to the marked action, while the general direction of learning is maintained based on past positive and negative actions (<xref ref-type="bibr" rid="B20">Ecoffet et al., 2021</xref>).</p>
<p>Although there is considerable research on individual supplementary modalities, their combination remains an open question. A survey focusing on interactive robotics and human-robot interactions, examining trend-setting human feedback modalities, highlights the open challenge of combining feedback modalities and their application to reinforcement learning algorithms (<xref ref-type="bibr" rid="B5">Arzate Cruz and Igarashi, 2020</xref>). Similarly, the potential of combining feedback modalities in interactive reinforcement learning has been emphasized (<xref ref-type="bibr" rid="B28">Li et al., 2019</xref>; <xref ref-type="bibr" rid="B15">Celemin et al., 2022</xref>).</p>
<p>There are already studies exploring the use of multiple modalities in robot learning. However, these approaches typically rely on a human-determined switch between different modalities rather than leveraging supplementary modalities simultaneously.</p>
<p> <xref ref-type="bibr" rid="B8">B&#x131;y&#x131;k et al. (2022)</xref> combine demonstrations with preference-based feedback modality, showing that integrating both sources leads to faster and more accurate reward learning. Their method, <italic>DemPref</italic>, selects between demonstrations and preference queries, balancing efficiency and informativeness.</p>
<p>Another perspective comes from <xref ref-type="bibr" rid="B23">Jeon et al. (2020)</xref>, who propose a unifying formalism to interpret various types of human feedback. They introduce the concept of Reward-Rational Implicit Choice (RRIC), where human behavior, whether demonstrations, corrections, comparisons, or even actions like turning off a robot, is viewed as a rational selection from an implicit set of choices. This formalism provides a general framework that encompasses a wide range of feedback modalities, helping to structure and unify reward inference across different modalities. While <xref ref-type="bibr" rid="B23">Jeon et al. (2020)</xref> model different feedback modalities as separate sources of reward information that are used sequentially or context-dependently, our work actively enables the simultaneous use of multiple modalities, allowing users to intuitively combine different feedback modalities in real time to influence the robot&#x2019;s learning process. Furthermore, we validate our approach through an empirical user study, demonstrating the impact of multimodal feedback modalities on learning success and user perception and utilization.</p>
<p>By doing so, we address the current research gap on simultaneously offered combined modalities, positioning our work within state-of-the-art interactive robot learning while building upon and extending the approaches of the field.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methods</title>
<sec id="s3-1">
<title>3.1 Implementation</title>
<p>The modalities were implemented on a Kinova Gen2/Jaco (Jaco2) robotic arm <xref ref-type="bibr" rid="B18">Inc. K (2023)</xref> using the Robot Operating System (ROS) Noetic <xref ref-type="bibr" rid="B36">Robotics (2020)</xref> on Ubuntu 20.04.</p>
<sec id="s3-1-1">
<title>3.1.1 Probabilistic movement primitives (ProMP)</title>
<p>To represent movements, we used an implementation of probabilistic movement primitives (ProMP) (<xref ref-type="bibr" rid="B33">Paraschos et al., 2013</xref>; <xref ref-type="bibr" rid="B1">Alexander Fabisch, 2020</xref>). ProMP depict movements based on parameterized distributions, making them highly adaptable as they cover an entire distribution of trajectories rather than singular trajectories. Moreover, they are notable for their compact design, which requires adjustments solely to their parameters to accommodate changes in all trajectories. The following parameter values are used for the ProMP implementation:<list list-type="simple">
<list-item>
<p>&#x2022; The number of degrees of freedom <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dims</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and the number of time steps per trajectory <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>steps</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> define the motion structure.</p>
</list-item>
<list-item>
<p>&#x2022; The weight dimension of the ProMP <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>weights&#x2009;per&#x2009;dim</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> determines the number of parameters to optimize.</p>
</list-item>
<list-item>
<p>&#x2022; The exploration rate <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>exploration&#x2009;rate</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> influences the variability of the generated movements.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s3-1-2">
<title>3.1.2 PIBB: policy improvement with black box</title>
<p>We implemented the Policy Improvement Black-Box (PI<sup>BB</sup>) algorithm for learning purposes <xref ref-type="bibr" rid="B40">Stulp and Sigaud (2012)</xref>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>PI<sup>BB</sup>: Policy Improvement with Black Box combined with ProMPs.<list list-type="simple">
<list-item>
<p>
<bold>Require:</bold> <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: Parameter distribution</p>
</list-item>
<list-item>
<p>
<bold>Require:</bold> <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: Number of samples</p>
</list-item>
<list-item>
<p>
<bold>Require:</bold> <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: Eliteness factor</p>
</list-item>
<list-item>
<p>
<bold>Require:</bold> <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: Covariance decay factor</p>
</list-item>
<list-item>
<p>
<bold>Require:</bold> <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: Reward and guidance decay factors</p>
</list-item>
<list-item>
<p>
<bold>Ensure:</bold> Updated parameter distribution <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>1:&#x2003;<bold>Sampling:</bold>
</p>
</list-item>
<list-item>
<p>2:&#x2003;<bold>for</bold> <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>3:&#x2003;&#x2003;<inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>4:&#x2003;&#x2003;Compute reward <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>5:&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>6:&#x2003;<bold>Applying decay:</bold>
</p>
</list-item>
<list-item>
<p>7:&#x2003;<bold>for</bold> <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>8:&#x2003;&#x2003;<bold>if</bold> <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>guidance</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> <bold>then</bold>
</p>
</list-item>
<list-item>
<p>9:&#x2003;&#x2003;&#x2003;<inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>10:&#x2003;&#x2003;<bold>else</bold>
</p>
</list-item>
<list-item>
<p>11:&#x2003;&#x2003;&#x2003;<inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>12:&#x2003;&#x2003;<bold>end if</bold>
</p>
</list-item>
<list-item>
<p>13:&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>14:&#x2003;<bold>Normalizing rewards:</bold>
</p>
</list-item>
<list-item>
<p>15:&#x2003;<inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>16:&#x2003;<bold>for</bold> <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <bold>do</bold>
</p>
</list-item>
<list-item>
<p>17:&#x2003;&#x2003;&#x2003;<inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2190;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>18:&#x2003;&#x2003;&#x2003;<bold>if</bold> <inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>guidance</mml:mtext>
<mml:mtext>&#x2009;or&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>correction</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> <bold>then</bold>
</p>
</list-item>
<list-item>
<p>19:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;<inline-formula id="inf25">
<mml:math id="m25">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2190;</mml:mo>
<mml:mn>1.3</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>20:&#x2003;&#x2003;&#x2003;<bold>end if</bold>
</p>
</list-item>
<list-item>
<p>21:&#x2003;<bold>end for</bold>
</p>
</list-item>
<list-item>
<p>22:&#x2003;<bold>Computing weights:</bold>
</p>
</list-item>
<list-item>
<p>23:&#x2003;<inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>24:&#x2003;<bold>Updating the distribution:</bold>
</p>
</list-item>
<list-item>
<p>25:&#x2003;<inline-formula id="inf27">
<mml:math id="m27">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>26:&#x2003;<inline-formula id="inf28">
<mml:math id="m28">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>The PI<sup>BB</sup> approach optimizes a distribution of ProMP weights based on the user rewards. The functionality of the algorithm was implemented as illustrated in pseudocode <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>.</p>
<p>When a new sample is added, the policy distribution <inline-formula id="inf29">
<mml:math id="m29">
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is updated in several steps. First, a set of new samples is generated by drawing parameter vectors <inline-formula id="inf30">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the current distribution:<disp-formula id="equ1">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>For each sample ProMP weight, a mean trajectory is calculated, and the corresponding reward <inline-formula id="inf31">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is evaluated. Next, reward decay is applied to previous samples. If there are recent guidance samples, rewards of older samples are scaled by the guidance decay factor <inline-formula id="inf32">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>; otherwise, the standard reward decay factor <inline-formula id="inf33">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is used:<disp-formula id="equ2">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
<mml:mspace width="1em"/>
<mml:mtext>if&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>guidance_samples</mml:mtext>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ3">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
<mml:mtext>otherwise</mml:mtext>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The rewards are then normalized by scaling them between the minimum and maximum values:<disp-formula id="equ4">
<mml:math id="m37">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</disp-formula>where guidance and correction samples receive an additional weighting factor:<disp-formula id="equ5">
<mml:math id="m38">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2190;</mml:mo>
<mml:mn>1.3</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mspace width="1em"/>
<mml:mspace width="1em"/>
<mml:mtext>if&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>guidance_samples</mml:mtext>
<mml:mtext>&#x2009;or&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>correction_samples</mml:mtext>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The PI<sup>BB</sup> weight update is computed by transforming the normalized rewards:<disp-formula id="equ6">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The new mean of the distribution is then updated using these weights:<disp-formula id="equ7">
<mml:math id="m40">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Finally, the covariance matrix decays to reduce exploration over time:<disp-formula id="equ8">
<mml:math id="m41">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>This process integrates the newly generated samples into the policy distribution while adjusting the influence of past trajectories.</p>
<p>The reward decay factor <inline-formula id="inf34">
<mml:math id="m42">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.9</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is applied to reduce the influence of older samples over time. This ensures that more recent trajectories contribute more significantly to the policy update. A distinct guidance decay factor <inline-formula id="inf35">
<mml:math id="m43">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is applied when new guidance samples are introduced. This factor gradually reduces the influence of older samples more significantly. This ensures that the upcoming movements closely align with the guidance movement. To effectively normalize the rewards, an eliteness parameter <inline-formula id="inf36">
<mml:math id="m44">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is used, which scales the weight assignment to favor the highest performing samples. This parameter controls the selection pressure, ensuring that top-performing ProMP parameters have a greater impact on shaping the updated distribution. The <bold>covariance</bold> decay factor <inline-formula id="inf37">
<mml:math id="m45">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.973</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> progressively reduces the variance of the Gaussian policy distribution over iterations. This mechanism encourages convergence by refining the exploration space while still allowing for incremental adjustments. In addition, samples labeled as corrections or guidance feedback receive a weight adjustment factor of 1.3, increasing their impact relative to standard samples. This weighting ensures that corrections and guidance cues contribute more prominently to the learning process, reinforcing intended behavior modifications. As a result, corrections have a stronger negative impact. This is because undesirable behavior should continue to be avoided throughout the learning process. One can think of it as speaking the first word, an achievement at the time but one that gradually loses significance. In contrast, touching a hot stove remains something to be avoided, no matter how far learning progresses.</p>
<p>In our study, we also developed a graphical user interface (GUI) to showcase the available modalities and to capture user interactions for our analysis. We configured two versions of our GUI: one that contains only the basic modality (group 1), and another that offers all the supplementary modalities to the participants (group 2), see <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
</sec>
<sec id="s3-1-3">
<title>3.1.3 User interaction and controls</title>
<p>The GUI for each of the experimental conditions is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. Users could provide feedback on a preference pair by clicking on the left heart button to indicate that the left movement was preferred or on the right heart button for the right movement. Additionally, users had the option to replay the movements by clicking the &#x201c;Replay&#x201d; button. For group 2, clicking the &#x201c;Save&#x201d; button marked a movement as a fallback, highlighting the button in white. Movements could also be labeled as &#x201c;Guidance&#x201d; by clicking the signpost button or as &#x201c;Correction&#x201d; by clicking the eraser button. When a movement was actively marked, the corresponding button was highlighted in white. The exploration could be adjusted using a slider that was initially set at a medium level. The slider allowed users to reduce the exploration to the moon icon or increase it to the binocular icon. Similarly, movement speed was controlled through the same mechanism. By clicking the central &#x201c;Demonstrate&#x201d; button at the bottom, users entered the demonstration mode, where they could start, stop, and restart the demonstration. If learning progressed in an undesired direction, users could load the fallback by clicking the &#x201c;Load Save Point&#x201d; button. This action immediately restored the fallback and generated two new movements based on it. Upon clicking &#x201c;Submit,&#x201d; all markings (e.g., Guidance, Correction) and slider settings were recorded for the generation of the next two movements. After submitting, users saw a grayed-out screen displaying the message &#x201c;Please look at the robot,&#x201d; during which they could not interact with the interface. The two newly generated movements were then presented. The GUI implemented a basic form of modality conflict handling. 
Specifically, when a movement was labeled as <italic>Guidance</italic>, any previous <italic>Correction</italic> label was automatically removed, making the two mutually exclusive. Similarly, labeling a movement as <italic>Fallback</italic> also cleared any existing <italic>Correction</italic> label. In cases where a user attempted to load a fallback trajectory without having previously saved one, no action was taken. However, beyond these basic mechanisms, the system did not enforce additional contextual constraints, which means that semantically questionable actions, such as labeling successful movements as <italic>Corrections</italic>, were not actively prevented.</p>
</sec>
</sec>
<sec id="s3-2">
<title>3.2 Modalities</title>
<p>In this section, we describe the concrete implementation of the different modalities.</p>
<sec id="s3-2-1">
<title>3.2.1 Preference</title>
<p>For the preference modality, two different behaviors were presented, from which participants could then mark their preferred ones. We used the reward function:<disp-formula id="equ9">
<mml:math id="m46">
<mml:mrow>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>100</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi mathvariant="monospace">B</mml:mi>
<mml:mi mathvariant="monospace">o</mml:mi>
<mml:mi mathvariant="monospace">t</mml:mi>
<mml:mi mathvariant="monospace">h</mml:mi>
<mml:mspace width="0.6em"/>
<mml:mi mathvariant="monospace">p</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">f</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">d</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>100</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi mathvariant="monospace">F</mml:mi>
<mml:mi mathvariant="monospace">i</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">s</mml:mi>
<mml:mi mathvariant="monospace">t</mml:mi>
<mml:mspace width="0.6em"/>
<mml:mi mathvariant="monospace">a</mml:mi>
<mml:mi mathvariant="monospace">c</mml:mi>
<mml:mi mathvariant="monospace">t</mml:mi>
<mml:mi mathvariant="monospace">i</mml:mi>
<mml:mi mathvariant="monospace">o</mml:mi>
<mml:mi mathvariant="monospace">n</mml:mi>
<mml:mspace width="0.6em"/>
<mml:mi mathvariant="monospace">p</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">f</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">d</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi mathvariant="monospace">S</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">c</mml:mi>
<mml:mi mathvariant="monospace">o</mml:mi>
<mml:mi mathvariant="monospace">n</mml:mi>
<mml:mi mathvariant="monospace">d</mml:mi>
<mml:mspace width="0.6em"/>
<mml:mi mathvariant="monospace">a</mml:mi>
<mml:mi mathvariant="monospace">c</mml:mi>
<mml:mi mathvariant="monospace">t</mml:mi>
<mml:mi mathvariant="monospace">i</mml:mi>
<mml:mi mathvariant="monospace">o</mml:mi>
<mml:mi mathvariant="monospace">n</mml:mi>
<mml:mspace width="0.6em"/>
<mml:mi mathvariant="monospace">p</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">f</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">d</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi mathvariant="monospace">N</mml:mi>
<mml:mi mathvariant="monospace">o</mml:mi>
<mml:mi mathvariant="monospace">n</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mspace width="0.6em"/>
<mml:mi mathvariant="monospace">p</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">f</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">r</mml:mi>
<mml:mi mathvariant="monospace">e</mml:mi>
<mml:mi mathvariant="monospace">d</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>This approach not only enables the articulation of a preference for one option but also accommodates the expression of preference for neither or both options. Our choice for this implementation was driven, among other factors, by the need for the algorithm to perform rapidly and sample-efficiently. To amplify the differentiation in actions, we increased the disparity between the ProMPs drawn from the distribution, addressing the issue that users struggle to express their preferences when actions are too similar. Additionally, we created a significant contrast between positive and negative rewards to make negative ProMPs even less likely. We also opted for samplewise rather than batchwise updates of the sampling distribution, as users in direct interaction expect immediate responses (<xref ref-type="bibr" rid="B47">Vollmer and Hemion, 2018</xref>).</p>
</sec>
<sec id="s3-2-2">
<title>3.2.2 Supplementary modalities</title>
<sec id="s3-2-2-1">
<title>3.2.2.1 Guidance</title>
<p>Was achieved by significantly increasing the influence of the action chosen as guidance in the selection of future actions, while drastically diminishing the impact of all other actions. To this end, the reward decay applied to older samples is increased, so that effectively only the most recent sample marked as guidance retains its influence. Furthermore, the influence of guidance samples on the mean is increased. Lastly, guidance samples receive a higher reward of 150 instead of 100.</p>
</sec>
<sec id="s3-2-2-2">
<title>3.2.2.2 Correction</title>
<p>Unlike <italic>Guidance</italic> samples, <italic>Correction</italic> samples do not introduce additional decay, as negative actions should retain their influence in contrast to previously positive actions. Moreover, as with <italic>Guidance</italic> samples, their impact on the mean is amplified. However, since they receive negative rewards, this amplification occurs in the opposite direction, effectively enhancing their influence negatively. Additionally, <italic>Correction</italic> samples incur a greater penalty, receiving a negative reward of <inline-formula id="inf38">
<mml:math id="m47">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>150</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> instead of <inline-formula id="inf39">
<mml:math id="m48">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3-2-2-3">
<title>3.2.2.3 Demonstrations</title>
<p>Were performed through kinesthetic teaching and utilized as the new mean of the distribution of the PI<sup>BB</sup> algorithm. Given that users anticipate behavior that closely mirrors the demonstration, past feedback is disregarded; otherwise, movements, particularly in the later stages of the learning trajectory, would deviate significantly from the demonstration due to the influence of previous actions.</p>
</sec>
<sec id="s3-2-2-4">
<title>3.2.2.4 Exploration</title>
<p>Was facilitated by adjusting the base exploration factor upward or downward on five levels, ensuring, however, that exploration consistently decreases over time.<disp-formula id="equ10">
<mml:math id="m49">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>0.96</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
</sec>
<sec id="s3-2-2-5">
<title>3.2.2.5 Speed</title>
<p>Was realized by accelerating or decelerating all movements after the trajectories have been generated, eliminating the need to incorporate an additional dimension into the learning process.</p>
</sec>
<sec id="s3-2-2-6">
<title>3.2.2.6 Fallback</title>
<p>Was implemented by marking a trajectory, more specifically the ProMP weights of this trajectory, as a mean that users can return to as needed. When a fallback is marked, it is also defined as a guidance. This triggers all positive effects of a guidance, since a fallback sample is marked as the best so far by the user. The marking as guidance leads to a more positive evaluation, resulting in a score of 150, and influences the further calculation of the distribution as described in the guidance section. If the user returns to the marked action, it has the same effect as providing a demonstration. This means that when loading a fallback, the previous progress is discarded, except for the reduction in exploration, and the learning process continues with the loaded fallback as the new mean of the distribution. The decision for this implementation was based on the users&#x2019; expectation of similar behavior to the saved trajectory, rather than the distorted behavior that would arise if past actions were incorporated into the calculation of the next movement. This distortion would also happen, even if the fallback was set as the new mean and the old actions were kept. Thus, all old actions must be discarded. Additionally, it is based on the fact that users want the progress made after the fallback action to be discarded when loading.</p>
</sec>
</sec>
</sec>
<sec id="s3-3">
<title>3.3 Study design</title>
<sec id="s3-3-1">
<title>3.3.1 Participants</title>
<p>Participants were recruited on campus through flyers, posters and mailing lists and received a compensation of 10 &#x20ac;. We recruited participants who were considered <italic>lay users</italic>, defined as individuals without prior experience with the specific algorithm or the robot used in the study. This approach reflects our aim of including a representative sample of potential end users, such as those who might encounter robots in everyday or care-related environments. All subjects gave their informed consent for inclusion before participating in the study. The participant cohort for the study was randomly assigned to two groups.</p>
</sec>
<sec id="s3-3-2">
<title>3.3.2 Task</title>
<p>The task designed for the study was a minigolf challenge without obstacles, chosen for its balanced blend of enjoyment, clarity, and practicality. The setup of the task is depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>. The simplicity of the objective made the goal and proper technique apparent for evaluation. Additionally, the task focuses on the actual execution of the trajectory rather than the target, as seen in tasks like pick-and-place. Therefore, there is no clear or optimal way to do it. At the same time, it offers a wide range of possible execution methods, which can vary significantly between participants. This was already demonstrated in previous studies. An example from the previous studies includes participants who, despite the clear instructions regarding the task&#x2019;s goal, preferred to execute the task smoothly with a graceful swing rather than focusing on hitting the target. In contrast, others were more focused on fulfilling the task itself and preferred lowering the racket to the playing surface followed by pushing the ball into the hole, although this resulted in much more abrupt trajectories. Furthermore, the straightforward nature of the task allowed for easy reset between attempts. In this process, robot action pairs were evaluated 40 times, even if the task had been completed beforehand. The participants were instructed to succeed (i.e., the robot hits the hole) as often as possible. A trial was considered successful only if the ball was hit directly from a predefined starting position into the hole in a single stroke. The attempt was not counted as successful if the ball touched the boundary. After each action, if the ball was hit, it was reset to the starting position. Both runs were initialized identically using a poor demonstration of the task, which accelerates the algorithm, by indicating the basic nature of the movement (a forward swing) while still preserving the need for learning and human intervention. 
Moreover, the arm&#x2019;s reset position was the same for both groups and for every trial. Each task trial proceeded as follows: Participants were shown the GUI with a black overlay and asked to observe the robot. Then, the first movement was presented. After the movement was executed, the robot was reset and the setup was restored. The participant was then informed that the second movement would follow, which was subsequently initiated. After the second movement, the GUI was activated, allowing the participant to provide a rating while the setup was restored again. Once the rating was submitted, this process was repeated.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Study setup. Red depicts the Kinova Jaco 2 robot platform, green the task relevant objects, purple the camera for study recording, blue the workspace environment and yellow the user interface.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g001.tif">
<alt-text content-type="machine-generated">An illustration of the study setup on the left and a real-life equivalent on the right. Both feature a desk with a chair, a computer monitor, a robotic arm manipulator with a minigolf bat, and a tripod-mounted camera. The illustration is color grouped to highlight the study relevant parts of the setup and their role within the study.</alt-text>
</graphic>
</fig>
<p>It was possible to teach the task in very few movements <inline-formula id="inf40">
<mml:math id="m50">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. This was achieved through a strategy that involved the use of demonstrations and providing guidance when approaching the ball. Exploration was only reduced after a success. Additionally, reinforcing successful hits with positive rewards, using fallback storage, and strictly penalizing deviations through corrective feedback contributed to efficient learning.</p>
<p>Despite the possibility of completing the task more quickly, we decided to set a fixed number of task repetitions, opting for a notably higher count of 40 trials. The rationale behind this fixed number is to ensure comparability between participants and to gather a similar amount of input from each. The choice of 40 trials was based on previous experiments, which showed that participants were able to achieve at least one success within this timeframe, even with a more restrictive approach. This success left participants with a positive feeling about the study and ensured that all participants completed all 40 trials without opting to quit.</p>
</sec>
<sec id="s3-3-3">
<title>3.3.3 Procedure</title>
<p>The study began with a welcome and briefing on the task of teaching a robotic arm to play minigolf. The protocol detailed the use of a user interface for rating the robot&#x2019;s task executions. The GUI for group 1 was designed for a simple interaction; see <xref ref-type="fig" rid="F2">Figure 2</xref> left. It displays two options, movement 1 and movement 2, each with a replay button and a heart symbol for expressing preference. At the bottom, the participants had a submit button to confirm the choices. The second GUI for group 2 was configured to evaluate robotic movements with all modalities, see <xref ref-type="fig" rid="F2">Figure 2</xref> right. In addition to the features available to group 1, it also has the option to save movements as fallback. Furthermore, central sliders allow users to adjust exploration rates and movement speed. The central button on the bottom facilitates the demonstration of movements, and on the right there is the option to load a fallback movement.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>This figure showcases the interface for group 1 (the baseline version, with only the preference selection) and for group 2 (the multimodal version, equipped with all combined modalities). Both interfaces are shown translated from German.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g002.tif">
<alt-text content-type="machine-generated">Split-screen graphic displaying a rating interface. Left side shows two heart icons labeled &#x22;Movement 1&#x22; and &#x22;Movement 2&#x22; with a &#x22;Submit&#x22; button. Right side includes sliders for &#x22;Exploration-Rate&#x22; and &#x22;Movement-Speed,&#x22; additional icons like &#x22;Replay&#x22; and &#x22;Save?&#x22;, and buttons to load, demonstrate, or submit.</alt-text>
</graphic>
</fig>
<p>In the description of the individual interfaces, the conditions of each element were explained to the users, along with their effects, such as how Guidance marks a trajectory as a reference. However, the internal implementation of these effects within the algorithm was not described. The different modalities were clearly differentiated in their function and the corresponding UI elements were highlighted. For example, it was explained that saving a fallback allows users to return to the saved state and that Guidance, as described, marks the movement as a reference. However, it was not mentioned that Fallback internally utilizes the functionality of Guidance. Since the study investigated how lay users interact with these modalities and since users were not aware of how such mechanisms work internally, an introduction to the algorithmic implementation was omitted. After the introduction to the corresponding GUI, the experimenter outlined the recording of choices and timing for analysis purposes, concluding with a post-interaction questionnaire described below and compensation details. After receiving the above information, the participants were asked for their consent to participate in the study, as well as for the recording and use of their data. They were explicitly informed of their right to withdraw from the study at any time or request the deletion of their data.</p>
</sec>
<sec id="s3-3-4">
<title>3.3.4 Data acquisition and analysis</title>
<p>According to the hypotheses and aims of the study, the data collection focused on two main objectives: evaluating learning progress and assessing user satisfaction and preference.</p>
<p>The learning process was analyzed through video recordings of the playing field. Our primary metric for assessing learning progress was the completion of the task, specifically measuring how often the task was completed successfully and the timing of the first success. Success was defined as the ball being hit by the robot with the club and going directly into the hole without touching the edges. In addition to comparing the total number of successes, we also looked at individual movements (1&#x2013;80), noting the percentage of participants in each group that scored a successful hit on that try. In this study, we focused on task success and first hit as primary evaluation metrics, since the objective was to understand how different feedback modalities influence the learning process. Importantly, task success and first hit offer a consistent basis for comparison across highly diverse teaching strategies and the resulting trajectories. Given the variability in user behavior and the flexibility of interaction design, these outcome-oriented metrics allow us to meaningfully relate modality usage to learning effectiveness, regardless of how the behavior was taught. These hit success metrics were measured between groups, with the preference rating serving as the baseline modality for both groups. Group 1 only had access to the base modality, while group 2 also had access to the supplementary modalities. This setup allows for assessing the impact of extra modalities on learning in a between-subjects design.</p>
<p>In terms of user satisfaction, overall satisfaction with the system was measured for both groups on a 5-point Likert scale. The metric was also compared between groups using the same question, generally referring to the system and not to specific modalities, making this a between-subjects design as well. We calculated significances across groups to analyze the data.</p>
<p>In addition to the between-subjects design, we also had research questions that focused solely on group 2, which had access to the supplementary modalities, such as their preferences and perception of these modalities. Therefore, the following research questions were addressed using a within-subjects design.</p>
<p>Group 2 participants also responded to the System Usability Scale (SUS) questionnaire <xref ref-type="bibr" rid="B11">Brooke (1995)</xref> for each supplementary modality. From the calculated SUS scores, the average was determined for each supplementary modality and compared pairwise. The preferences for the modalities were measured not only through SUS evaluations but also directly through the ranking of the supplementary modalities. We also collected the reasons behind these rankings. For each option in the ranking, we calculated the percentage frequency of its placement in each position. The qualitative statements of the participants were analyzed by categorizing them into thematic groups and then evaluating these categories according to their frequency. Lastly, we analyze the relationship between learning success and modality usage. To assess differences between the two experimental groups, we applied Kruskal&#x2013;Wallis tests for all between-subject comparisons, given the nonparametric nature of the data. For within-subject analyses, such as the evaluation of user preferences across multiple modalities, we used Friedman tests. When significant effects were detected, we conducted Conover&#x2019;s all-pairs post-hoc tests with Holm correction for the between-subject and Wilcoxon signed-rank tests for the within-subject analysis. All questionnaire items for group 1 and group 2 can be found in the <xref ref-type="sec" rid="s14">Supplementary Appendix</xref>.</p>
</sec>
</sec>
<sec id="s3-4">
<title>3.4 Ethics declarations</title>
<p>The research design for this study was reviewed and approved by the local ethics committee of Paderborn University in the scope of TRR 318 Constructing Explainability. Participants gave their informed consent before the experiment.</p>
</sec>
<sec id="s3-5">
<title>3.5 LLM usage</title>
<p>ChatGPT or similar tools were only used to improve the text syntactically and grammatically. All suggestions made by the AI tools were thoroughly checked by all authors.</p>
</sec>
</sec>
<sec sec-type="results" id="s4">
<title>4 Results</title>
<p>We now present the results of our study, structured around our hypotheses. Our study involved 33 German-speaking participants divided into two groups, with group 1 &#x201c;baseline&#x201d; using a basic interface to express preferences between two movements and group 2 &#x201c;multimodal&#x201d; having access to the described additional supplementary modalities. Group 1 consisted of 15 individuals: 6 male, 8 female, and 1 identifying as diverse. Group 2 comprised 18 participants: 9 male and 9 female. Across groups, there were 21 individuals (12 in group 1, 9 in group 2) with a higher education entrance qualification and 12 (3 in group 1, 9 in group 2) with a higher education degree, totaling 33 participants. All 33 participants were German-speaking, aligned with the German-only interface of the system used. A detailed description of the implementation, modalities, tasks, procedure, and data analysis is provided in <xref ref-type="sec" rid="s3">Section 3</xref>. We begin with the evaluation of learning success H1.</p>
<p>Comparison of the total number of successful hits between group 1 &#x201c;baseline&#x201d; and group 2 &#x201c;multimodal&#x201d; showed a significant difference in the performance of the two groups, with group 2 &#x201c;multimodal&#x201d; successfully hitting more frequently. A Wilcoxon rank-sum test found a significant difference between group 1 (Median <inline-formula id="inf41">
<mml:math id="m51">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5.0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, Mean <inline-formula id="inf42">
<mml:math id="m52">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8.33</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, SD<inline-formula id="inf43">
<mml:math id="m53">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>9.80</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) and group 2 (Median<inline-formula id="inf44">
<mml:math id="m54">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>26.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, Mean<inline-formula id="inf45">
<mml:math id="m55">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>25.56</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, SD <inline-formula id="inf46">
<mml:math id="m56">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>15.62</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) (W<inline-formula id="inf47">
<mml:math id="m57">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>38</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, p<inline-formula id="inf48">
<mml:math id="m58">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0013</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Cliff&#x2019;s Delta <inline-formula id="inf49">
<mml:math id="m59">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">&#x394;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.683</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>95</mml:mn>
<mml:mi>%</mml:mi>
<mml:mtext>&#x2009;CI:&#x2009;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.89</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.22</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> indicates a large effect, with group 2 consistently showing higher values. The results are visualized in <xref ref-type="fig" rid="F3">Figure 3</xref>, which presents the percentage of participants who were successful in each movement, divided into the two groups.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>This graph illustrates the percentage of success per movement (80 movements resulting from 40 trials with 2 movements each) across all participants within each group.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g003.tif">
<alt-text content-type="machine-generated">Line graph titled &#x22;Success Frequency&#x22; showing success rates for Groups 1 and 2 across movements numbered one to eighty. Group 1, in blue, has fluctuating low success rates. Group 2, in orange, shows a higher and more variable success rate, peaking around movement fifty and seventy-five. Both groups have success rates ranging from zero to one hundred percent.</alt-text>
</graphic>
</fig>
<p>Participants in group 2 &#x201c;multimodal&#x201d; achieved their first hit on average after 17.56 (Median &#x3d; 16, SD &#x3d; 13.65) attempts, while participants in group 1 &#x201c;baseline&#x201d; required 34.60 (Median &#x3d; 26, SD &#x3d; 27.93) attempts to reach their first successful hit. A Wilcoxon rank-sum test found a trend toward a difference between group 1 and group 2 (<inline-formula id="inf50">
<mml:math id="m60">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>164.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf51">
<mml:math id="m61">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0814</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Cliff&#x2019;s Delta <inline-formula id="inf52">
<mml:math id="m62">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">&#x394;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.371</mml:mn>
<mml:mtext>,&#x2009;</mml:mtext>
<mml:mn>95</mml:mn>
<mml:mi>%</mml:mi>
<mml:mtext>&#x2009;CI:&#x2009;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.06</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.69</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> suggests a moderate effect, with group 1 tending to show higher values.</p>
<p>Next, we will present the participants&#x2019; usage preferences for the supplementary modalities to answer the second hypothesis H2. The general satisfaction level of group 1 &#x201c;baseline&#x201d; was consistently moderate, with <inline-formula id="inf53">
<mml:math id="m63">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.31</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median <inline-formula id="inf54">
<mml:math id="m64">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.14</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). The mean score for overall satisfaction in group 2 &#x201c;multimodal&#x201d; was notably higher, at <inline-formula id="inf55">
<mml:math id="m65">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4.17</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median <inline-formula id="inf56">
<mml:math id="m66">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4.0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf57">
<mml:math id="m67">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.857</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). A Wilcoxon rank-sum test found a significant difference between group 1 and group 2 <inline-formula id="inf58">
<mml:math id="m68">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>80.5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0223</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Cliff&#x2019;s Delta <inline-formula id="inf59">
<mml:math id="m69">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">&#x394;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.441</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mn>95</mml:mn>
<mml:mi>%</mml:mi>
<mml:mtext>&#x2009;CI:&#x2009;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.71</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.06</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> indicates a moderate effect, with group 2 tending to show higher values. Due to the demonstration modality being sparsely (n &#x3d; 2) used, we excluded it from the evaluation of user preference and utilization, as the results would not be representative. To assess the usability of the user interface and ensure that its design does not hinder users, as well as to evaluate learnability, participants were asked to complete the System Usability Scale (SUS) questionnaire. On a scale ranging from 0 to 100, SUS scores interpret usability and learnability. With mean SUS scores of <inline-formula id="inf60">
<mml:math id="m70">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>79.17</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median <inline-formula id="inf61">
<mml:math id="m71">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>79.16667</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8.83</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) for <italic>Guidance</italic>, <inline-formula id="inf62">
<mml:math id="m72">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>79.67</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median<inline-formula id="inf63">
<mml:math id="m73">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>80.00</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>14.81</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) for <italic>Speed</italic>, <inline-formula id="inf64">
<mml:math id="m74">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>74.22</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median<inline-formula id="inf65">
<mml:math id="m75">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>75.00</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>15.80</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) for <italic>Correction</italic>, <inline-formula id="inf66">
<mml:math id="m76">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>72.67</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median<inline-formula id="inf67">
<mml:math id="m77">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>72.66667</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>12.12</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) for <italic>Fallback</italic>, and <inline-formula id="inf68">
<mml:math id="m78">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>68.82</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (Median<inline-formula id="inf69">
<mml:math id="m79">
<mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>68.82353</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16.94</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) for <italic>Exploration</italic>, the system is generally perceived as &#x2018;good&#x2019; in usability. A Friedman test found a significant difference in SUS scores across modalities <inline-formula id="inf70">
<mml:math id="m80">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3c7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>17.78</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0014</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Post-hoc Conover&#x2019;s all-pairs test (Holm correction) revealed significant differences, particularly between Exploration and Guidance (<inline-formula id="inf71">
<mml:math id="m81">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0080</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf72">
<mml:math id="m82">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.513</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, large) and Exploration and Speed (<inline-formula id="inf73">
<mml:math id="m83">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.0023</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf74">
<mml:math id="m84">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.727</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, large). The medians of the groups ranged from 68.82 (Exploration) to 80.00 (Speed), with Speed and Guidance scoring the highest. No significant differences were found in the other pairwise comparisons presented in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>p-Values of pairwise asymptotic Friedman Test with <inline-formula id="inf75">
<mml:math id="m85">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>17.777</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf76">
<mml:math id="m86">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>f</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf77">
<mml:math id="m87">
<mml:mrow>
<mml:mtext>p-value</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.00136</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th align="left">Correction</th>
<th align="left">Exploration</th>
<th align="left">Fallback</th>
<th align="left">Guidance</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Exploration</td>
<td align="left">0.1288</td>
<td align="left">-</td>
<td align="left">-</td>
<td align="left">-</td>
</tr>
<tr>
<td align="left">Fallback</td>
<td align="left">0.6544</td>
<td align="left">0.8035</td>
<td align="left">-</td>
<td align="left">-</td>
</tr>
<tr>
<td align="left">Guidance</td>
<td align="left">0.8035</td>
<td align="left">0.0080</td>
<td align="left">0.0821</td>
<td align="left">-</td>
</tr>
<tr>
<td align="left">Speed</td>
<td align="left">0.6544</td>
<td align="left">0.0023</td>
<td align="left">0.0299</td>
<td align="left">0.8035</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Furthermore, participants were asked to rank the supplementary modalities according to their overall satisfaction, with the most popular ones placed at the top and decreasing in popularity from there. We have illustrated the results of the ranking of supplementary modalities in <xref ref-type="fig" rid="F4">Figure 4</xref>, showing the proportion of rankings for each modality each time it was ranked. The results indicate a preference for the <italic>Guidance</italic> modality, which was placed mainly in the first and second rank. <italic>Speed</italic> is favored as well, with a similar proportion of first-rank placements, although, in contrast to <italic>Guidance</italic>, it also shows a spread across the third to even fifth ranks. The <italic>Correction</italic> modality is ranked with moderate preference, apparent from its distribution mostly between the second and fourth ranks, indicating a balanced reception. Similarly, <italic>Fallback</italic> shows a slight lean towards the first rank but also towards the fifth rank. <italic>Exploration</italic> has a majority of its rankings in the lower fourth and fifth ranks. Overall, <italic>Guidance</italic> and <italic>Speed</italic> were the most liked, while <italic>Exploration</italic> is far behind in the ranking.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>This diagram showcases the ranking distribution of the supplementary modalities. It illustrates the percentage placement of each supplementary modality across the ranking positions by group 2 &#x201c;multimodal&#x201d; participants.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g004.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x22;Ranking of Modalities&#x22; shows five modalities: Guidance, Correction, Exploration, Speed, and Fallback. Each modality is divided into five ranks from first to fifth place with percentages. Guidance has the most first-place rankings at 38%, Speed leads in second place with 39%, while Fallback has the highest fifth-place percentage at 39%. The chart uses distinct colors for each rank.</alt-text>
</graphic>
</fig>
<p>We evaluated two key metrics for usage frequencies: a) The total usage means per participant where a modality, when utilized, is counted once; b) The relative count which aggregates across all participants, measuring how frequently each modality was used overall. The results are displayed in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>This Figure shows two perspectives on modality usage. It displays how often each feedback modality was used across all interactions of all participants and how many participants used each modality at least once. This allows for comparing overall usage frequency with how widely each modality was adopted, highlighting whether some modalities were broadly preferred or mainly used by a few participants.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g005.tif">
<alt-text content-type="machine-generated">Bar chart titled &#x22;Modality Usage&#x22; showing percentages of participants/interactions using various modalities. Dark blue bars represent usage at least once: Speed (93.75%), Correction (93.75%), Fallback saved (87.50%), Exploration (81.25%), Guidance (56.25%), Fallback load (50.00%), Demonstration (25.00%). Light blue bars indicate total modality usage percentages across all modalities.</alt-text>
</graphic>
</fig>
<p>The <italic>Demonstration</italic> modality saw minimal use, with just <inline-formula id="inf78">
<mml:math id="m88">
<mml:mrow>
<mml:mn>0.67</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of all interactions and only <inline-formula id="inf79">
<mml:math id="m89">
<mml:mrow>
<mml:mn>25</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of participants using it. However, most of the participants initiated this interaction but stopped before performing a kinesthetic teaching demonstration. Only <inline-formula id="inf80">
<mml:math id="m90">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> participants completed and submitted a demonstration. The <italic>Fallback</italic> modality consists of two parts: saving a fallback point and loading such a point. In contrast to <italic>Demonstration</italic>, <italic>Fallback load</italic> was used by a higher percentage of participants <inline-formula id="inf81">
<mml:math id="m91">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, despite accounting for only <inline-formula id="inf82">
<mml:math id="m92">
<mml:mrow>
<mml:mn>3.53</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of all interactions. On the other hand, <italic>Fallback saved</italic> was not only used by a majority <inline-formula id="inf83">
<mml:math id="m93">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>87.5</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of participants but also represents a higher portion of interactions at <inline-formula id="inf84">
<mml:math id="m94">
<mml:mrow>
<mml:mn>16.97</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <italic>Speed</italic> stood out with the highest engagement, utilized by <inline-formula id="inf85">
<mml:math id="m95">
<mml:mrow>
<mml:mn>93.75</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of the participants and making up <inline-formula id="inf86">
<mml:math id="m96">
<mml:mrow>
<mml:mn>12.44</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of all interactions. <italic>Exploration</italic> was also popular, with <inline-formula id="inf87">
<mml:math id="m97">
<mml:mrow>
<mml:mn>81.25</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of the participants engaging with it and a <inline-formula id="inf88">
<mml:math id="m98">
<mml:mrow>
<mml:mn>26.05</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> share of total modality use. Similarly, <italic>Correction</italic> was used by the same percentage of participants as <italic>Speed</italic> <inline-formula id="inf89">
<mml:math id="m99">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>93.75</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, but accounted for a higher share of all interactions, at <inline-formula id="inf90">
<mml:math id="m100">
<mml:mrow>
<mml:mn>22.52</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Meanwhile, <italic>Guidance</italic> was used by <inline-formula id="inf91">
<mml:math id="m101">
<mml:mrow>
<mml:mn>56.25</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of participants and in <inline-formula id="inf92">
<mml:math id="m102">
<mml:mrow>
<mml:mn>17.82</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of all interactions. Overall, the <italic>Speed</italic> and <italic>Correction</italic> modalities lead in participant usage, with <italic>Correction</italic> also notable for interaction frequency alongside <italic>Exploration</italic>, whereas <italic>Demonstration</italic> lags behind with the lowest interaction rate and participant engagement.</p>
<p>To answer the third hypothesis H3 regarding the perceived value of the modalities, we also captured the participants&#x2019; explanations for the rankings analyzed above. These are depicted in <xref ref-type="fig" rid="F6">Figure 6</xref>, categorized into positive justifications for higher placements and critical reasons for lower rankings.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Participants&#x2019; reasons for preferring or rejecting supplementary feedback modalities, categorized by modality type and rationale. Positive reasons are shown in green, negative reasons in red. The height of each bar reflects the number of participants who mentioned a given reason, providing a quick overview of participants&#x2019; perceptions.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g006.tif">
<alt-text content-type="machine-generated">Bar charts comparing reasons for preference and rejection of the modalities used in the study. The first chart shows &#x201c;Goal Efficiency&#x201d; and &#x201c;Understandable&#x201d; as top reasons for acceptance, with &#x201c;Guidance&#x201d; and &#x201c;Speed&#x201d; as the most favored modalities. The second chart highlights &#x201c;Not-Useful&#x201d; and &#x201c;Not-Understandable&#x201d; as top rejection reasons, led by &#x201c;Exploration&#x201d; and &#x201c;Guidance&#x201d;.</alt-text>
</graphic>
</fig>
<p>Regarding the perceived positive influence on learning, we considered the reasons &#x201c;Goal Efficiency&#x201d;, &#x201c;Fast Elimination&#x201d;, &#x201c;Usefulness&#x201d; and &#x201c;Fast Approximation&#x201d; since these rationales are most closely related to the research question. <italic>Guidance</italic> was valued for its &#x201c;Goal Efficiency&#x201d; and received the most goal-related reasons. <italic>Correction</italic> was appreciated for its &#x201c;Fast Elimination&#x201d; and its &#x201c;Usefulness&#x201d;. <italic>Fallback</italic> was also rated for its &#x201c;Usefulness&#x201d;. <italic>Exploration</italic> was appreciated for its &#x201c;Fast Approximation&#x201d;. Regarding the perceived negative influence on learning, the only reason directly related to learning success was &#x201c;Not-Useful&#x201d;, which was most frequently noted for the low rankings of the <italic>Exploration</italic> modality. For all other reasons that are not related to learning, we refer the reader to <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<p>The last hypothesis H4 is concerned with whether some modalities have objectively more influence on robot learning. Spearman&#x2019;s rank correlation was used to analyze the correlations between hits per try and modality usage per try. The analysis of <italic>Exploration</italic> and hit success revealed a statistically significant, albeit weak, negative correlation <inline-formula id="inf93">
<mml:math id="m103">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.21</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.025</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Thus, with a decrease of the exploration rate, the likelihood of hitting the target marginally increases. The analysis of <italic>Speed</italic> revealed a modest positive correlation <inline-formula id="inf94">
<mml:math id="m104">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.2389221</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.03898</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. This suggests that a higher <italic>Speed</italic> value correlates with a slightly better chance of hitting the target. In contrast, the correlation with the load <inline-formula id="inf95">
<mml:math id="m105">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and the save <inline-formula id="inf96">
<mml:math id="m106">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.007</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of a fallback was not only very weak but also lacked statistical significance with <inline-formula id="inf97">
<mml:math id="m107">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.7</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf98">
<mml:math id="m108">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.8</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Guidance also showed a weak negative correlation with hit success, with <inline-formula id="inf99">
<mml:math id="m109">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.03</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, which was also not statistically significant <inline-formula id="inf100">
<mml:math id="m110">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, suggesting that <italic>Guidance</italic> does not significantly influence hit success. Between <italic>Correction</italic> and hit success, a statistically significant negative correlation (<inline-formula id="inf101">
<mml:math id="m111">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.07</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf102">
<mml:math id="m112">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) was found. However, the magnitude of the correlation is below 0.1, and it therefore cannot even be considered weak. Overall, the analyses show that while an increase in <italic>Speed</italic> and a decrease in <italic>Exploration</italic> positively influence hit success, <italic>Fallback</italic>, <italic>Guidance</italic>, and <italic>Correction</italic> show no significant or notable correlations with hit success. In the following section, we discuss these results with respect to our research question.</p>
</sec>
<sec sec-type="discussion" id="s5">
<title>5 Discussion</title>
<p>In the discussion, we will review and address each hypothesis.<list list-type="simple">
<list-item>
<p>H1: Enhanced Learning through Multiple Modalities. Our results affirm that simultaneously providing multiple feedback modalities significantly enhances the learning process, as evidenced by the higher performance of group 2 &#x201c;multimodal&#x201d; with respect to total success. This group was significantly more successful with respect to the number of hits and showed a clear trend in succeeding faster for the first time. Both confirm that multiple modalities not only enhance initial learning success, but also increase further learning progress. This might be connected to the fact that users were able to actually apply their scaffolding teaching strategies.</p>
</list-item>
<list-item>
<p>H2: User Preferences and Modality Utilization. Generally, the users preferred the system with the combined modalities. This is evident by the higher overall satisfaction observed in group 2 &#x201c;multimodal&#x201d;. Combined with the positive SUS score results for the modalities, this suggests that the design of the user interface is not hindering users. In addition, the results of the SUS questionnaires indicate that the learnability of all modalities was rated between moderate and good. Although the SUS questionnaire is not a classical measure for assessing interface complexity, it nevertheless provides useful insight into users&#x2019; perceived complexity particularly through items 2 (&#x201c;<italic>I found the system unnecessarily complex.</italic>&#x201d;), 3 (&#x201c;<italic>I thought the system was easy to use.</italic>&#x201d;) and 4 (&#x201c;<italic>I think that I would need the support of a technical person to be able to use this system.</italic>&#x201d;). These items indirectly capture how intuitive and easy to learn the interface and its modal interactions appear to users. In this study, the reported means (<inline-formula id="inf103">
<mml:math id="m113">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf104">
<mml:math id="m114">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mn>3</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf105">
<mml:math id="m115">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mn>4</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) and medians (<inline-formula id="inf106">
<mml:math id="m116">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf107">
<mml:math id="m117">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>3</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf108">
<mml:math id="m118">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>4</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) on these items suggest that users generally did not perceive the modalities or their interface representations as too complex or difficult to use. However, this only captures the complexity of each individual modality; in the future, measures such as NASA-TLX should be included to assess the cognitive load of combining multiple modalities. Furthermore, our results show that users do not engage with all modalities equally. The modalities favored by the participants were reflected in their frequency of use, ranking, reasons provided for their choices, and SUS scores for each supplementary modality. Notably, <italic>Guidance</italic> and <italic>Speed</italic> stand out for their high use and positive reception, attributed to their perceived impact on the learning process and intuitiveness. Both also stand out in the pairwise comparison regarding the SUS score compared to <italic>Exploration</italic>. In contrast, <italic>Exploration</italic> and <italic>Demonstration</italic> were less favored. The former was criticized for its ambiguous effects. For the latter, the lower preference is evident in the lack of engagement with the <italic>Demonstration</italic> modality. Despite the explicit introduction of the <italic>Demonstration</italic> modality in the introduction of the study, the use of hover-over tool tips, the prominent placement of the demonstration button (cf. <xref ref-type="fig" rid="F2">Figure 2</xref>), and a simple demo interface that resembles a recorder (see <xref ref-type="fig" rid="F7">Figure 7</xref>) including an additional explanation of the process whenever participants started the demonstration, the participants&#x2019; engagement with the modality remained limited.</p>
</list-item>
</list>
</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Original (German) and translated (English) versions of the demonstration interface shown side by side. It overlays the regular interface when the <italic>Demonstration</italic> button is clicked.</p>
</caption>
<graphic xlink:href="frobt-12-1598968-g007.tif">
<alt-text content-type="machine-generated">Instructions in German and English guide users to click &#x22;Start&#x22; to begin a demonstration, &#x22;Stop&#x22; to finish, and to restart if needed. Buttons labeled &#x22;Start&#x22;, &#x22;Submit&#x22;, and &#x22;Close&#x22; are shown.</alt-text>
</graphic>
</fig>
<p>The sparse interaction might be caused by the perceived effort or lack of necessity also found in the literature regarding the usage of demonstration in HIL-RL <xref ref-type="bibr" rid="B13">Casper et al. (2023)</xref>; <xref ref-type="bibr" rid="B12">Brown et al. (2019)</xref>. It might also be the case that participants could not assess the value of the demonstration. The exact function of the algorithm was not explained in the context of the study and the necessary expertise was not necessarily present. In addition, the participants might have found the robot intimidating. Combined with the increased physical effort required and the perceived comparable value of other modalities, their preference for familiar functions like rating with hearts or icons for preferences, as well as modalities with predictable outcomes, such as the <italic>Speed</italic> modality, may have outweighed the demonstration&#x2019;s benefits. Moreover, the presented setup of multiple simultaneously available modalities has rarely been studied, and understanding the behavior and preferences of participants in this context is highly relevant. Thus, the neglect of well-established modalities should be explored further in future research. An additional noteworthy observation concerns the use of the <italic>Fallback</italic> modality. Fallback saving may have been used more frequently than loading because participants might have aimed to preserve their progress and may have seen little need to return to a previously saved state. Saving a fallback point might have been associated with a predictable and positive outcome: it marked a successful movement and entailed no risk to the current learning state. In contrast, loading a fallback point could potentially disrupt the ongoing learning trajectory, introducing uncertainty and requiring stronger commitment from the user. Participants may have hesitated to risk overwriting progress by returning to a prior state.<list list-type="simple">
<list-item>
<p>H3: Different Perception of Modalities. Our findings support the hypothesis that users perceive certain modalities as more beneficial than others in improving the learning process when choosing. <italic>Guidance</italic>, with its high engagement and favorable ranking, was considered particularly effective, highlighting its role in achieving learning goals efficiently. Although <italic>Correction</italic> and <italic>Fallback</italic> were recognized for their utility, their engagement levels and mixed rankings suggest a nuanced perception of their effectiveness in learning enhancement. Both <italic>Correction</italic> and <italic>Fallback</italic> may have ended up in the middle range of preference because they are primarily used in response to errors or setbacks, that is, in situations where the learning or teaching process is not going as intended. However, the overall learning process may have progressed smoothly for many participants, reducing the perceived need to engage with these corrective strategies. <italic>Guidance</italic>, on the other hand, may have gained popularity precisely because it represents a form of positive feedback, reinforcing successful actions, and supporting learners proactively rather than reactively. <italic>Exploration</italic> was frequently used and valued for fast approximation; however, it was also often labeled as &#x201c;Not-Useful&#x201d; and received the lowest ratings, making it one of the underutilized modalities. The perception of Exploration may have suffered not only from the lack of immediate transparency but also from the prerequisite of a certain level of underlying understanding required to use it effectively. <italic>Demonstrations</italic> were used very rarely, indicating a low preference among participants. 
The low engagement with the <italic>Demonstration</italic> modality may be explained by factors outlined in H2, such as the perceived physical effort involved, a lack of understanding of its added value and the availability of effective alternatives. Hence, when given a choice, these modalities are likely to be dismissed in favor of others.</p>
</list-item>
<list-item>
<p>H4: Enhanced Learning through Individual Modalities. Our results have shown that <italic>Exploration</italic> and <italic>Speed</italic> have a positive impact on hit success. Here, <italic>Exploration</italic> needs to be reduced upon successful hits to maintain performance, and <italic>Speed</italic> is required to be increased to ensure the ball&#x2019;s reach. Although <italic>Exploration</italic> was seen as the least useful and preferred modality, the results indicate that when used effectively (decreasing on success and increasing in the beginning), it positively influences learning success. Movement attributes such as <italic>Speed</italic> are perceived positively, frequently interacted with, understandable, and directly influence learning success. Thus, they appear to represent a natural teaching method for participants in this task context, even though they are underestimated in the field <xref ref-type="bibr" rid="B45">Tien et al. (2024)</xref>. In contrast to modalities such as <italic>Speed</italic>, which were frequently used and positively correlated with learning success, and others like <italic>Exploration</italic> or <italic>Demonstration</italic>, which showed or are known for a potential positive effect but were underutilized, there was also <italic>Guidance</italic> that was highly favored by participants but did not show statistically measurable benefit for learning outcomes. The high subjective rating of <italic>Guidance</italic> may stem from its association with already successful movements, leading to a positive memory bias, as well as from the strong sense of control it offers users during interaction. However, its effects are delayed and may not manifest within the same trial, making them difficult to capture with a binary success metric. Additionally, inconsistent usage across participants and interaction with other modalities may have further diluted its measurable impact. 
A similar pattern can be observed with <italic>Correction</italic>. This may be partially due to the lack of contextual constraints in the system, which allowed users to provide semantically inconsistent feedback, for example, labeling actions as corrections even when they brought the robot closer to the intended goal, or marking actions as guidance based on higher-level intent rather than concrete trajectory advantages, which may not have been beneficial for the learning process. In this study, we intentionally did not restrict users from such feedback behavior to explore whether users would engage in counterproductive labeling in a free-form multi-modal environment. The findings of the positive perception and low effect suggest that future systems should consider offering greater transparency or explanation regarding how each modality influences the learning process. Providing lightweight scaffolding or contextual cues could help users better understand the consequences of their feedback without compromising their autonomy or limiting natural teaching strategies.</p>
</list-item>
</list>
</p>
<p>The diversity of strategies observed among participants resulted in highly varied patterns of modality usage, making it difficult to identify consistent correlations between specific combinations of feedback modalities and learning success. However, this lack of clear patterns does not necessarily imply an absence of underlying structure. Prior work by <xref ref-type="bibr" rid="B47">Vollmer and Hemion (2018)</xref> has shown that even when users are restricted to a single modality, they employ a wide range of teaching strategies, highlighting the inherently individualized nature of human teaching behavior. Furthermore, studies such as <xref ref-type="bibr" rid="B8">B&#x131;y&#x131;k et al. (2022)</xref> and <xref ref-type="bibr" rid="B23">Jeon et al. (2020)</xref> suggest that the effectiveness of human feedback is influenced by its timing, frequency, and contextual fit. These findings underscore the need for adaptive systems that go beyond fixed modalities. Our findings, in combination with this prior work, suggest that the mixed and inconsistent use of different modalities may reflect the fact that offering a wide range of feedback options accommodates diverse teaching strategies. From this perspective, the absence of clear usage patterns is not necessarily problematic; in fact, it may indicate that users are flexibly selecting modalities according to their individual preferences. In contrast, the presence of strong patterns could point to limitations in the accessibility or usability of certain modalities. However, it is also possible that clearer correlations between modality use and learning success would have emerged if modalities had been offered more flexibly and adaptively, in line with user needs. This points to an important direction for future work: exploring dynamically personalized modality provision to better support different instructional approaches.</p>
<sec id="s5-1">
<title>5.1 Limitations</title>
<p>Despite the promising results of our study, several limitations must be acknowledged that may affect the generalizability and interpretation of our findings.</p>
<p>First, although we evaluated the usability of each modality using the System Usability Scale (SUS), this instrument was not originally designed to assess cognitive load. As such, it provides only an indirect approximation of the cognitive load induced by the complexity of the interface. Future work should incorporate dedicated measures such as NASA-TLX to gain a more precise understanding of the cognitive load caused by using multiple modalities simultaneously.</p>
<p>Second, the <italic>Demonstration</italic> modality, despite being introduced and available, was used very rarely. While this finding aligns with the prior literature as described above, it limits the interpretability of our results for this modality. Low usage may result from perceived physical effort, intimidation by the robot, or a lack of understanding of its benefit. For HIL-RL, this highlights the need for targeted explanation and transparency mechanisms that clarify the added value of each modality. Those scaffoldings should be investigated particularly for modalities with high investments and implicit effects like <italic>Demonstration</italic> and <italic>Exploration</italic> in order to offset effort and hesitation, and to support more informed and balanced modality usage.</p>
<p>Third, our study was conducted in a simple learning scenario, since this setup allowed a controlled comparison of user strategies and modality preferences. This limits the generalizability of our findings to more complex task domains. It remains unclear whether the observed preferences and usage patterns would hold under these more complex conditions. Moreover, user preferences may shift. This limitation also extends to the learning algorithm itself: it remains open whether it would perform effectively in more complex task domains. Additionally, it is an open question whether the learned policy would generalize well to different contexts beyond the original training setting. Future work should therefore investigate how the combined use of feedback modalities performs in complex environments and whether additional scaffolding, constraint mechanisms, or adaptive interfaces are needed to maintain effectiveness and usability.</p>
<p>Fourth, the diversity of teaching strategies observed among the participants made it difficult to identify consistent correlations between the use of individual modalities and the learning outcomes. This highlights the need for adaptive and personalized modality presentation, which can dynamically respond to user behavior to support more consistent learning progress.</p>
<p>Taken together, these limitations do not undermine the core findings but rather highlight important directions for refining experimental design, user interfaces, and system adaptivity in future research on multi-modal human-in-the-loop robot learning.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>Overall, we conducted a study to explore the combined use of multiple feedback modalities in HIL-RL to improve robot learning and user satisfaction. The novelty lies in examining how lay users interact with various simultaneously offered feedback modalities, identifying their benefit and usage preferences. In summary, we found that offering a variety of feedback modalities does not hinder users, but rather enhances the learning process of the system. Additionally, our results show that even in a relatively simple task environment, providing multiple feedback modalities can positively influence the learning process. While the observed improvements are based on a specific learning framework and task setting, the underlying approaches (PI<sup>BB</sup>, ProMPs) are conceptually transferable. This study suggests that similar patterns in the use of modalities and teaching strategies could likewise emerge in more complex or less structured tasks, indicating a potential for broader applicability. Although this remains to be empirically validated, it highlights the relevance of considering simultaneous and diverse multimodal interaction early in the development of interactive learning systems, especially in domains where user guidance and adaptability play a critical role. Furthermore, we observed that users have preferences for certain modalities that do not always align with the measurable contributions of each modality to the learning process. We thus advocate for a broader variation and integration of multiple simultaneously provided modalities within the domain of interactive robot learning. In our selection of modalities, our results show that, for example, clarifying <italic>Exploration</italic> could result in even more significant learning improvements. Thus, our second call for further research is to focus particularly on transparency and explaining modalities. 
This especially seems to hold for modalities such as <italic>Exploration</italic>, which lack a directly observable effect of the kind that, for example, <italic>Speed</italic> provides. This allows users to utilize the modality that best meets their individual teaching and scaffolding strategies and preferences. Regarding the question of modality stacking, related frameworks such as DemPref by <xref ref-type="bibr" rid="B8">B&#x131;y&#x131;k et al. (2022)</xref> suggest that diminishing returns of modality usage are not necessarily caused by the number of modalities themselves, but rather by when and how extensively they are employed. Their findings show that demonstrations, for instance, offer substantial value early on but become less informative with increased usage, whereas preferences are more fine-grained and useful in later phases. <xref ref-type="bibr" rid="B23">Jeon et al. (2020)</xref> also do not define diminishing returns in terms of the number of modalities used. Instead, they suggest that the frequency and context of modality use are critical factors influencing the informativeness of human feedback. Consequently, rather than emphasizing a fixed set of modalities, the frameworks highlight the importance of flexibility and adaptivity, advocating for systems that dynamically adjust the frequency of the feedback modalities. This indicates that beyond parallel application, systems should offer greater flexibility and adaptivity by dynamically selecting modalities based on context and user needs. Moreover, future frameworks should provide explicit guidance and constraints regarding the frequency of modality use, with the goal of supporting users more effectively and preventing cognitive overload.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="s8">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Ethik-Kommission der Universit&#xe4;t Paderborn. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>HB: Software, Writing &#x2013; review and editing, Conceptualization, Writing &#x2013; original draft, Methodology, Formal Analysis, Visualization, Investigation, Resources, Project administration, Data curation. RB: Writing &#x2013; review and editing, Data curation, Formal Analysis. A-LV: Methodology, Supervision, Project administration, Conceptualization, Funding acquisition, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. The research was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation): TRR 318/1 2021 &#x2013; 438445824. We acknowledge the financial support of the Open Access Publication Fund of Bielefeld University for the article processing charge.</p>
</sec>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s12">
<title>Generative AI statement</title>
<p>The author(s) declare that Generative AI was used in the creation of this manuscript. ChatGPT or similar tools were only used for syntactic and grammatical text improvements. All suggestions made by AI tools were thoroughly checked by all authors.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s14">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2025.1598968/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2025.1598968/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material>
<label>SUPPLEMENTARY FILE 1</label>
<caption>
<p>Questionnaire group 1 (translated from German).</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FILE 2</label>
<caption>
<p>Questionnaire group 2 (translated from German without demonstration SUS questionnaire).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Supplementaryfile1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Supplementaryfile2.pdf" id="SM2" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web"> <person-group person-group-type="author">
<name>
<surname>Alexander Fabisch</surname>
<given-names>J. K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Movement primitives</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://github.com/dfki-ric/movement_primitives">https://github.com/dfki-ric/movement_primitives</ext-link>
</comment> (<comment>Accessed January 9, 2023</comment>) <comment>[abstract]</comment>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Angeli</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Valanides</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Developing young children&#x2019;s computational thinking with educational robotics: an interaction effect between gender and scaffolding strategy</article-title>. <source>Comput. Hum. Behav.</source> <volume>105</volume>, <fpage>105954</fpage>. <pub-id pub-id-type="doi">10.1016/j.chb.2019.03.018</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arakawa</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kobayashi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Unno</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tsuboi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Maeda</surname>
<given-names>S.-i.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>DQN-TAMER: human-in-the-loop reinforcement learning with intractable feedback</article-title>. <source>arXiv Prepr. arXiv:1810.11748</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1810.11748</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Argall</surname>
<given-names>B. D.</given-names>
</name>
<name>
<surname>Browning</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Veloso</surname>
<given-names>M. M.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Teacher feedback to scaffold and refine demonstrated motion primitives on a mobile robot</article-title>. <source>Robotics Aut. Syst.</source> <volume>59</volume>, <fpage>243</fpage>&#x2013;<lpage>255</lpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2010.11.004</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Arzate Cruz</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Igarashi</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>A survey on interactive reinforcement learning: design principles and open challenges</article-title>,&#x201d; in <source>Proceedings of the 2020 ACM designing interactive systems conference</source> (<publisher-name>ACM</publisher-name>), <fpage>1195</fpage>&#x2013;<lpage>1209</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bajcsy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Losey</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>O&#x2019;Malley</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A. D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Learning from physical human corrections, one feature at a time</article-title>,&#x201d; in <source>Proc. of 2018 ACM/IEEE int. conf. on human-robot interaction</source>, <fpage>141</fpage>&#x2013;<lpage>149</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bignold</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cruz</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Dazeley</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Vamplew</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Foale</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Human engagement providing evaluative and informative advice for interactive reinforcement learning</article-title>. <source>Neural Comput. Appl.</source> <volume>35</volume>, <fpage>18215</fpage>&#x2013;<lpage>18230</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-021-06850-6</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>B&#x131;y&#x131;k</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Losey</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Palan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Landolfi</surname>
<given-names>N. C.</given-names>
</name>
<name>
<surname>Shevchuk</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sadigh</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Learning reward functions from diverse sources of human feedback: optimally integrating demonstrations and preferences</article-title>. <source>Int. J. Robotics Res.</source> <volume>41</volume>, <fpage>45</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1177/02783649211041652</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Breazeal</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>1998</year>). <source>Learning by scaffolding. <italic>Thesis proposal</italic>
</source>, <fpage>1</fpage>&#x2013;<lpage>106</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Breazeal</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Berlin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brooks</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gray</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Using perspective taking to learn from ambiguous demonstrations</article-title>. <source>Robotics Aut. Syst.</source> <volume>54</volume>, <fpage>385</fpage>&#x2013;<lpage>393</lpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2006.02.004</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brooke</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1995</year>). <article-title>SUS: a quick and dirty usability scale</article-title>. <source>
<italic>Usability Eval. Ind.</italic> 189</source>. <pub-id pub-id-type="doi">10.1201/9781498710411-35</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Brown</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goo</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Nagarajan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Extrapolating beyond suboptimal demonstrations via inverse reinforcement learning from observations</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>), <fpage>783</fpage>&#x2013;<lpage>792</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Casper</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Davies</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gilbert</surname>
<given-names>T. K.</given-names>
</name>
<name>
<surname>Scheurer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rando</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Open problems and fundamental limitations of reinforcement learning from human feedback</article-title>. <source>arXiv Prepr. arXiv:2307.15217</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2307.15217</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Celemin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kober</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Knowledge- and ambiguity-aware robot learning from corrective and evaluative feedback</article-title>. <source>Neural Comput. Appl.</source> <volume>35</volume>, <fpage>16821</fpage>&#x2013;<lpage>16839</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-022-08118-z</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Celemin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>P&#xe9;rez-Dattari</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chisari</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Franzese</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>de Souza Rosa</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Prakash</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Interactive imitation learning in robotics: a survey</article-title>. <source>Found. Trends Robotics</source> <volume>10</volume>, <fpage>1</fpage>&#x2013;<lpage>197</lpage>. <pub-id pub-id-type="doi">10.1561/2300000072</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Leu</surname>
<given-names>M. C.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Real-time multi-modal human&#x2013;robot collaboration using gestures and speech</article-title>. <source>J. Manuf. Sci. Eng.</source> <volume>144</volume>, <fpage>101007</fpage>. <pub-id pub-id-type="doi">10.1115/1.4054297</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chernova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Veloso</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Interactive policy learning through confidence-based autonomy</article-title>. <source>J. Artif. Intell. Res.</source> <volume>34</volume>, <fpage>1</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1613/jair.2584</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>A. Z.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Learning a universal human prior for dexterous manipulation from human preference</article-title>. <source>arXiv Prepr. arXiv:2304.04602</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2304.04602</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ecoffet</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Huizinga</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lehman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Stanley</surname>
<given-names>K. O.</given-names>
</name>
<name>
<surname>Clune</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>First return, then explore</article-title>. <source>Nature</source> <volume>590</volume>, <fpage>580</fpage>&#x2013;<lpage>586</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-03157-9</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hindemith</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bruns</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Noller</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Hemion</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vollmer</surname>
<given-names>A.-L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Interactive robot task learning: human teaching proficiency with different feedback approaches</article-title>. <source>IEEE Trans. Cogn. Dev. Syst.</source> <pub-id pub-id-type="doi">10.1109/TCDS.2022.3186270</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="web">
<collab>Kinova Inc.</collab> (<year>2023</year>). <article-title>Kinova assistive</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://assistive.kinovarobotics.com/product/jaco-robotic-arm">https://assistive.kinovarobotics.com/product/jaco-robotic-arm</ext-link>
</comment> (<comment>Accessed February 06, 2023</comment>) </citation>
</ref>
<ref id="B22">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Isbell</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shelton</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Kearns</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>A social reinforcement learning agent</article-title>,&#x201d; in <source>Proceedings of the fifth international conference on Autonomous agents</source>, <fpage>377</fpage>&#x2013;<lpage>384</lpage>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jeon</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Milli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Reward-rational (implicit) choice: a unifying formalism for reward learning</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>4415</fpage>&#x2013;<lpage>4426</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jumaat</surname>
<given-names>N. F.</given-names>
</name>
<name>
<surname>Tasir</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Instructional scaffolding in online learning environment: a meta-analysis</article-title>,&#x201d; in <source>2014 international conference on teaching and learning in computing and engineering</source> (<publisher-name>IEEE</publisher-name>), <fpage>74</fpage>&#x2013;<lpage>77</lpage>.</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kaufmann</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bengs</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>H&#xfc;llermeier</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A survey of reinforcement learning from human feedback</article-title>. <source>arXiv Prepr. arXiv:2312.14925</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2312.14925</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Knox</surname>
<given-names>W. B.</given-names>
</name>
<name>
<surname>Allievi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Banzhaf</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Schmitt</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Reward (mis) design for autonomous driving</article-title>. <source>Artif. Intell.</source> <volume>316</volume>, <fpage>103829</fpage>. <pub-id pub-id-type="doi">10.1016/j.artint.2022.103829</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Knox</surname>
<given-names>W. B.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>TAMER: training an agent manually via evaluative reinforcement</article-title>,&#x201d; in <source>2008 7th IEEE international conference on development and learning</source> (<publisher-name>IEEE</publisher-name>), <fpage>292</fpage>&#x2013;<lpage>297</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gomez</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Nakamura</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Human-centered reinforcement learning: a survey</article-title>. <source>IEEE Trans. Hum.-Mach. Syst.</source> <volume>49</volume>, <fpage>337</fpage>&#x2013;<lpage>349</lpage>. <pub-id pub-id-type="doi">10.1109/thms.2019.2912447</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Canberk</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Losey</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Sadigh</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Learning human objectives from sequences of physical corrections</article-title>,&#x201d; in <source>2021 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>2877</fpage>&#x2013;<lpage>2883</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Swaminathan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kolobov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>C.-A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Interactive robot learning from verbal correction</article-title>. <source>arXiv Prepr. arXiv:2310.17555</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2310.17555</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Meri&#xe7;li</surname>
<given-names>&#xc7;.</given-names>
</name>
<name>
<surname>Veloso</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ak&#x131;n</surname>
<given-names>H. L.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Complementary humanoid behavior shaping using corrective demonstration</article-title>,&#x201d; in <source>
<italic>2010 10th IEEE-RAS international Conference on humanoid robots</italic> (IEEE)</source>, <fpage>334</fpage>&#x2013;<lpage>339</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meri&#xe7;li</surname>
<given-names>&#xc7;.</given-names>
</name>
<name>
<surname>Veloso</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ak&#x131;n</surname>
<given-names>H. L.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Task refinement for autonomous robots using complementary corrective human feedback</article-title>. <source>Int. J. Adv. Robotic Syst.</source> <volume>8</volume>, <fpage>16</fpage>. <pub-id pub-id-type="doi">10.5772/10575</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paraschos</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Daniel</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Probabilistic movement primitives</article-title>. <source>
<italic>Adv. neural Inf. Process. Syst.</italic> 26</source>.</citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Raffin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kober</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Stulp</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Smooth exploration for robotic reinforcement learning</article-title>,&#x201d; in <source>
<italic>Conference on robot learning</italic> (PMLR)</source>, <fpage>1634</fpage>&#x2013;<lpage>1644</lpage>.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ravichandar</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Polydoros</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Chernova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Billard</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Recent advances in robot learning from demonstration</article-title>. <source>Annu. Rev. Control Rob. Auton. Syst.</source> <volume>3</volume>, <fpage>297</fpage>&#x2013;<lpage>330</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-control-100819-063206</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="web">
<person-group person-group-type="author">
<collab>Open Robotics</collab>
</person-group> (<year>2020</year>). <article-title>ROS Noetic Ninjemys</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://wiki.ros.org/noetic">http://wiki.ros.org/noetic</ext-link>
</comment> (<comment>Accessed February 21, 2023</comment>).</citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Saunders</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Nehaniv</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Dautenhahn</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>Teaching robots by moulding behavior and scaffolding the environment</article-title>,&#x201d; in <source>Proceedings of the 1st ACM SIGCHI/SIGART conference on Human-robot interaction</source>, <fpage>118</fpage>&#x2013;<lpage>125</lpage>.</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stone</surname>
<given-names>C. A.</given-names>
</name>
</person-group> (<year>1998a</year>). <article-title>The metaphor of scaffolding: its utility for the field of learning disabilities</article-title>. <source>J. Learn. Disabil.</source> <volume>31</volume>, <fpage>344</fpage>&#x2013;<lpage>364</lpage>. <pub-id pub-id-type="doi">10.1177/002221949803100404</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stone</surname>
<given-names>C. A.</given-names>
</name>
</person-group> (<year>1998b</year>). <article-title>Should we salvage the scaffolding metaphor?</article-title> <source>J. Learn. Disabil.</source> <volume>31</volume>, <fpage>409</fpage>&#x2013;<lpage>413</lpage>. <pub-id pub-id-type="doi">10.1177/002221949803100411</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stulp</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Sigaud</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Policy improvement methods: between black-box optimization and episodic reinforcement learning</article-title>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Su</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Sandoval</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Laribi</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Recent advancements in multimodal human&#x2013;robot interaction</article-title>. <source>Front. Neurorob.</source> <volume>17</volume>, <fpage>1084000</fpage>. <pub-id pub-id-type="doi">10.3389/fnbot.2023.1084000</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Breazeal</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>Reinforcement learning with human teachers: evidence of feedback and guidance with implications for learning performance</article-title>,&#x201d; in <source>Proceedings of the 21st national conference on artificial intelligence</source> (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>AAAI Press</publisher-name>), <volume>6</volume>, <fpage>1000</fpage>&#x2013;<lpage>1005</lpage>.</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Breazeal</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Robot learning via socially guided exploration</article-title>. <source>Dev. Learn.</source>, <fpage>82</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1109/devlrn.2007.4354078</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Cakmak</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Learning about objects with human teachers</article-title>,&#x201d; in <source>Proceedings of the 4th ACM/IEEE international conference on Human robot interaction</source>, <fpage>15</fpage>&#x2013;<lpage>22</lpage>.</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tien</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>B&#x131;y&#x131;k</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Optimizing robot behavior via comparative language feedback</article-title>.
</citation>
</ref>
<ref id="B46">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Torne</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Balsells</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Desai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Breadcrumbs to the goal: goal-conditioned exploration from human-in-the-loop feedback</article-title>,&#x201d; in <source>Proceedings of the 37th international conference on neural information processing systems</source>, <fpage>63222</fpage>&#x2013;<lpage>63258</lpage>.</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vollmer</surname>
<given-names>A.-L.</given-names>
</name>
<name>
<surname>Hemion</surname>
<given-names>N. J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A user study on robot skill learning without a cost function: optimization of dynamic movement primitives via naive user feedback</article-title>. <source>Front. Robot. AI</source> <volume>5</volume>, <fpage>77</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2018.00077</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vollmer</surname>
<given-names>A.-L.</given-names>
</name>
<name>
<surname>Wrede</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Rohlfing</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Oudeyer</surname>
<given-names>P.-Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Pragmatic frames for teaching and learning in human&#x2013;robot interaction: review and challenges</article-title>. <source>Front. Neurorob.</source> <volume>10</volume>, <fpage>10</fpage>. <pub-id pub-id-type="doi">10.3389/fnbot.2016.00010</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wood</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bruner</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Ross</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>1976</year>). <article-title>The role of tutoring in problem solving</article-title>. <source>J. Child Psychol. Psychiatry</source> <volume>17</volume>, <fpage>89</fpage>&#x2013;<lpage>100</lpage>. <pub-id pub-id-type="doi">10.1111/j.1469-7610.1976.tb00381.x</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tapus</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Interactive robot learning for multimodal emotion recognition</article-title>,&#x201d; in <source>Social robotics: 11th international conference, ICSR 2019, Madrid, Spain, November 26&#x2013;29, 2019, proceedings 11</source> (<publisher-name>Springer</publisher-name>), <fpage>633</fpage>&#x2013;<lpage>642</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>