<?xml version="1.0" encoding="us-ascii"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink"> <front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1340334</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2024.1340334</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Robotics and AI</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Semantic learning from keyframe demonstration using object attribute constraints</article-title>
<alt-title alt-title-type="left-running-head">Sen et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2024.1340334">10.3389/frobt.2024.1340334</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Sen</surname>
<given-names>Busra</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2377964/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Elfring</surname>
<given-names>Jos</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/1668831/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Torta</surname>
<given-names>Elena</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/886373/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>van de Molengraft</surname>
<given-names>Ren&#xe9;</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff>
<institution>Department of Mechanical Engineering, Eindhoven University of Technology</institution>, <addr-line>Eindhoven</addr-line>, <country>Netherlands</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2141059/overview">Emanuel Sousa</ext-link>, Centro de Computa&#xe7;&#xe3;o Gr&#xe1;fica, Portugal</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1327780/overview">Shrey Pareek</ext-link>, Target, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1820475/overview">Vladimir Popov</ext-link>, Ural Federal University, Russia</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Busra Sen, <email>b.sen@tue.nl</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>18</day>
<month>07</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1340334</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>11</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>06</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Sen, Elfring, Torta and van de Molengraft.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Sen, Elfring, Torta and van de Molengraft</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Learning from demonstration is an approach that allows users to personalize a robot&#x2019;s tasks. While demonstrations often focus on conveying the robot&#x2019;s motion or task plans, they can also communicate user intentions through object attributes in manipulation tasks. For instance, users might want to teach a robot to sort fruits and vegetables into separate boxes or to place cups next to plates of matching colors. This paper introduces a novel method that enables robots to learn the semantics of user demonstrations, with a particular emphasis on the relationships between object attributes. In our approach, users demonstrate essential task steps by manually guiding the robot through the necessary sequence of poses. We reduce the amount of data by utilizing only robot poses instead of trajectories, allowing us to focus on the task&#x2019;s goals, specifically the objects related to these goals. At each step, known as a keyframe, we record the end-effector pose, object poses, and object attributes. However, the number of keyframes saved in each demonstration can vary due to the user&#x2019;s decisions. This variability in each demonstration can lead to inconsistencies in the significance of keyframes, complicating keyframe alignment to generalize the robot&#x2019;s motion and the user&#x2019;s intention. Our method addresses this issue by focusing on teaching the higher-level goals of the task using only the required keyframes and relevant objects. It aims to teach the rationale behind object selection for a task and generalize this reasoning to environments with previously unseen objects. We validate our proposed method by conducting three manipulation tasks aiming at different object attribute constraints. In the reproduction phase, we demonstrate that even when the robot encounters previously unseen objects, it can generalize the user&#x2019;s intention and execute the task.</p>
</abstract>
<kwd-group>
<kwd>learning from demonstration</kwd>
<kwd>keyframe demonstrations</kwd>
<kwd>object attributes</kwd>
<kwd>task goal learning</kwd>
<kwd>semantic learning</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Robot Learning and Evolution</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>As robots become increasingly integrated into diverse environments, from factories to homes, the need for more intuitive and adaptable programming methods becomes paramount. Traditional robot programming methods fall into two main categories: online and offline programming (<xref ref-type="bibr" rid="B45">Pan et al., 2012</xref>). Online programming includes methods like lead-through, where the user records the robot&#x2019;s path via a teach-pendant, and walk-through, where the user physically guides the robot through the desired motions (<xref ref-type="bibr" rid="B45">Pan et al., 2012</xref>; <xref ref-type="bibr" rid="B56">Villani et al., 2018</xref>). This approach is user-friendly as it doesn&#x2019;t require programming skills. However, its lack of adaptability necessitates the re-recording of the entire sequence even for minor changes in the environment (<xref ref-type="bibr" rid="B56">Villani et al., 2018</xref>). Offline programming, on the other hand, involves defining each robot&#x2019;s movement for a specific task (<xref ref-type="bibr" rid="B45">Pan et al., 2012</xref>). This approach is often limited to highly controlled environments and requires expertise in robotics.</p>
<p>Inspired by people&#x2019;s ability to learn by imitating others (<xref ref-type="bibr" rid="B13">Calinon, 2018</xref>), Learning from Demonstration (LfD) offers a promising solution to these limitations. Unlike offline robot programming, LfD does not require explicit programming of a robot&#x2019;s task. Instead, it allows users to teach robots their skills through demonstrations. LfD does not merely repeat pre-recorded robot motions like online robot programming (<xref ref-type="bibr" rid="B52">Ravichandar et al., 2020</xref>). In each demonstration, LfD approaches can extract the task constraints implicitly and generalize these learned skills to different environments (<xref ref-type="bibr" rid="B30">Hussein et al., 2018</xref>; <xref ref-type="bibr" rid="B52">Ravichandar et al., 2020</xref>). Moreover, it can handle a wider range of tasks compared to traditional offline robot programming. This is evidenced by its successful applications in various fields such as healthcare tasks (<xref ref-type="bibr" rid="B22">Fong et al., 2019</xref>; <xref ref-type="bibr" rid="B46">Pareek and Kesavadas, 2020</xref>), household tasks (<xref ref-type="bibr" rid="B57">Ye and Alterovitz, 2017</xref>), and industrial tasks (<xref ref-type="bibr" rid="B51">Ramirez-Amaro et al., 2019</xref>).</p>
<p>The process of LfD begins with a demonstration phase, where the user imparts relevant information about the robot&#x2019;s state and the environment through demonstrations. These demonstrations can be performed through kinesthetic teaching, teleoperation, or passive observation (<xref ref-type="bibr" rid="B52">Ravichandar et al., 2020</xref>). Following this, the robot learns to encode the recorded data to identify commonalities among the demonstrations. The encoding method typically depends on the learning paradigm, which could be at a low level (e.g., robot trajectories) or a high level (e.g., action order, semantic rules) (<xref ref-type="bibr" rid="B8">Billard et al., 2008</xref>).</p>
<p>At low-level LfD, the demonstrated trajectories, which are continuous sequences of waypoints with timestamps, can be parameterized in either the joint and/or task space (<xref ref-type="bibr" rid="B16">Chernova and Thomaz, 2014</xref>). To generalize these trajectories to different initial robot states or different object positions, the user needs to provide multiple demonstrations, which can increase their workload (<xref ref-type="bibr" rid="B4">Akgun et al., 2012b</xref>). Moreover, since trajectories include time information, the quality of the user&#x2019;s demonstration is crucial to avoid jerky movements. Recent studies have proposed learning from time-independent sparse data, known as keyframes, instead of recording full trajectories at a high rate (<xref ref-type="bibr" rid="B4">Akgun et al., 2012b</xref>). These keyframe or trajectory demonstrations typically include robot states (i.e., joint angles, end-effector poses) and information about the poses of objects involved in the task. However, to semantically understand the goal behind a demonstration, more than just a robot pose or object pose is needed.</p>
<p>Understanding the user&#x2019;s intent about the task goals or which objects the robot should give attention to are among the issues that need to be addressed when adopting the high-level LfD approaches (<xref ref-type="bibr" rid="B25">Fonooni et al., 2015</xref>). In this study, the term &#x201c;user intention&#x201d; denotes any task goals related to the objects used throughout the task, and the attributes of the objects (such as color, shape, size, etc.) that are relevant for understanding the task goals. To generalize task goals regarding object attributes, the user is expected to change the objects in each demonstration to meet her/his intentions. For example, in <xref ref-type="fig" rid="F1">Figure 1</xref>, the user wants to collect the same fruits in a box. If the user only demonstrates the task once, the robot might not learn if the task is about the specific type of fruit used in the demonstration or about fruit in general. However, demonstrating the task multiple times with different fruits allows for generalization.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The user changes the objects used in the demonstrations to teach her intention. Our proposed method computes similarities between demonstrations and within a demonstration from keyframes. The robot learns the purpose of the task from the object attribute relations.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g001.tif"/>
</fig>
<p>In the first demonstration, the robot learns to &#x201c;pick up the orange fruits and place it into the box.&#x201d; In the second demonstration, the user changes the objects, and then the robot learns to &#x201c;pick up any fruit and place it into the box&#x201d;. In the last demonstration, the robot learns that &#x201c;the fruit in the box must be the same as the fruit to be picked&#x201d;. After three demonstrations, the robot learns to &#x201c;pick up the fruit that is the same as the fruit in the box and place it into the box&#x201d;. During task reproduction, the robot may encounter different fruits, but it should know the desired constraints (the fruit in the box and the fruit to be picked must be the same). The robot does not need to see these objects during teaching. It checks the fruit in the box and then finds the fruit that satisfies the constraint. Giving attention to the objects allows the robot to generalize the user&#x2019;s intention to unseen environments by uncovering a set of similar objects&#x2019; attributes used in each demonstration. These task goals, or user intentions, are taught implicitly; the only thing the user needs to do is to change the objects to convey her intention and bring the robot to the desired robot poses during demonstrations to fulfil the intended task. The end-effector pose, object poses, and object attributes are recorded as a keyframe at each step.</p>
<p>The distribution of these keyframes depends on the task&#x2019;s requirements and user decisions, so we cannot assume users will use the same number of keyframes for different demonstrations meant to teach the same task. Even among multiple demonstrations of the same task, the number of keyframes can differ. For instance, in <xref ref-type="fig" rid="F1">Figure 1</xref>, the user records three keyframes in the first demonstration, four in the second, and four in the last demonstration. For an adequate summary of a task, a minimum number of keyframes is required for the robot to understand the task, as explained later. Non-expert users can, however, be expected to record additional keyframes that contain superfluous information and complicate learning. An alignment method is required to cluster the keyframes from different demonstrations to generalize the robot&#x2019;s motion. After alignment, clusters can be a mix of required and unrequired keyframes. Required keyframes, referred to as object-centric, direct attention to the objects that reflect the user&#x2019;s intention and provide accurate relative poses between the end-effector and the objects. Object-centric keyframes are defined as those that change an object&#x2019;s attribute in the environment. Task-unimportant keyframes, on the other hand, are defined as remaining keyframes that do not alter any attribute during the demonstrations. We consider that novice users might record these task-unimportant keyframes. In summary, our research is motivated by the following challenges:<list list-type="simple">
<list-item>
<p>&#x27a2; The learning becomes more complicated when there is a varying number of keyframes. How can we extract conceptually duplicate keyframes from each demonstration?</p>
</list-item>
<list-item>
<p>&#x27a2; How can we derive the user&#x2019;s intention regarding object attributes to generalize the task to previously unseen scenes?</p>
</list-item>
</list>
</p>
<p>This paper is organized as follows: <xref ref-type="sec" rid="s2">Section 2</xref> summarizes the related work on keyframes and task goals concerning the objects&#x2019; relationships in learning from demonstration, and it outlines our contributions to this field. <xref ref-type="sec" rid="s3">Section 3</xref> explains our proposed solution. <xref ref-type="sec" rid="s4">Section 4</xref> presents the experimental setup, and <xref ref-type="sec" rid="s5">Section 5</xref> validates our results by demonstrating various tasks that include different object attributes. Finally, <xref ref-type="sec" rid="s6">Section 6</xref> discusses the results, highlights the current limitations, and suggests possible future research directions.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>Trajectory demonstrations are commonly used in literature to show robot motions. These demonstrations are instrumental in scenarios where velocity and timing are crucial to the required skills. However, due to their uninterrupted nature, demonstrations by novice users operating high-degree-of-freedom robots can be challenging (<xref ref-type="bibr" rid="B16">Chernova and Thomaz, 2014</xref>). Furthermore, multiple demonstrations are necessary to generalize the demonstrated skills to task parameters such as different object poses and varying initial robot positions. As the duration of each demonstration may differ, a time alignment method is employed to align these trajectories (<xref ref-type="bibr" rid="B43">Muhlig et al., 2009</xref>). It is also important to note that these trajectories&#x2019; smoothness depends on the user&#x2019;s abilities. Therefore, optimization techniques are often required to prevent jerky movements (<xref ref-type="bibr" rid="B52">Ravichandar et al., 2020</xref>).</p>
<p>
<xref ref-type="bibr" rid="B4">Akgun et al. (2012b)</xref> proposed using keyframe demonstrations to circumvent the issues above. Keyframes are described as sparse robot poses in joint configuration or task space. These keyframes are similar to the sub-goals that are highlighted after trajectory segmentations to learn task plans similar to the approach proposed by <xref ref-type="bibr" rid="B39">Lioutikov et al. (2015)</xref> and <xref ref-type="bibr" rid="B44">Niekum et al. (2015)</xref>. However, in keyframe demonstrations, these sparse poses are given by the user. Experiments conducted by <xref ref-type="bibr" rid="B4">Akgun et al. (2012b)</xref> where novice users demonstrate keyframes, highlight the advantages of keyframes over trajectories. While these studies by <xref ref-type="bibr" rid="B4">Akgun et al. (2012b)</xref> do not primarily focus on generalization capabilities, multiple demonstrations would be more straightforward with keyframes as they do not present time alignment issues.</p>
<p>Despite these advantages, learning robot skills with keyframes is rare. One disadvantage is the lack of time information. In the field of robot motion generation from keyframes, various methods have been proposed to address this problem. <xref ref-type="bibr" rid="B31">Jankowski et al. (2022)</xref> developed a technique that creates smooth and adaptable robot trajectories from sparse key position demonstrations. This method solves a time-optimal control problem for each key position and adapts in real time to the current state of the robot and the environment. On the other hand, <xref ref-type="bibr" rid="B4">Akgun et al. (2012b)</xref> proposed a method that generates a sparse trajectory of joint angles, automatically adds keyframes for start and/or end positions if they are omitted, and calculates time data for each keyframe based on distance and a constant average velocity.</p>
<p>An alignment and/or a clustering method are required to extract similar keyframes/trajectories from multiple demonstrations. The approach by <xref ref-type="bibr" rid="B3">Akgun et al. (2012a)</xref> involves temporally aligning multiple skill demonstrations using an iterative process. This process employs Dynamic Time Warping (DTW) and an alignment pool, and sequences are selected based on the lowest pairwise DTW cost. Keyframes aligned to the same keyframe from another demonstration are clustered together. The maximum number of keyframes is chosen as the cluster number, and the Gaussian Mixture Model (GMM) is applied to cluster the keyframes (<xref ref-type="bibr" rid="B4">Akgun et al., 2012b</xref>). In the studies by <xref ref-type="bibr" rid="B37">Kurenkov et al. (2015)</xref>; <xref ref-type="bibr" rid="B48">Perez-D&#x2019;Arpino and Shah (2017)</xref> the number of clusters is determined by the rounded average number of keyframes, and K-means is used for clustering. Another approach by <xref ref-type="bibr" rid="B5">Akgun and Thomaz, (2016)</xref> uses Hidden Markov Models, initializing the number of states as the minimum number of keyframes. This learning model is applied separately for goal keyframes and action keyframes. Lastly, the study by <xref ref-type="bibr" rid="B31">Jankowski et al. (2022)</xref> operates under the assumption that the number of keyframes is identical among demonstrations. In these previous studies, to generalize the robot motion, either the number of keyframes is assumed to be constant, or alignment or clustering methods are used to deal with a varying number of keyframes. We remove the assumption of an equal number of keyframes in each demonstration, but also we do not use alignment or clustering methods to remove the dependency of generalization capabilities to these methods.</p>
<p>Although <xref ref-type="bibr" rid="B5">Akgun and Thomaz (2016)</xref> record the objects&#x2019; states as a goal keyframe to monitor the execution of action keyframes, they assume that there exists one object in the task. Differently, in our study, we do not assume which objects are important for the task, so we record the end-effector pose, all object attributes in the environment, and their poses as a keyframe. This definition provides the extraction of object-centric keyframes, even if the user records both object-centric keyframes and task-unimportant keyframes. These object-centric keyframes help us to give attention to the objects and guarantee that relative poses between the end-effector and objects would not be affected by incorrect clustering of keyframes because of task-unimportant keyframes.</p>
<p>We focus on understanding user intent behind demonstrations by observing objects. Inverse Reinforcement Learning (IRL) also aims to comprehend user intent by recovering a reward function that justifies expert demonstrations (<xref ref-type="bibr" rid="B1">Ab Azar et al., 2020</xref>; <xref ref-type="bibr" rid="B6">Arora and Doshi, 2021</xref>). This function is often estimated from expert trajectories in robotics for various applications, such as navigation for socially compliant robots (<xref ref-type="bibr" rid="B36">Kretzschmar et al., 2016</xref>; <xref ref-type="bibr" rid="B35">Kollmitz et al., 2020</xref>; <xref ref-type="bibr" rid="B55">Sun et al., 2020</xref>) and path and velocity preferences for robot arms (<xref ref-type="bibr" rid="B9">Bobu et al., 2018</xref>; <xref ref-type="bibr" rid="B7">Avaei et al., 2023</xref>). IRL typically constrains the reward space with predefined features (<xref ref-type="bibr" rid="B53">Shek et al., 2023</xref>), except for some recent work that updates these features online (<xref ref-type="bibr" rid="B40">Louren&#xe7;o et al., 2023</xref>). These features usually include relative distances or velocity and acceleration preferences. Unlike IRL, we don&#x2019;t learn a reward function but have a predefined object attribute set. Our study&#x2019;s contribution is teaching semantics about these attributes from keyframe demonstrations and generalizing this to unseen environments.</p>
<p>In robotics, semantics can be viewed as a robot&#x2019;s ability to understand the significance of environmental entities. Numerous applications have been developed incorporating semantics. For example, mobile robots construct semantic maps (<xref ref-type="bibr" rid="B18">Deng et al., 2024</xref>) enhancing navigation (<xref ref-type="bibr" rid="B50">Qi et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Achat et al., 2023</xref>) and facilitating dynamic object searches in domestic settings (<xref ref-type="bibr" rid="B27">Guo et al., 2022</xref>; <xref ref-type="bibr" rid="B59">Zhang et al., 2023</xref>). Human-robot interaction (HRI) uses semantics to generate natural questions about environmental objects (<xref ref-type="bibr" rid="B42">Moon and Lee, 2020</xref>), execute tasks specified in natural language using Natural Language Processing (NLP) (<xref ref-type="bibr" rid="B11">Bucker et al., 2022</xref>; <xref ref-type="bibr" rid="B10">2023</xref>), and physical human-robot interaction (<xref ref-type="bibr" rid="B40">Louren&#xe7;o et al., 2023</xref>). In robot manipulation, semantics is used in object grasping based on attributes like fragility or softness (<xref ref-type="bibr" rid="B38">Kwak et al., 2022</xref>). These are a few examples of semantics applications in robotics. However, our paper focuses on a subset of applications related to the use of semantics for LfD in industrial robot manipulation tasks.</p>
<p>Semantics in LfD can be used to simplify user programming. <xref ref-type="bibr" rid="B20">Eiband et al. (2023)</xref> used position and force-torque sensing to identify semantic skills, which were classified using a support vector machine. A decision tree classifier was used to understand the correlation between the robot&#x2019;s movements, environmental data, and activities (<xref ref-type="bibr" rid="B51">Ramirez-Amaro et al., 2019</xref>). <xref ref-type="bibr" rid="B54">Steinmetz et al. (2019)</xref> identified skills from a predefined set, described them using the Planning Domain Definition Language (PDDL), and conveyed semantically annotated skills to the user via an interface. <xref ref-type="bibr" rid="B58">Zanchettin (2023)</xref> proposed a method for semantically representing demonstrated skills, enabling the robot to identify workspace elements and understand the skill&#x2019;s preconditions and effects.</p>
<p>One purpose of using object attributes in LfD is to select the appropriate actions; <xref ref-type="bibr" rid="B15">Chella et al. (2006)</xref> use conceptual spaces introduced by <xref ref-type="bibr" rid="B49">Peter (2000)</xref> which are metric spaces, formed by quality dimensions such as time, color, shape, and weight. The proposed approach by <xref ref-type="bibr" rid="B15">Chella et al. (2006)</xref> decides which action the robot should perform, based on the similarities between the scenes in the reproduction and demonstration actions. Differently, <xref ref-type="bibr" rid="B23">Fonooni et al. (2012)</xref> use semantic networks in LfD to generalize learned skills to new objects and situations by comparing network nodes before and after the demonstration. The environment in the demonstration and the environment in the reproduction can be different. Transfer learning and object mapping are applied in LfD to generalize the task to unseen environments (<xref ref-type="bibr" rid="B12">Bullard et al., 2016</xref>; <xref ref-type="bibr" rid="B21">Fitzgerald et al., 2018</xref>). Although these studies consider object attributes to generalize the demonstrations to unseen environments, they do not take into account the user intention related to object attributes.</p>
<p>
<xref ref-type="bibr" rid="B32">Kaelbling et al. (2001)</xref> state that <italic>&#x201c;It is hard to imagine a truly intelligent agent that does not conceive of the world in terms of objects and their properties and relations to other objects.&#x201d;</italic> This statement is the main idea of our research, and its importance is also shown by the research of <xref ref-type="bibr" rid="B17">Cubek et al. (2015)</xref> which inspired our research. For example, spatial relations between objects might be crucial for the robot to define task goals. <xref ref-type="bibr" rid="B26">French et al. (2023)</xref> represent the task goal as spatial relations between scene objects, using a scene graph with objects as nodes and inter-object relations as edges. The aim is to generalize spatial relation goals to different environments, including those with varied objects. Similarly, spatial relations (distance and angle for labeled objects) between the served object and other table objects are learned for a food-serving task, enabling the robot to identify possible object arrangements in unseen scenes (<xref ref-type="bibr" rid="B34">Kawasaki and Takahashi, 2021</xref>). <xref ref-type="bibr" rid="B29">Hristov and Ramamoorthy (2021)</xref> label demonstrated trajectories using high-level concepts, including spatial labels like &#x201c;behind&#x201d; and &#x201c;on top of,&#x201d; and temporal labels like &#x201c;quickly.&#x201d; This approach helps in generating new trajectories by combining these labels.</p>
<p>The approach proposed by <xref ref-type="bibr" rid="B14">Chao et al. (2010)</xref> involves learning a set of criteria for an unchanged object&#x2019;s attributes such as color to meet some expectations such as location. <xref ref-type="bibr" rid="B25">Fonooni et al. (2015)</xref> expanded their previous work by adding the Ant Colony Algorithm to determine the relevant nodes in the Semantic Network to focus on significant aspects of the demonstration, such as the shape of the object. Then they add priming to their method to reduce the number of demonstrations required in similar contexts (<xref ref-type="bibr" rid="B24">Fonooni et al., 2016</xref>). <xref ref-type="bibr" rid="B51">Ramirez-Amaro et al. (2019)</xref> proposed an approach using an ontology-based graph to adjust a demonstrated skill based on object attributes. For example, if a task is demonstrated using an orange, and the perceived object is an apple, the task steps are modified as there is no &#x201c;squeeze&#x201d; activity for the apple, without requiring a new demonstration. A recent study by <xref ref-type="bibr" rid="B19">Du et al. (2024)</xref> focused on sorting tasks based on the object colors, noting key trajectory positions to execute the task in various environments. However, their study only records and matches the pick and target objects&#x2019; colors, not considering the intention between demonstrations or object attribute relations. Our work&#x2019;s goal is much closer to the research of <xref ref-type="bibr" rid="B17">Cubek et al. (2015)</xref>. In their approach, they change the used objects in every demonstration to understand the user&#x2019;s goal. They focus on pick-and-place tasks, and they extract similar object attributes for active (grasped) and passive (released) objects between demonstrations. However <xref ref-type="bibr" rid="B14">Chao et al. (2010)</xref> and <xref ref-type="bibr" rid="B17">Cubek et al. 
(2015)</xref>, do not consider the possible similarities within demonstrations, <xref ref-type="bibr" rid="B25">Fonooni et al. (2015</xref>; <xref ref-type="bibr" rid="B24">2016)</xref> assume that a semantic network includes all the necessary concepts and objects that the robot can work with, and nodes of the semantic network are discrete. Moreover, while in previous studies, tasks were handled as involving two objects, in this study, we define the relevant objects in the environment more comprehensively, and this allows us to teach more complex tasks. Our contributions can be summarized as follows:<list list-type="simple">
<list-item>
<p>&#x2022; We propose a keyframe-based learning approach in which non-required keyframes are automatically identified and removed such that demonstrations with a varying number of keyframes can be handled.</p>
</list-item>
<list-item>
<p>&#x2022; The proposed method identifies similarities between discrete and continuous object attributes of objects within a single demonstration as well as across demonstrations, thereby allowing for a better generalization to unseen scenes and the ability to learn more complicated tasks.</p>
</list-item>
<list-item>
<p>&#x2022; The proposed method has been validated through a comprehensive set of LfD experiments. Implicitly, user intentions regarding object attributes are extracted in three different tasks: stacking, sorting, and serving.</p>
<list list-type="simple">
<list-item>
<p>&#x2713; In the stacking task, we demonstrate that discrete attributes of objects, such as color and shape, play an important role.</p>
</list-item>
<list-item>
<p>&#x2713; The sorting task involves an increased number of objects, showcasing the proposed solution&#x2019;s ability to handle both continuous (size) and discrete (shape) attributes successfully.</p>
</list-item>
<list-item>
<p>&#x2713; Finally, in the serving task, we reveal that user desires may include objects observed in the environment throughout the task, even if they are not actively used by the robot.</p>
</list-item>
</list>
</list-item>
</list>
</p>
<p>These experiments highlight the method&#x2019;s capacity to generalize tasks beyond those previously presented in related work.</p>
<p>To the authors&#x2019; knowledge, this is the first study that learns user intentions regarding the relations between the attributes of objects and generalizes this intention to unseen scenes by taking advantage of keyframe demonstrations.</p>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methods</title>
<p>This section introduces the proposed method, summarized graphically in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The steps of the proposed method from demonstration stage to reproduction stage.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g002.tif"/>
</fig>
<sec id="s3-1">
<title>3.1 Demonstration of the task</title>
<p>The learning process commences once the user collects data from the robot through kinesthetic teaching, a method where the user directly manipulates the robot and upon guiding the robot to a desired pose, keyframes are recorded (<xref ref-type="bibr" rid="B52">Ravichandar et al., 2020</xref>). These keyframes are defined as descriptions of the end-effector pose, the poses of objects present in the scene, and the attributes of these objects, such as color, size, shape, etc. A demonstration is a sequence of keyframes, denoted as:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold-italic">i</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the set of keyframes for the <italic>i</italic>th demonstration. The variable <italic>n</italic> denotes the number of demonstrations, and <italic>m</italic>
<sub>
<italic>i</italic>
</sub> is the number of keyframes in the <italic>i</italic>th demonstration. This definition allows for a different number of keyframes for different demonstrations of the same task.</p>
<p>The term <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">k</mml:mi>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the <italic>j</italic>th keyframe of the <italic>i</italic>th demonstration. <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is an ordered set, which means that <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">k</mml:mi>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is always recorded after <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">k</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold-italic">j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="bold-italic">i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. The <italic>j</italic>th keyframe of the <italic>i</italic>th demonstration is represented by:<disp-formula id="e2">
<mml:math id="m7">
<mml:mrow>
<mml:msubsup>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Here <inline-formula id="inf6">
<mml:math id="m8">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> contains the end-effector pose and gripper status, and <inline-formula id="inf7">
<mml:math id="m9">
<mml:mrow>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is an environment set that includes the set of object poses and object attributes in the <italic>j</italic>th keyframe of the <italic>i</italic>th demonstration. This representation allows for a comprehensive description of both the robot&#x2019;s state and the state of the environment at each keyframe. The sets represented in Eq. <xref ref-type="disp-formula" rid="e2">2</xref> are further described as follows:<disp-formula id="e3">
<mml:math id="m10">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>z</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m11">
<mml:mrow>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf8">
<mml:math id="m12">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the position of the end-effector; the orientation of the end-effector is represented by the unit quaternion&#x2019;s real part <inline-formula id="inf9">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and vector part <inline-formula id="inf10">
<mml:math id="m14">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>z</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf11">
<mml:math id="m15">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a discrete value describing the gripper status. It can be either zero (open) or one (closed). In this set, <inline-formula id="inf12">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the number of objects in each demonstration, a value that can change from one demonstration to another. For instance, as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, while the first and second demonstrations include three objects, the last demonstration scene includes four objects. <inline-formula id="inf13">
<mml:math id="m17">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> contains the attributes of the <inline-formula id="inf14">
<mml:math id="m18">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th object in the <italic>j</italic>th keyframe of the <italic>i</italic>th demonstration. This representation allows for a detailed description of each object&#x2019;s characteristics within the demonstration.</p>
</sec>
<sec id="s3-2">
<title>3.2 Reduction of keyframes</title>
<p>The number of keyframes can change in every demonstration, as specified in Eq. <xref ref-type="disp-formula" rid="e1">1</xref>. We categorize these recorded keyframes into two types: object-centric keyframes and task-unimportant keyframes. Object-centric keyframes are defined as those that alter an attribute in the environment set, while task-unimportant keyframes are considered as the remaining keyframes. Task-unimportant keyframes provide information on how to reach object-centric keyframes. As long as we prevent collisions, the method by which the robot reaches a goal state is not crucial for achieving the desired outcome.</p>
<p>In LfD, it is necessary to generalize the data obtained from multiple demonstrations to accommodate changes in task parameters, such as object poses and/or attributes. As discussed in the related work, traditional methods in keyframe demonstrations cluster similar robot motions using unsupervised learning or alignment methods (<xref ref-type="bibr" rid="B4">Akgun et al., 2012b</xref>). However, these methods do not elucidate the semantic similarity of these keyframes. For instance, the first step of <xref ref-type="fig" rid="F2">Figure 2</xref> presents several keyframe demonstrations for a stacking task. The user picks up an object and places it on top of another object. For example, in the first step of this figure, the fourth keyframe in the first demonstration is close to the releasing keyframe. This could lead to a cluster that includes both the releasing keyframes from each demonstration and this task-unimportant keyframe. However, it is necessary to have the same object-centric keyframes in a cluster. These keyframes provide the accurate relative poses between the end-effector and the object. By grouping these object-centric keyframes from different demonstrations, we can focus directly on the same relevant objects for each demonstration. Therefore, eliminating task-unimportant keyframes instead of using all the keyframes recorded by the user yields this outcome. We have the following assumptions:<list list-type="simple">
<list-item>
<p>&#x2022; Each task necessitates a minimum number of keyframes, referred to as object-centric, to adequately summarize the task being learned by the robot.</p>
</list-item>
<list-item>
<p>&#x2022; Users are expected to record at least the object-centric keyframes, and possibly more, namely, task-unimportant keyframes.</p>
</list-item>
</list>
</p>
<p>The recognition and elimination of task-unimportant keyframes from the data is a contribution of this work. Our aim is to identify object-centric keyframes from the set recorded by the user. The presence of any object in the environment set, as defined by Eq. <xref ref-type="disp-formula" rid="e4">4</xref>, depends on relevant object poses and attributes, as follows:<disp-formula id="e5">
<mml:math id="m19">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2003;</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<disp-formula id="e6">
<mml:math id="m20">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>z</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m21">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>As indicated in Eq. <xref ref-type="disp-formula" rid="e5">5</xref>, <inline-formula id="inf15">
<mml:math id="m22">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf16">
<mml:math id="m23">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent the poses and attributes of object <inline-formula id="inf17">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the <italic>j</italic>th keyframe of the <italic>i</italic>th demonstration, respectively. <inline-formula id="inf18">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the set of <inline-formula id="inf19">
<mml:math id="m26">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> provides information about whether an object is held by the robot or is in the environment. We assume that if the robot manipulates the object, the value of <inline-formula id="inf20">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is &#x201c;grasping&#x201d;; otherwise, its value is &#x201c;on the table&#x201d;. <inline-formula id="inf21">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is defined in the next section, can be a combination of discrete (color, shape, etc.) and continuous (size, mass, etc.) attributes. It is important to note that <inline-formula id="inf22">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf23">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can change in any keyframe of any demonstration. For instance, as seen in the first step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the object is held by the robot in the second keyframe of the first demonstration, while a similar object is held in the third keyframe of the second demonstration.</p>
<p>Object-centric keyframes necessitate changes in the object&#x2019;s attribute set, denoted as <inline-formula id="inf24">
<mml:math id="m31">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, which is described in Eq. <xref ref-type="disp-formula" rid="e7">7</xref>. Therefore, a reduced keyframe set is given by:<disp-formula id="e8">
<mml:math id="m32">
<mml:mrow>
<mml:mfenced open="" close="" separators="|">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>K</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="" close="|" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3be;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>The first keyframe, <inline-formula id="inf25">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, is recorded before the user initiates the demonstration. After reducing keyframes, each demonstration has a similar number of keyframes, including object-centric ones. For instance, as depicted in the first step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the first keyframe is always retained. In the second keyframe of the second demonstration, the robot does not alter anything in the environment, so we can remove it. In the third keyframe of the second demonstration, the object is grasped, leading to an update in the reduced keyframe set <inline-formula id="inf26">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>K</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. After applying Eq. <xref ref-type="disp-formula" rid="e8">8</xref>, as shown in the second step of <xref ref-type="fig" rid="F2">Figure 2</xref>, we have three keyframes in each demonstration.</p>
</sec>
<sec id="s3-3">
<title>3.3 Reference frame selection</title>
<p>Each demonstration presents a unique scene, featuring a variety of objects and object poses. For successful reproduction, it&#x2019;s crucial to generalize the robot&#x2019;s motion to accommodate unseen objects and varying object poses. Relying solely on a single reference frame, such as the robot base frame, may not facilitate understanding the desired relative poses between the end-effector and objects. This is because object poses can vary from one demonstration to another. To effectively generalize the robot&#x2019;s motion to a new environment and comprehend the relevant attributes of the objects, it&#x2019;s important to use the pose of an object as a reference frame for each keyframe. In our scenario, after the reduction of keyframes, each object-centric keyframe from different demonstrations corresponds to an object. These keyframes are defined when there&#x2019;s a change in an object&#x2019;s attribute. Our algorithm takes into account three different situations:<list list-type="simple">
<list-item>
<p>&#x2022; The robot may pick up an object.</p>
</list-item>
<list-item>
<p>&#x2022; An object attribute can change in any keyframe, exemplified by the glass being full in one keyframe and empty in the previous one.</p>
</list-item>
<list-item>
<p>&#x2022; The robot can place an object into target positions.</p>
</list-item>
</list>
</p>
<p>The selection of the reference frame for these three situations is detailed in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>.</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>Selection of Reference frames and Reduction of Keyframes.<list list-type="simple">
<list-item>
<p>
<bold>Input</bold>: <inline-formula id="inf27">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>
<bold>Output</bold>: Reference frame, F<sub>j</sub>, for each reduced keyframe</p>
</list-item>
<list-item>
<p>
<bold>
<italic>Initialize</italic>
</bold>: <inline-formula id="inf28">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>K</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; F<sub>j</sub> <inline-formula id="inf29">
<mml:math id="m37">
<mml:mrow>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> {robot base frame}</p>
</list-item>
<list-item>
<p>
<bold>
<italic>for</italic>
</bold> j &#x3d; 1: <inline-formula id="inf30">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<italic>//For each keyframe in the ith demonstration</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;<bold>
<italic>if</italic>
</bold> Update <inline-formula id="inf31">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>K</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2002;</mml:mtext>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> using Eq. <xref ref-type="disp-formula" rid="e8">8</xref>
</p>
</list-item>
<list-item>
<p>
<italic>//If the attribute set has changed</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>
<italic>for</italic>
</bold> <inline-formula id="inf32">
<mml:math id="m40">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <italic>&#x3d; 1:</italic> <inline-formula id="inf33">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>:</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<bold>
<italic>if</italic>
</bold> <inline-formula id="inf34">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3c;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf35">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;<italic>//The position of the object that is held or whose properties change</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;<inline-formula id="inf36">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;<bold>Break;</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<bold>
<italic>elseif</italic>
</bold> <inline-formula id="inf37">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b6;</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf38">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>o</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:</p>
</list-item>
<list-item>
<p>&#x2003;<italic>//The position of the object that is released or whose pose change</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;<inline-formula id="inf39">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;<bold>Break;</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<bold>
<italic>else</italic>
</bold>:</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;&#x2003;<inline-formula id="inf40">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<bold>
<italic>end</italic>
</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>
<italic>end</italic>
</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;<bold>
<italic>end</italic>
</bold>
</p>
</list-item>
<list-item>
<p>
<bold>
<italic>end</italic>
</bold>
</p>
</list-item>
<list-item>
<p>
<bold>
<italic>def</italic>
</bold> <inline-formula id="inf41">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>:</p>
</list-item>
<list-item>
<p>&#x2003;Initialize relation: {}</p>
</list-item>
<list-item>
<p>&#x2003;count &#x3d; 1</p>
</list-item>
<list-item>
<p>&#x2003;flag &#x3d; 0</p>
</list-item>
<list-item>
<p>&#x2003;objects &#x3d; <inline-formula id="inf42">
<mml:math id="m50">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="" close="" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>//Objects except for</italic> <inline-formula id="inf43">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;<bold>
<italic>for</italic>
</bold> k &#x3d; 1: <inline-formula id="inf44">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>:</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>
<italic>if</italic>
</bold> <inline-formula id="inf45">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mtext>&#x2002;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mspace width="0.2em"/>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mtext>objects</mml:mtext>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<inline-formula id="inf46">
<mml:math id="m54">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mo>&#x2009;</mml:mo>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
<mml:mi>o</mml:mi>
<mml:msup>
<mml:mi>n</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;flag &#x3d; 1;</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;break;<italic>//If the released object is on another object, the function is ended</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>
<italic>elseif</italic>
</bold> <inline-formula id="inf47">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mtext>&#x2002;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
<mml:mspace width="0.2em"/>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mspace width="0.2em"/>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<inline-formula id="inf48">
<mml:math id="m56">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:msup>
<mml:mi>o</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;<italic>//The released object is next to another object</italic>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>
<italic>else</italic>
</bold>:</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;&#x2003;<inline-formula id="inf49">
<mml:math id="m57">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mo>&#x2033;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x2003;<bold>
<italic>end</italic>
</bold>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x27a2; If the flag &#x3d; 1, <inline-formula id="inf50">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2003;&#x27a2; Elseif find the index of next to label&#x2003;&#x2003;<inline-formula id="inf51">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> find object index in <inline-formula id="inf52">
<mml:math id="m60">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> array that&#x2003;&#x2003;has &#x201c;next to&#x201d; label</p>
</list-item>
<list-item>
<p>&#x2003;&#x27a2; Else <inline-formula id="inf53">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
</list>
</p>
</statement>
</p>
<p>Although the reference frame is chosen as the object pose whose attributes change in a keyframe in the first two conditions, it&#x2019;s crucial to understand the reasoning behind the robot&#x2019;s action when it places an object in a target position. To achieve this, we compute the spatial relations between the picked objects and other objects in the environment. In this work, we primarily focus on the &#x201c;on top/in&#x201d; and &#x201c;next to&#x201d; situations. However, these semantic labels can be extended using relevant studies (<xref ref-type="bibr" rid="B60">Ziaeetabar et al., 2017</xref>) to other spatial relations for more precise results.</p>
<p>When a robot releases an object, we are not interested in the released object itself, but rather the reason behind this action. We analyse the position of the released object relative to others using two labels: &#x201c;on top of&#x201d; and &#x201c;next to&#x201d;. Let&#x2019;s assume object A is placed in a target location with only one other object, i.e., object B. If A rests on top of B, object B becomes the reference object for that keyframe. Similarly, if A is dropped next to B, object B serves as the reference object. However, situations arise where the dropped object A might be both on top of object B and next to object C. In such cases, the algorithm prioritizes the &#x201c;on top of&#x201d; relationship and object B is still chosen as the reference object. This is because the concept of &#x201c;next to&#x201d; is inherently subjective, depending on user preference or task. For instance, a 1 cm distance might not be considered &#x201c;next to&#x201d; when working with small objects, while a 10 cm distance might be in other tasks. Therefore, if the result indicates that an object is &#x201c;next to&#x201d; another, our algorithm proceeds with the computation of the reference frame. The algorithm checks all objects without any specific order; at first object C can be checked, but there can be a possibility for an &#x201c;on top of&#x201d; situation. As shown in the second step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the robot picks up an object in the second keyframe, and the pose of the grasped object is described as a reference frame. The robot then places this object on top of another object. Even though the pose of the released object has changed in the attribute set, the reference frame for the third keyframe must be relevant to the object in the target pose. In the last demonstration, the larger rectangle is chosen as the reference object because it satisfies the more definitive &#x201c;on top of&#x201d; relation.</p>
</sec>
<sec id="s3-4">
<title>3.4 Constraints for object and end-effector relative poses</title>
<p>After reducing the keyframes, we ended up with a similar number of keyframes, <italic>m</italic>, in each demonstration. This allows us to group these keyframes from different demonstrations based on the keyframe index, <italic>j</italic>&#x2208;[1,<italic>m</italic>], as we ensure an equal number of keyframes. The purpose of collecting these keyframes is two-fold: to extract similar relative poses between the end-effector and objects, and to identify the attributes of each object within the group. We use the reduced keyframe set in Eq. <xref ref-type="disp-formula" rid="e8">8</xref> and group them as follows:<disp-formula id="e9">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>n</mml:mi>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:munder>
<mml:mo>&#x22c3;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>Here, c<sub>p</sub> is one of the groups that includes the keyframes representing the same relations between the robot and the objects. Each keyframe in the c<sub>p</sub> must alter the environment&#x2019;s attributes in the same way. For instance, if the object attribute changes from &#x201c;on the table&#x201d; to &#x201c;grasping&#x201d; in a keyframe, these keyframes are collected into the same group, c<sub>p</sub>. As depicted in the second step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the second keyframes show the &#x201c;grasping&#x201d; and the last keyframes show the &#x201c;releasing&#x201d;. Therefore, they can be combined in order as illustrated in the third step. In the initial keyframe group, we retain all object information in the scene. The reason for this is explained in <xref ref-type="sec" rid="s3-5">Section 3.5</xref>. However, other keyframe groups only include the robot&#x2019;s pose and the poses of objects associated with reference frames, along with their attributes.</p>
<p>In order to be able to generalize the task to new environments, the robot needs to understand the desired relative pose between the end-effector and relevant objects. The poses of the end-effector and the objects are recorded as specified in Eqs <xref ref-type="disp-formula" rid="e3">3</xref>, <xref ref-type="disp-formula" rid="e6">6</xref>, respectively. In object-centric keyframes, we expect the end-effector poses in each demonstration to be similar with respect to relevant objects. Therefore, averaging these relative poses results in the desired relative pose. Then in a new environment with different object poses, when we know the desired relative pose we can compute the desired end-effector poses easily. To compute the average of relative positions, we simply divide the summation of relative positions in each keyframe group by the demonstration number. However, averaging quaternions is non-trivial. In this work, quaternions will be averaged using the method proposed by <xref ref-type="bibr" rid="B41">Markley et al. (2007)</xref>.</p>
</sec>
<sec id="s3-5">
<title>3.5 Object attribute constraints between/within demonstrations</title>
<p>Once we determine the desired relative distance between the end-effector and objects for each keyframe, the robot can successfully complete the task, even with variations in object poses. As long as the robot is aware of the relevant objects, learning the desired relative poses between the robot and these objects is sufficient for accurately executing the robot&#x2019;s motion. However, our aim is to learn a broader range of tasks, including more abstract goals. These goals might pertain to the objects in the scene; for example, placing a small object on top of a larger one, or placing a cube next to a cylinder. Furthermore, users may want to demonstrate the similarity of objects within demonstrations; the goal could be to place cups next to plates of the same color. In this work, we aim for these goals to be learned by the robot without explicit instruction from the user. In this section, we describe how the robot can learn the similarities between and within demonstrations as a constraint to successfully achieve and semantically understand the task.</p>
<p>Before computing these constraints, we consider three different object types, similar to the approach proposed by <xref ref-type="bibr" rid="B15">Chella et al. (2006)</xref>. However, our aim differs from theirs; we seek to understand the common attribute between used objects to generalize the user&#x2019;s intention. The three different object categories are:</p>
<p>Reference Objects: These are objects that are relevant to each keyframe, and the poses of these objects are chosen as the reference frame for each keyframe. We assume only one reference object exists per keyframe within a demonstration.</p>
<p>Situational Objects: These are objects that have a spatial relation with respect to reference objects in the first keyframes. However, to define an object under this category, the same spatial relations must exist for each demonstration.</p>
<p>Other Objects: These are objects that are neither reference nor situational objects.</p>
<p>For example, consider a task as depicted in <xref ref-type="fig" rid="F2">Figure 2</xref>, where the robot is required to pick up the rectangle that is next to the triangle and place it on top of the larger rectangle. In the third step, the smaller rectangle serves as the reference object for the second keyframe, and the larger rectangle is the reference object for the third keyframe. The smaller rectangle is next to the triangle in each demonstration, hence this triangle is defined as a situational object. In the last demonstration, the larger rectangle is also next to a circle. However, the spatial relation between the larger rectangle and this circle does not exist in each demonstration. As a result, this circle is categorized as an &#x201c;other&#x201d; object.</p>
<p>In Eq. <xref ref-type="disp-formula" rid="e9">9</xref>, the first keyframe group includes all object attributes to find the situational objects. To extract situational objects we use the &#x201c;on top,&#x201d; &#x201c;next to,&#x201d; and &#x201c;has&#x201d; labels. For instance, if a situational object is on top of a reference object, we interpret this as &#x201c;the reference object has a situational object&#x201d;. Similarly, if a reference object is &#x201c;next to or on top of&#x201d; a situational object, we interpret these constraints as &#x201c;a reference object is next to/on top of a situational object&#x201d;. For these situational objects, another keyframe group is generated and added to the object-centric keyframes. However, this new keyframe group does not contain any information about the robot&#x2019;s pose. While the robot motion is generated using object-centric keyframes, these new sets can define the task goals, but the robot&#x2019;s goal poses are defined using reference objects. For instance, in the fifth step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the second and third keyframes provide information about the robot&#x2019;s motion, but the fourth keyframe group does not. However, if the user&#x2019;s intention is to pick up the rectangle next to the prism, it implies that these situational objects may also be involved in the task goal&#x2019;s constraint.</p>
<p>In this study, object attributes can be both discrete and continuous attributes, and they are represented as follows:<disp-formula id="e10">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>In Eq. <xref ref-type="disp-formula" rid="e10">10</xref>, <inline-formula id="inf54">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the value for a continuous attribute <inline-formula id="inf55">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, such as size or mass and similarly <inline-formula id="inf56">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the discrete value for a discrete feature <inline-formula id="inf57">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> such as color, shape, or category. We assume that each attribute is independent of other attributes. Firstly, we determine from the data whether constraints must be imposed on a particular object attribute or not. This allows the robot to better understand the task and which attribute of reference and situational object is relevant during the reproduction step. Considering that the number of groups of situational objects is q, and the number of groups of object-centric keyframes is m-1, the total number of keyframe groups is calculated as q &#x2b; m-1. We compute the distance of each discrete attribute value between demonstrations; then the summation of absolute values of these distances is computed as follows:<disp-formula id="e11">
<mml:math id="m68">
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>If <inline-formula id="inf58">
<mml:math id="m69">
<mml:mrow>
<mml:mi>&#x3c7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is zero, it implies that the attribute is identical across all demonstrations, and a constraint must be added for the specific keyframe group. If this is not the case, the value of this attribute does not impact the task and can be disregarded by the robot. As illustrated in the sixth step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the shape of the objects in each group is the same, so the shape is considered a constraint. However, the color of the objects varies in every demonstration, therefore it is not defined as a constraint.</p>
<p>However, there might not always be similarities between a particular attribute in each demonstration when considering only one object. For instance, the color of the object that the robot picks may vary from one demonstration to another. Nonetheless, there might be a constraint between object pairs within each demonstration, e.g., the object must always be placed on an object with the same color. To handle such constraints, we compute a difference vector for the discrete attributes of each object in one demonstration as follows:<disp-formula id="e12">
<mml:math id="m70">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2200;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>b</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>If there is a constraint between the objects used in one demonstration, the value of <inline-formula id="inf59">
<mml:math id="m71">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> must be equal for each demonstration, so the result of<disp-formula id="e13">
<mml:math id="m72">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>needs to be zero. Otherwise, it implies that there are no object attribute constraints within the demonstration. If it is zero, Eq. <xref ref-type="disp-formula" rid="e12">12</xref> will be another constraint in the reproduction to define each object. For instance, in the seventh step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the color of the triangle in the fourth keyframe group and the color of the rectangle in the third keyframe group vary in each demonstration. However, in each demonstration, the color of the triangle matches the color of the rectangle. This means that the color of the triangle and the color of the rectangle can be any color, but as a constraint, their colors must be the same. It is important to note that although the shape of reference objects is also the same, the shape of these objects does not vary in each demonstration. As this constraint is already computed between demonstrations using Eq. <xref ref-type="disp-formula" rid="e11">11</xref>, we do not check the shape of these objects within the demonstration.</p>
<p>Similar reasoning applies to continuous object attributes. With continuous attributes, the robot may observe that the object to be picked always has a different size. However, determining whether these sizes are similar enough to impose a constraint requires a measure of similarity and a defined threshold. Therefore, we assume that the continuous attribute values are normally distributed, and we compute the standard deviation, <inline-formula id="inf60">
<mml:math id="m73">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and mean, <inline-formula id="inf61">
<mml:math id="m74">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, values of the attributes for each object given in the same keyframe. For a continuous variable, the range of <inline-formula id="inf62">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>&#x3c4;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which falls within <inline-formula id="inf63">
<mml:math id="m76">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, is considered a constraint for the object, as long as <inline-formula id="inf64">
<mml:math id="m77">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. If the standard deviation of a continuous variable exceeds a threshold, <inline-formula id="inf65">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, then this feature is assumed to be unconstrained. Large variations in continuous attributes suggest that these attributes are not crucial for object selection. This threshold must be predefined. For example, in the seventh step of <xref ref-type="fig" rid="F2">Figure 2</xref>, a size range is computed as a constraint for the size of each object, and <inline-formula id="inf66">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>c</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Similarly, we check the similarities of different object&#x2019;s continuous attributes within the demonstration using the correlation coefficient. A correlation of &#x2212;1 defines a perfect negative correlation and a value of 1 defines a perfect positive correlation. In this work, only inter-object attributes with an absolute correlation of 0.65 or more are considered to be constrained to one another. For example, in each demonstration, small objects can be placed on top of big objects; in this case, if there are two big objects and one small object in the environment satisfying the constraints, the robot cannot make a decision about which object will be used. In this work, the relation between continuous object attributes that are correlated is modeled using a linear relation, however, any other model can be used if preferred. Therefore, we represent a linear equation to define the constraint,<disp-formula id="e14">
<mml:math id="m80">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f4;</mml:mi>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>where the vectors <italic>X</italic> and <italic>Y</italic> are learned through linear regression, and in reproduction, the relevant objects can be chosen using the constraint for task objects and other objects used within the demonstration. For example, in the seventh step of <xref ref-type="fig" rid="F2">Figure 2</xref>, the size difference between the small rectangle and the larger rectangle is consistently 0.02 m in each demonstration, therefore the coefficients <italic>X</italic> and <italic>Y</italic> are recorded as 1 and 0.02, respectively. These constraints are utilized during the reproduction stage.</p>
<p>In this method, it is up to the user whether similar object attributes are observed in all demonstrations or if the same inter-object constraints exist in each demonstration. Users must utilize the objects according to their purpose during demonstrations. It is not necessary for discrete or continuous attributes to always have constraints. For instance, if the user does not change objects but uses a different geometric configuration in each demo, our method only takes into account the observed objects and their attributes for the task.</p>
</sec>
<sec id="s3-6">
<title>3.6 Reproduction</title>
<p>The inputs of the reproduction include constraints for each keyframe group, between keyframes, the number of reference objects, and the new environment with object attributes and poses. The first step classifies objects based on desired attributes for each keyframe group. For example, in <xref ref-type="fig" rid="F2">Figure 2</xref>, there are two reference objects and one situational object after demonstrations. Initially, six candidate objects exist for each keyframe in the reproduction scene. The first reference object must be a rectangle of size 0.02&#x2013;0.04 m, with possible candidates being O<sub>2</sub>-O<sub>6</sub>. The second reference object must also be rectangular but of size 0.04&#x2013;0.06 m, with candidates being O<sub>3</sub>-O<sub>4</sub>-O<sub>5</sub>-O<sub>6</sub>. The situational object is a triangle of size 0.06 m; the only valid object is O<sub>1</sub>. A candidate object matrix is created, including all possible object index combinations for each keyframe. Columns of this matrix represent each keyframe group. Some examples of matrix rows are O<sub>2</sub>-O<sub>3</sub>-O<sub>1</sub>, O<sub>2</sub>-O<sub>4</sub>-O<sub>1</sub>, and O<sub>6</sub>-O<sub>5</sub>-O<sub>1</sub>. Constraints between keyframes are then checked. The order of this check does not affect the final outcome, provided all constraints are considered.</p>
<p>First, constraints between the situational and reference object are applied. In the example of <xref ref-type="fig" rid="F2">Figure 2</xref>, the first reference object must be next to the situational object. Relevant columns containing these objects are taken from the candidate object matrix, such as O<sub>2</sub>-O<sub>1</sub> and O<sub>6</sub>-O<sub>1</sub>. Since O<sub>2</sub> is next to O<sub>1</sub>, rows that include O<sub>6</sub> in the first column are removed. Next, discrete object attribute constraints between keyframes are checked. The second reference object and situational object must have the same color. The rows that have the index of objects O<sub>4</sub>, O<sub>6</sub> in the second column are removed, as they are not red. Lastly, continuous object attribute constraints are checked between keyframes. The second object&#x2019;s size is twice the first reference object size. Since O<sub>5</sub> is not valid for this constraint, the rows that include the index of O<sub>5</sub> in the second column are removed. The resulting objects for each keyframe, in order, are O<sub>2</sub>-O<sub>3</sub>-O<sub>1</sub>. Ultimately, this process results in a single candidate object for each keyframe. The reference object number is crucial for computing the desired robot&#x2019;s pose. In this example, the first two columns (O<sub>2</sub>-O<sub>3</sub>) are taken to compute the desired end-effector pose. This reproduction procedure is summarized in <xref ref-type="sec" rid="s12">Supplementary Algorithm S1</xref>.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Experiments</title>
<sec id="s4-1">
<title>4.1 Experimental setup</title>
<p>The Optitrack Motion Capture System, with four OptiTrack Prime 41/22 cameras, is used to detect object poses in the scene. The cameras are positioned to overlook the table during experiments. Their view, as illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>, is captured using the Motive software on a Windows PC. The tracking of rigid bodies is done through the Motive software. A global frame is established during calibration, with a corner of the table designated as the global frame.</p>
<p>Passive reflective tape markers are used for object detection by the Optitrack system. Four markers were attached on top of each object due to their small size. The object pose is defined as the center of these markers. We consider five object attributes: color, shape, category, instance, and size. The &#x201c;category&#x201d; is a generic label for objects, such as toys or furniture. An &#x201c;instance&#x201d; of a category is a specific element within that category.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Experimental setup with a motion capture system. Multiple cameras are positioned around the table to capture the poses of the objects (geometric shapes) on the table. The inset shows a close-up view of the objects and markers. The view from one of the cameras is illustrated.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g003.tif"/>
</fig>
<p>A 7-DOF Franka Emika Panda arm is used for manipulation tasks. To calibrate the robot&#x2019;s base frame pose relative to the global frame, we use the OpenCV eye-to-hand library with the method of (<xref ref-type="bibr" rid="B47">Park and Martin, 1994</xref>).</p>
<p>During experiments, the user can move the robot freely. In each keyframe, the pose of the end-effector with respect to the robot base frame and the gripper positions are recorded through a Robot Operating System (ROS) node. The recorded data, including keyframe index, object poses, object attributes, and end-effector pose, are collected and used as input for the learning algorithm implemented in MATLAB. In the reproduction stage, the new environment information, including the pose of the objects in the scene and their attributes, and the generated constraints from the demonstration are input to <xref ref-type="sec" rid="s12">Supplementary Algorithm S1</xref>. This algorithm identifies the relevant object for each keyframe in the scene and computes the result end-effector poses to satisfy the relative pose constraints between the robot and the object. We assume the existence of a trajectory planner that can generate a collision-free path between the desired end-effector poses.</p>
</sec>
<sec id="s4-2">
<title>4.2 Task descriptions</title>
<p>In our study, we validate our results using three tasks: stacking, sorting, and serving. <xref ref-type="sec" rid="s12">Supplementary Videos</xref> demonstrating each experiment can be found in the <xref ref-type="sec" rid="s12">Supplementary Materials</xref> as Video 1 (stacking), Video 2 (sorting), and Video 3 (serving), respectively. The task descriptions are given in the following lines.<list list-type="simple">
<list-item>
<p>&#x2022; Stacking Task: This experiment demonstrates the robot&#x2019;s ability to learn and generalize constraints about discrete object attributes. The goal is for the robot to <italic>&#x201c;Pick a cube and place it on top of a same-colored cylinder&#x201d;.</italic>
</p>
</list-item>
<list-item>
<p>&#x2022; Sorting Task: The second experiment emphasizes the importance of continuous object attribute constraints. The task is to <italic>&#x201c;Sort three objects of the same shape, each with a size difference of 0.01 m from the next, from the largest to the smallest in right-to-left order.&#x201d;</italic>
</p>
</list-item>
<list-item>
<p>&#x2022; Serving Task: The final experiment highlights the relevance of situational objects and spatial constraints. The robot is taught to <italic>&#x201c;Choose another fruit from the box containing the same fruit as the fruit on the plate and place it next to the fruit on the plate.&#x201d;</italic>
</p>
</list-item>
</list>
</p>
</sec>
</sec>
<sec sec-type="results" id="s5">
<title>5 Results</title>
<p>This section introduces the outcomes of the demonstrations and the results of reproduction for each individual task.</p>
<sec id="s5-1">
<title>5.1 Stacking task</title>
<sec id="s5-1-1">
<title>5.1.1 Demonstration results</title>
<p>During the learning phase of the stacking task, the robot consistently picks a cube and places it on top of a cylinder. The user varies the colour of the cube and cylinder in each demonstration. All objects utilized during the demonstration and reproduction phases are depicted in <xref ref-type="fig" rid="F4">Figure 4</xref>. It&#x2019;s important to note that not all objects are used during the training phase. Some are introduced in the reproduction phase to validate the generalization capabilities of our proposed approach. As depicted in <xref ref-type="fig" rid="F5">Figure 5</xref>, three distinct demonstrations were executed. The initial demonstration involved a red cube and a red cylinder, the subsequent demonstration incorporated a blue cube and a blue cylinder, and the final demonstration utilized a green cube and a green cylinder. The configurations of these objects within each demonstration are illustrated in <xref ref-type="fig" rid="F5">Figure 5A</xref>. <xref ref-type="fig" rid="F5">Figure 5B</xref> provides a graphical representation of the end-effector positions and orientations, with the number of keyframes in each demonstration being 6, 4, and 3, respectively. Lastly, <xref ref-type="fig" rid="F5">Figure 5C</xref> presents the scenes corresponding to the final object-centric keyframes in each demonstration.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Variety of Geometric Objects for Stacking Task. Objects O1 (green cube), O2 (red cube), O3 (blue cube), and O5 (green cylinder), O6 (red cylinder), O7 (blue cylinder) are observed during the demonstration phase. Objects O4 (yellow cube), O8 (yellow cylinder), and O9 (red torus) are allocated for the reproduction phase.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g004.tif"/>
</fig>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Demonstrations of the Stacking Task: This figure presents three distinct demonstrations, each arranged vertically and utilizing a unique set of objects. The first demonstration involves objects O2 and O6, the second incorporates objects O3 and O7, and the third employs objects O1 and O5. Each demonstration: <bold>(A)</bold> Depicts the initial scene before the demonstrations. <bold>(B)</bold> Graphically represents the recorded end-effector positions (x, y, z) and orientations (q<sub>x</sub>, q<sub>y</sub>, q<sub>z</sub>, q<sub>w</sub>) relative to the robot&#x2019;s base frame. <bold>(C)</bold> Showcases the concluding object-centric keyframes. </p>
</caption>
<graphic xlink:href="frobt-11-1340334-g005.tif"/>
</fig>
<p>We define the variable &#x3b6; based on the gripper status. At the start of each demonstration for a scene, its value is set to &#x201c;on the table&#x201d;. If the gripper status changes from open to closed in a keyframe, &#x3b6; is defined as &#x201c;grasping.&#x201d; Conversely, if it changes from closed to open, &#x3b6; is defined as &#x201c;on the table&#x201d;. In this example, only the positions of the cube and &#x3b6; change. Therefore, the reference frame for the second keyframe is the cube object location. Then, &#x3b6; changes from &#x201c;grasping&#x201d; to &#x201c;on the table&#x201d; for the cube. In this situation, our proposed method identifies a reference frame using other objects in the scene. Since the cube is on top of the cylinder, the reference frame for the third keyframe is defined as the cylinder location, as outlined in <xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref>. The first keyframe group, c<sub>1</sub>, represents the recorded scene before the demonstration; since it is not important in learning to generalize the robot&#x2019;s motion, its reference is taken as the robot base frame.</p>
<p>After we apply Eq. <xref ref-type="disp-formula" rid="e8">8</xref>, in this task, we only have two object-centric keyframes. The attributes for each keyframe are computed as follows:<disp-formula id="equ1">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>toy</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>geometric</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>cube</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mn>0.04</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>grasping</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ18">
<mml:math id="m98">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">3</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>toy</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>geometric</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>cylinder</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mn>0.03</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>on&#x2009;the&#x2009;table</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Given that there is no constraint on the colour of the two objects, Eq. <xref ref-type="disp-formula" rid="e12">12</xref> has been used to establish a colour constraint between the objects, denoted as <bold>c</bold>
<sub>
<bold>2</bold>
</sub>
<bold>&#x2013;c</bold>
<sub>
<bold>3</bold>
</sub>: {Colour}. More specifically, the colours of the object to be picked and the object on which this object must be placed are always identical. Since the environment only includes reference objects, there are no spatial constraints.</p>
<p>The size of the objects remains consistent in each demonstration. For continuous attributes, we always examine the correlation between two objects. However, there isn&#x2019;t enough variation in size to determine whether a correlation exists. Therefore, size is not considered an inter-object constraint between two objects.</p>
</sec>
<sec id="s5-1-2">
<title>5.1.2 Reproduction results</title>
<p>Following the teaching phase, the reproduction phase for this task takes the desired object attributes for each keyframe, discrete inter-object constraint, and the desired relative poses between the end-effector and reference objects as inputs. In the reproduction phase, we apply the learned constraints in three different environments, as depicted in <xref ref-type="fig" rid="F6">Figure 6</xref>. In the first case, a green cylinder and a blue cube are present on the table. Although these objects meet the desired object attributes for each keyframe, the colour of the objects must be identical. Therefore, there are no suitable candidate objects for the task. In the second case, there are four objects. While the yellow cube and green cube are candidate objects for the second keyframe, the yellow cylinder and blue cylinder are chosen for the third keyframe. Since the colour of both objects must be the same, after the inter-object constraints in the reproduction steps, the resulting candidate objects are the yellow cube and yellow cylinder.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Reproduction Scenarios for the Stacking Task: This figure presents three distinct scenarios. Scene 1: an invalid configuration with a green cylinder and blue cube. Scene 2: a yellow cylinder and cube, and a blue cylinder and green cube, with the robot successfully stacking the yellow objects. Scene 3: a red cylinder, cube, and torus, with the robot stacking the red cube on the cylinder. Each column sequence illustrates the robot&#x2019;s motion.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g006.tif"/>
</fig>
<p>In the third case, a red cube, a red cylinder, and a red torus are present. Since the torus shape is not suitable for any keyframes, it is eliminated after applying the desired constraints for each keyframe.</p>
<p>The second and third cases are successful because there are objects that satisfy the constraints between demonstrations and discrete inter-object constraints. The outcomes of these two scenes are illustrated in the last column of <xref ref-type="fig" rid="F6">Figure 6</xref>. The object attributes and the candidate objects in each environment are provided in <xref ref-type="sec" rid="s12">Supplementary Table S1</xref>.</p>
<p>This result proves that the robot does not need task-unimportant keyframes to execute the task. Moreover, using only three demonstrations, we could extract the user&#x2019;s intention. In reproduction, although there are multiple and unseen objects, the robot could generalize the task. The learning of discrete object attributes within the demonstrations indeed underscores the potential of our contribution. Without this contribution, the robot would execute the first case in the reproduction phase, even though it does not meet the user&#x2019;s expectations.</p>
</sec>
</sec>
<sec id="s5-2">
<title>5.2 Sorting task</title>
<sec id="s5-2-1">
<title>5.2.1 Demonstration results</title>
<p>Three demonstrations of the sorting task are sufficient to achieve this task. The objects used in the training and reproduction are illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Variety of Geometric Objects for Sorting Task. Objects O<sub>1</sub> (purple torus), O<sub>2</sub> (dark blue torus), O<sub>3</sub> (blue torus), O<sub>4</sub> (dark green torus), O<sub>5</sub> (yellow torus), O<sub>7</sub> (dark blue rectangle prism), O<sub>8</sub> (blue rectangle prism), O<sub>9</sub> (yellow rectangle prism), O<sub>13</sub> (red table) are observed during the demonstration phase. Objects O<sub>6</sub> (red rectangle prism), O<sub>10</sub> (orange octagon), O<sub>11</sub> (dark green octagon), O<sub>12</sub> (yellow octagon) are added for the reproduction phase.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g007.tif"/>
</fig>
<p>As depicted in <xref ref-type="fig" rid="F8">Figure 8</xref>, three distinct demonstrations were executed. Each environment used in the demonstrations contains three objects with the same shape and a red rectangular area for arranging these objects. The shapes and sizes of the objects to be sorted vary between demonstrations. The configurations of these objects within each demonstration are illustrated in <xref ref-type="fig" rid="F8">Figure 8A</xref>. Since shape is a discrete attribute, using two different shapes is enough to teach that shape is not a significant factor between demonstrations. <xref ref-type="fig" rid="F8">Figure 8B</xref> provides a graphical representation of the end-effector positions and orientations, with the number of keyframes in each demonstration being 7, 11, and 9, respectively.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Demonstrations of the Sorting Task: This figure presents three distinct demonstrations, each arranged vertically and utilizing a unique set of objects. The first demonstration involves objects O<sub>1</sub>, O<sub>2</sub>, O<sub>3</sub>, and O<sub>13</sub>, the second incorporates objects O<sub>7</sub>, O<sub>8</sub>, O<sub>9</sub> and O<sub>13</sub>, and the third employs objects O<sub>3</sub>, O<sub>4</sub>, O<sub>5</sub>, and O<sub>13</sub>. Each demonstration: <bold>(A)</bold> Depicts the initial scene before the demonstrations. <bold>(B)</bold> Graphically represents the recorded end-effector positions (x, y, z) and orientations (q<sub>x</sub>, q<sub>y</sub>, q<sub>z</sub>, q<sub>w</sub>) relative to the robot&#x2019;s base frame. <bold>(C)</bold> Showcases the concluding object-centric keyframes.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g008.tif"/>
</fig>
<p>Lastly, <xref ref-type="fig" rid="F8">Figure 8C</xref> presents the scenes corresponding to the final object-centric keyframes in each demonstration. After calculating the constraints both between and within demonstrations, it&#x2019;s found that the shape of objects within demonstrations is consistent. The size difference between two objects to be picked in order is 0.01 m in the same scene. The size of the largest object ranges between 0.08 m and 0.04 m, the second-largest object&#x2019;s size ranges between 0.07 m and 0.03 m, and the smallest object&#x2019;s size ranges between 0.06 m and 0.02 m. There are six object-centric keyframes, and the constraints for each keyframe are as follows:<disp-formula id="equ2">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>toy</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>geometric</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.04</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mn>0.08</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>grasping</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ3">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>furniture</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>table</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>red</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>rectangle</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mn>0.3</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>on&#x2009;the&#x2009;table</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ4">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">4</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>toy</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>geometric</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.03</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mn>0.07</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>grasping</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ5">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mrow>
<mml:mn mathvariant="bold">5</mml:mn>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>furniture</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>table</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>red</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>rectangle</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mn>0.3</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>on&#x2009;the&#x2009;table</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ6">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">6</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>toy</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>geometric</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.02</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mn>0.06</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>grasping</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ7">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">7</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>furniture</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>table</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>red</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>rectangle</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mn>0.3</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;&#x2009;</mml:mtext>
<mml:mtext>on&#x2009;the&#x2009;table</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Our analysis indeed goes beyond examining similar attributes of just two objects. For instance, this task involves four objects. When a discrete constraint exists between multiple keyframes within a demonstration, we combine these constraints in order. This approach allows for the faster elimination of possible incorrect objects during reproduction. A discrete constraint occurs when there is an equality. The discrete constraints between each keyframe group are defined as follows:<disp-formula id="equ8">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2013;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">4</mml:mn>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2013;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">6</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Shape</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The shape of the reference objects in the second, fourth, and sixth keyframes must be the same. When continuous constraints of more than two keyframes exist, we separate them. This separation is necessary because we use a linear regression model, where one variable is the independent variable, and we always take the previous keyframe as the independent variable. The next keyframe, therefore, must be the dependent variable. Consequently, for continuous constraints, we combine each keyframe group as a pair, instead of a combination of them like discrete constraints. For example, in this task, a correlation exists between the c<sub>2</sub>-c<sub>4</sub>, c<sub>4</sub>-c<sub>6</sub>, and c<sub>2</sub>-c<sub>6</sub> keyframe groups. Instead of recording all three constraints, we only record the c<sub>2</sub>-c<sub>4</sub> and c<sub>4</sub>-c<sub>6</sub> constraints. For example, in this task, shape is a discrete inter-object constraint between three objects, so we record these constraints as c<sub>2</sub>-c<sub>4</sub>-c<sub>6</sub>, which also reduced the data size used in reproduction. As a result, continuous attribute constraints between keyframes are,<disp-formula id="equ9">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2013;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Size&#x2009;</mml:mtext>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.01</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ10">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x2013;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mn>6</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Size&#x2009;</mml:mtext>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.01</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
</sec>
<sec id="s5-2-2">
<title>5.2.2 Reproduction results</title>
<p>In reproduction, the robot executes three different scenarios as shown in <xref ref-type="fig" rid="F9">Figure 9</xref>. The scenes are entirely new to the robot. While some objects have been seen in demonstrations, others, such as the octagon shapes, are new. During the demonstration and in the reproduction, we record objects using indexes. The robot does not receive any hints about the order of objects to be selected as a reference frame. Since only reference objects were used during demonstrations, the learning outcome does not include any constraints related to spatial relations. The input for reproduction consists of desired object attributes for each keyframe, discrete inter-object constraints, continuous inter-object constraints, desired relative poses between the end-effector and objects, and the new environment&#x2019;s object attributes and poses. The candidate objects of each keyframe for each case are summarized in <xref ref-type="sec" rid="s12">Supplementary Table S2</xref>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Reproduction Scenarios for the Sorting Task: This figure presents three distinct scenarios. Each scene is valid for the task&#x2019;s description. Each column sequence illustrates the robot&#x2019;s motion in each scene.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g009.tif"/>
</fig>
<p>In the first environment, after applying constraints between demonstrations, the candidate objects for the c<sub>2</sub>-c<sub>4</sub>-c<sub>6</sub> groups can be any toy in the environment. However, the object for c<sub>3</sub>-c<sub>5</sub>-c<sub>7</sub> is the red rectangle area on the table. These possible object sets will be decreased by applying discrete constraints within the task. This step eliminates the rectangle prism objects since the robot requires three objects with the same shape. However, for c<sub>2</sub>-c<sub>4</sub>-c<sub>6</sub>, there are still multiple possible objects. Next, continuous inter-object constraints are applied. When the robot chooses three objects in the correct order, it satisfies the task objectives. Therefore, the task is executed successfully. In fact, the dark green octagon size falls within the required size range for the largest object. However, if the robot takes this object as the largest, the robot cannot find any object for the smallest one. Consequently, it learns to eliminate these options.</p>
<p>In the second scenario, although the shape of all objects satisfies inter-object discrete constraints, the size difference between the second and the third object is 0.02 m instead of 0.01 m. Therefore, the robot chooses rectangle prisms. In the last scenario, the sizes of objects are suitable for the task. However, there are no three rectangle prisms, therefore the robot chooses octagons.</p>
<p>This result indeed demonstrates that continuous inter-object constraints can also be crucial for the task. Without this contribution, the robot would not be able to learn the size difference between the objects. Moreover, focusing on only object-centric keyframes assists in dealing with multiple objects, as each object is relevant to a keyframe.</p>
</sec>
</sec>
<sec id="s5-3">
<title>5.3 Serving task</title>
<sec id="s5-3-1">
<title>5.3.1 Demonstration results</title>
<p>In this task, the fruit on the plate and the fruit in the box are the same. So, multiple objects with the same attributes can be on the table at the same time; these objects are represented with different indexes, as depicted in <xref ref-type="fig" rid="F10">Figure 10</xref>.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Variety of Objects for Serving Task. Objects O<sub>1</sub> (red apple), O<sub>8</sub> (red apple), O<sub>2</sub> (green apple), O<sub>9</sub> (green apple), O<sub>3</sub> (lemon), O<sub>10</sub> (lemon), O<sub>5</sub> (banana), O<sub>7</sub> (brown box), O<sub>6</sub> (green plate) are observed during the demonstration phase. Objects O<sub>4</sub>, O<sub>11</sub>, O<sub>12</sub> (orange), O<sub>13</sub> (banana), O<sub>14</sub> (brown box) are added for the reproduction phase.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g010.tif"/>
</fig>
<p>As depicted in <xref ref-type="fig" rid="F11">Figure 11</xref>, three distinct demonstrations were executed. As illustrated in <xref ref-type="fig" rid="F11">Figure 11A</xref>, in the first demonstration, there&#x2019;s a lemon on the plate and a lemon and a green apple on the box. The second demonstration has a green apple on the plate, and a lemon and another green apple on the box. The third demonstration has a red apple on the plate, and a red apple and a banana on the box. <xref ref-type="fig" rid="F11">Figure 11B</xref> graphically represents the end-effector&#x2019;s positions and orientations, with each demonstration consisting of 4, 3, and 3 keyframes, respectively. Finally, <xref ref-type="fig" rid="F11">Figure 11C</xref> visually presents the scenes associated with the last object-centric keyframes for each demonstration. There are two different reference objects and two object-centric keyframes for these objects. The constraints between these keyframe groups are:<disp-formula id="equ11">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Fruit</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.03</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.09</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>grasping</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ12">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">3</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Kitchen&#x2009;Tool</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Plate</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Green</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Rectangle</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mn>0.26</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>on&#x2009;the&#x2009;table</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Demonstrations of the Serving Task: This figure presents three distinct demonstrations, each arranged vertically and utilizing a unique set of objects. The first demonstration involves objects two lemons, a green apple, a box and a plate. The second incorporates objects two green apples, one lemon, a plate and a box, and the third employs objects two red apples, one banana, a box and a plate. Each demonstration: <bold>(A)</bold> Depicts the initial scene before the demonstrations. <bold>(B)</bold> Graphically represents the recorded end-effector positions (x, y, z) and orientations (q<sub>x</sub>, q<sub>y</sub>, q<sub>z</sub>, q<sub>w</sub>) relative to the robot&#x2019;s base frame. <bold>(C)</bold> Showcases the concluding object-centric keyframes.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g011.tif"/>
</fig>
<p>In this task, there are situational objects that are automatically defined from the data. The labels for these spatial constraints are as follows: <bold>c</bold>
<sub>
<bold>2</bold>
</sub> is on top of <bold>c</bold>
<sub>
<bold>4</bold>
</sub>, <bold>c</bold>
<sub>
<bold>2</bold>
</sub> is next to <bold>c</bold>
<sub>
<bold>5</bold>
</sub>, and <bold>c</bold>
<sub>
<bold>3</bold>
</sub> has <bold>c</bold>
<sub>
<bold>6</bold>
</sub>. As a result, the number of keyframes&#x2019; group has increased. The common attributes in each new keyframe group are:<disp-formula id="equ13">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">4</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Furniture</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Box</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Light&#x2009;Brown</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Rectangle</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mn>0.21</mml:mn>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>on&#x2009;the&#x2009;table&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ14">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mrow>
<mml:mn mathvariant="bold">5</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Fruit</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Sphere</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.04</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.07</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>on&#x2009;the&#x2009;table&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="equ15">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mrow>
<mml:mn mathvariant="bold">6</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Fruit</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Sphere</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0.04</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0.07</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>on&#x2009;the&#x2009;table&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The first reference object (the fruit to be picked) is on the box and next to another fruit. Moreover, the second reference object (plate) has a fruit. The instance of the fruit next to the first reference object is the same as the instance of the fruit on the second reference object. This is computed as the discrete constraints between the fifth and sixth keyframe groups:<disp-formula id="equ16">
<mml:math id="m96">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">5</mml:mn>
</mml:msub>
<mml:mo>&#x2010;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">6</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Colour</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>Instance&#x2009;Of</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Moreover, since the size of the two objects in the fifth keyframe and sixth keyframe is the same, a continuous attribute constraint is found between them:<disp-formula id="equ17">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">5</mml:mn>
</mml:msub>
<mml:mo>&#x2010;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msub>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mn mathvariant="bold">6</mml:mn>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mtext>Size&#x2009;</mml:mtext>
<mml:mo>&#x2192;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1.0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Indeed, while the proposed method initially seeks situational objects using spatial constraints for each task, in previous experiments, all objects in the scene were reference objects.</p>
<p>However, the user&#x2019;s intention may also depend on situational objects in the environment. Consequently, all these objects are added as additional groups. It&#x2019;s important to note that the end-effector goal poses depend solely on reference objects. Situational objects merely assist in selecting these reference objects.</p>
</sec>
<sec id="s5-3-2">
<title>5.3.2 Reproduction results</title>
<p>In the reproduction phase, the robot performs the task in three different environments, as depicted in <xref ref-type="fig" rid="F12">Figure 12</xref>. In the first scenario, the robot encounters a new fruit, an orange, which is on the box. A lemon is situated next to the orange, and another lemon is on the plate. Since the two situational objects must be the same, the lemons are situational objects. The robot selects the orange as a reference object, with the plate being the other reference object.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Reproduction Scenarios for the Serving Task: This figure presents three distinct scenarios. Scene 1: a lemon is on the plate, an orange and a lemon are on the box. Scene 2: There are two boxes. A red apple and a green apple are on one of the boxes. A green apple and an orange are on the other box. An orange is on the plate. Scene 3: A banana and an orange are on the table. A banana and an orange on the box, and an orange on the plate. Each column sequence illustrates the robot&#x2019;s motion.</p>
</caption>
<graphic xlink:href="frobt-11-1340334-g012.tif"/>
</fig>
<p>In the second scenario, the complexity is increased with the presence of two boxes, each containing two fruits, and an orange on the plate. The orange on the plate serves as one of the situational objects. The robot then looks for the same situational object (another orange) on the box. As a result, the robot selects the box containing an orange and a green apple, successfully picking up the green apple.</p>
<p>In the final scenario, there are two fruits (a banana and an orange) on the box, two fruits (a banana and an orange) on the table, and an orange on the plate. The orange on the plate is one of the situational objects, but there is one orange on the table and one orange on the box. Even though both fruits are next to a banana, the banana must be on the box due to the spatial constraints between c<sub>2</sub> and c<sub>4</sub>. Therefore, the robot picks up the banana on the box. The resulting candidate objects for each keyframe are provided in <xref ref-type="sec" rid="s12">Supplementary Table S3</xref>.</p>
<p>However, due to the spherical shape of the fruits in c<sub>5</sub> and c<sub>6</sub>, if there is a banana on the plate, the reproduction will fail. Of course, this situation can be solved by an additional demonstration with the banana placed on the plate. However, as we mentioned before, obtaining these constraints requires the teacher to make demonstrations that meet their own intentions.</p>
<p>Indeed, this experiment demonstrates the power of keyframe representations in handling situations where the user&#x2019;s intentions may be associated with situational objects. Without this contribution, the robot might choose any fruit without considering its location or the fruit on the plate, thereby not fully meeting the user&#x2019;s expectations. This approach ensures that the robot&#x2019;s actions align more closely with the user&#x2019;s intentions, showcasing the effectiveness of our method.</p>
</sec>
</sec>
</sec>
<sec id="s6">
<title>6 Discussion and conclusion</title>
<p>This study employs keyframe demonstrations to capture user intentions addressing two distinct challenges. The first challenge involves extracting object-centric keyframes from a varying number of keyframes in each demonstration. The second challenge pertains to inferring the user&#x2019;s intention based on the attributes of objects and generalizing this understanding to environments that have not been previously encountered.</p>
<p>In the existing literature, the number of keyframes is often treated as equal (<xref ref-type="bibr" rid="B31">Jankowski et al., 2022</xref>) or even when the number of keyframes varies, the task is simple enough to involve only one object (<xref ref-type="bibr" rid="B4">Akgun et al., 2012b</xref>; <xref ref-type="bibr" rid="B5">Akgun and Thomaz, 2016</xref>). These keyframes, obtained from multiple demonstrations, must be clustered to generalize robot motion to different configurations. This clustering is dependent on the performance of alignment and clustering methods. In our study, we diverge from previous research by expanding the keyframe to include attributes of all objects in the environment, their poses, and the robot&#x2019;s configuration. This additional information identifies which keyframes are necessary for the task, ensuring an equal number of keyframes are obtained in each demonstration. As it is easier to group equal numbers of keyframes, each with the same meaning, it eliminates the need for the alignment method, unlike other studies. This elimination guarantees keyframe groups that include accurate relative poses between the end-effector and objects, without being affected by task-unimportant keyframes which may cause coarser relative poses than the user intended. Our experiments indicate that when the number of keyframes changes in each demonstration, only object-centric keyframes are sufficient to reproduce the task. Moreover, this reduction of keyframes aids in addressing the second challenge.</p>
<p>The second challenge is to uncover the user intention about the attributes of objects to generalize the robot&#x2019;s task to unseen environments. In the literature, studies such as those conducted by <xref ref-type="bibr" rid="B17">Cubek et al. (2015)</xref>, focus solely on pick-and-place tasks involving two objects. They examine the similarities of object attributes between demonstrations. Similarly, in the study conducted by <xref ref-type="bibr" rid="B14">Chao et al. (2010)</xref>, the similarities of both discrete and continuous attributes between demonstrations are given. These studies do not address similarities within a demonstration, such as the color of two objects having to be the same in a demo. Our solution can handle these situations. For example, in the first and second examples, the user desires depend on both similarities between and within the demonstration.</p>
<p>In the studies carried out by <xref ref-type="bibr" rid="B24">Fonooni et al. (2016)</xref>, a semantic network includes all the necessary concepts and objects that the robot can work with, and nodes of the semantic network are discrete. However, in our experiments, to show that the shape of objects is not important, we do not have to include all possible shapes that the robot can work within our attributes. Moreover, the approach proposed by <xref ref-type="bibr" rid="B24">Fonooni et al. (2016)</xref>, does not include similarity for continuous attributes. In the second experiment, we show the importance of continuous object attributes both between and within demonstrations. The size difference between each object can be important for a sorting task. These previous studies (<xref ref-type="bibr" rid="B14">Chao et al., 2010</xref>; <xref ref-type="bibr" rid="B17">Cubek et al., 2015</xref>; <xref ref-type="bibr" rid="B24">Fonooni et al., 2016</xref>) do not focus on the attributes of situational objects in the environment. For example, in the last experiment, we showed that for a serving task, the fruit on the box and on the plate must be the same. This constraint is determined using situational objects&#x2019; attributes. In summary, the similarities between object attributes are revealed in a more comprehensive way with complicated tasks involving multiple objects than the studies in the literature by taking advantage of keyframes. The constraints obtained as a result of this solution determine the purpose of the task, and the robot successfully performs the task by finding objects that meet these constraints, even if they are objects it has not seen before.</p>
<p>As discussed in the related work, the concept of semantics can be associated with a variety of applications. Our approach, when applied to object search applications such as those conducted by <xref ref-type="bibr" rid="B27">Guo et al. (2022)</xref> and <xref ref-type="bibr" rid="B59">Zhang et al. (2023)</xref>, has certain limitations, particularly in the representation of object attributes. We demonstrate that if discrete object attributes match, then this attribute becomes a constraint. However, in the case of a service robot, the object could be located anywhere, necessitating a discrete object attribute to define its location. For instance, if a user demonstrates to the robot how to pick up a mug from the kitchen, the robot may not be able to find the mug if it is in another room during reproduction. Typically, navigation or object search applications use ontology-based graph representation, which could enhance our approach.</p>
<p>When applied to semantic navigation applications like <xref ref-type="bibr" rid="B50">Qi et al. (2020)</xref> and <xref ref-type="bibr" rid="B2">Achat et al. (2023)</xref>, another limitation arises. We use keyframes to represent robot targets, but they may not generate constraints like &#x201c;avoid the walls&#x201d;. Adding trajectories could provide these constraints. Some studies propose hybrid movements as input (<xref ref-type="bibr" rid="B3">Akgun et al., 2012a</xref>), where both keyframes and trajectories can be provided. We aim to enhance our approach to accept trajectories as input in future work.</p>
<p>We believe that our approach has potential for applications in human-robot interaction. Although natural language is user-friendly, as shown in studies by <xref ref-type="bibr" rid="B33">Kartmann et al. (2021)</xref> and <xref ref-type="bibr" rid="B10">Bucker et al. (2023)</xref>, these applications typically require a mapping from sentences to low-level robot motion and generally require substantial data. With our approach, users can implicitly teach their high-level preferences to the robot through physical interactions. Another potential application could be semantic grasping, as demonstrated by <xref ref-type="bibr" rid="B42">Moon and Lee (2020)</xref>, if we add attributes such as &#x201c;fragile&#x201d; and &#x201c;not fragile&#x201d;, and record forces by the robot. Physical human-robot interactions when the robot executes the task can be another application that could benefit from our approach. For example, when there is a misalignment between the user and robot, the user records multiple trajectory traces around the object, as shown in the study by <xref ref-type="bibr" rid="B40">Louren&#xe7;o et al. (2023)</xref>. With the use of our approach, the user could show the trajectories around &#x201c;computer,&#x201d; &#x201c;mobile phone&#x201d; and the robot could understand implicitly that it needs to avoid &#x201c;electronic devices&#x201d;.</p>
<p>While not explored in this study, our method can effectively communicate the reasons for task failures to the user when the robot cannot find a solution for a task. For instance, in the first experiment, when there was a green cylinder and a blue cube, the robot would not perform the task because the colors did not match, thus not meeting the user&#x2019;s intentions. In fact, the robot can understand the reason for failure, and these reasons can be informative for the user. The user can rectify these situations by providing an additional demonstration or changing objects based on the feedback received.</p>
<p>Similarly, in some contexts, there may be more than one group of candidate objects that meet the constraints derived from demonstrations. In fact, our proposed method can accurately identify all possible candidate objects. If combined with behavior trees (<xref ref-type="bibr" rid="B28">Gustavsson et al., 2022</xref>), the robot motions can be repeated until there are no objects in the environment that satisfy the desired constraints. However, we do not allow the robot to repeat the task when there are multiple correct object groups, as this may be incorrect for some tasks. For example, in the scenario discussed in the first experiment, when there is a red cylinder, red cube, yellow cylinder, and yellow cube, our algorithm provides us with the object pairs and the necessary keyframes and reference poses of the robot. However, when we apply this situation to a sorting task, the red area will already be full after the robot executes the task once. By avoiding the use of an object more than once for the same conceptual keyframe, we can easily handle this problem. However, this will not solve situations such as collecting fruits in a box, where multiple fruits can be placed into the same box. Although the message we want to convey can be understood by the robot, there may be situations that the robot cannot cope with. Asking the user about these situations through communication and obtaining his or her approval may take our work one step further. We believe that human-robot interaction is crucial to correcting the robot&#x2019;s behavior or updating the task.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s12">Supplementary Material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>BS: Conceptualization, Methodology, Writing&#x2013;original draft. JE: Methodology, Supervision, Writing&#x2013;review and editing. ET: Supervision, Writing&#x2013;review and editing. RvM: Supervision, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research, authorship, and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s12">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2024.1340334/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2024.1340334/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Video3.MP4" id="SM1" mimetype="video/mp4" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM2" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Video2.MP4" id="SM3" mimetype="video/mp4" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Video1.MP4" id="SM4" mimetype="video/mp4" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ab Azar</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Shahmansoorian</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Davoudi</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>From inverse optimal control to inverse reinforcement learning: a historical review</article-title>. <source>Annu. Rev. Control</source> <volume>50</volume>, <fpage>119</fpage>&#x2013;<lpage>138</lpage>. <pub-id pub-id-type="doi">10.1016/j.arcontrol.2020.06.001</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Achat</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Serdel</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Marzat</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Moras</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A case study of semantic mapping and planning for autonomous robot navigation</article-title>. <source>SN Comput. Sci.</source> <volume>5</volume>, <fpage>55</fpage>. <pub-id pub-id-type="doi">10.1007/s42979-023-02370-3</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akgun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cakmak</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2012a</year>). <article-title>Keyframe-based learning from demonstration: method and evaluation</article-title>. <source>Int. J. Soc. Robot.</source> <volume>4</volume>, <fpage>343</fpage>&#x2013;<lpage>355</lpage>. <pub-id pub-id-type="doi">10.1007/s12369-012-0160-0</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Akgun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cakmak</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2012b</year>). &#x201c;<article-title>Trajectories and keyframes for kinesthetic teaching: a human-robot interaction perspective</article-title>,&#x201d; in <source>
<italic>Proceedings of the seventh annual ACM/IEEE international conference on Human-Robot Interaction</italic>, (ACM)</source>, <fpage>391</fpage>&#x2013;<lpage>398</lpage>. <pub-id pub-id-type="doi">10.1145/2157689.2157815</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akgun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Simultaneously learning actions and goals from demonstration</article-title>. <source>Auton. Robots</source> <volume>40</volume>, <fpage>211</fpage>&#x2013;<lpage>227</lpage>. <pub-id pub-id-type="doi">10.1007/s10514-015-9448-x</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arora</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Doshi</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A survey of inverse reinforcement learning: challenges, methods and progress</article-title>. <source>Artif. Intell.</source> <volume>297</volume>, <fpage>103500</fpage>. <pub-id pub-id-type="doi">10.1016/j.artint.2021.103500</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Avaei</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Van Der Spaa</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Peternel</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kober</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>An incremental inverse reinforcement learning approach for motion planning with separated path and velocity preferences</article-title>. <source>Robotics</source> <volume>12</volume>, <fpage>61</fpage>. <pub-id pub-id-type="doi">10.3390/robotics12020061</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Billard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Calinon</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dillmann</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Schaal</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>Robot programming by demonstration</article-title>,&#x201d; in <source>Springer handbook of robotics</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Siciliano</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Khatib</surname>
<given-names>O.</given-names>
</name>
</person-group> (<publisher-loc>Berlin, Heidelberg</publisher-loc>: <publisher-name>Springer Berlin Heidelberg</publisher-name>), <fpage>1371</fpage>&#x2013;<lpage>1394</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-540-30301-5_60</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bobu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bajcsy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fisac</surname>
<given-names>J. F.</given-names>
</name>
<name>
<surname>Dragan</surname>
<given-names>A. D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Learning under misspecified objective spaces</article-title>,&#x201d; in <source>Conference on robot learning</source> (<publisher-loc>Zurich, Switzerland</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>796</fpage>&#x2013;<lpage>805</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bucker</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Figueredo</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Haddadin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kapoor</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vemprala</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>LATTE: LAnguage trajectory TransformEr</article-title>,&#x201d; in <source>2023 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-loc>London, United Kingdom</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>7287</fpage>&#x2013;<lpage>7294</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA48891.2023.10161068</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bucker</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Figueredo</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Haddadinl</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kapoor</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bonatti</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Reshaping robot trajectories using Natural Language commands: a study of multi-modal data alignment using transformers</article-title>,&#x201d; in <source>2022 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-loc>Kyoto, Japan</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>978</fpage>&#x2013;<lpage>984</lpage>. <pub-id pub-id-type="doi">10.1109/IROS47612.2022.9981810</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bullard</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Akgun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chernova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Grounding action parameters from demonstration</article-title>,&#x201d; in <source>2016 25th IEEE international symposium on robot and human interactive communication (RO-MAN)</source> (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>253</fpage>&#x2013;<lpage>260</lpage>. <pub-id pub-id-type="doi">10.1109/ROMAN.2016.7745139</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Calinon</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Learning from demonstration (programming by demonstration)</article-title>,&#x201d; in <source>Encyclopedia of robotics</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Ang</surname>
<given-names>M. H.</given-names>
</name>
<name>
<surname>Khatib</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Siciliano</surname>
<given-names>B.</given-names>
</name>
</person-group> (<publisher-loc>Berlin, Heidelberg</publisher-loc>: <publisher-name>Springer Berlin Heidelberg</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-642-41610-1_27-1</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cakmak</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Interactive task learning with discrete and continuous features</article-title>,&#x201d; in <source>Visual representations and reasoning</source>.</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chella</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dindo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Infantino</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>Learning high-level tasks through imitation</article-title>,&#x201d; in <source>2006 IEEE/RSJ international conference on intelligent robots and systems</source> (<publisher-loc>Beijing, China</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3648</fpage>&#x2013;<lpage>3654</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2006.281721</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chernova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2014</year>). <source>Robot learning from human teachers</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>. <pub-id pub-id-type="doi">10.1007/978-3-031-01570-0</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cubek</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ertel</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Palm</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>High-level learning from demonstration with conceptual spaces and subspace clustering</article-title>,&#x201d; in <source>2015 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-loc>Seattle, WA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2592</fpage>&#x2013;<lpage>2597</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2015.7139548</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yue</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>SEE-CSOM: sharp-edged and efficient continuous semantic occupancy mapping for mobile robots</article-title>. <source>IEEE Trans. Ind. Electron.</source> <volume>71</volume>, <fpage>1718</fpage>&#x2013;<lpage>1728</lpage>. <pub-id pub-id-type="doi">10.1109/TIE.2023.3262857</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Off-line programming framework for sorting task based on human-demonstration</article-title>. <source>IEEE Trans. Autom. Sci. Eng.</source>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/TASE.2024.3376712</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eiband</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Liebl</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Willibald</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Online task segmentation by merging symbolic and data-driven skill recognition during kinesthetic teaching</article-title>. <source>Robot. Auton. Syst.</source> <volume>162</volume>, <fpage>104367</fpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2023.104367</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fitzgerald</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Goel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Human-guided object mapping for task transfer</article-title>. <source>ACM Trans. Hum.-Robot Interact.</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1145/3277905</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rouhani</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tavakoli</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A therapist-taught robotic System for assistance during gait therapy targeting foot drop</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>4</volume>, <fpage>407</fpage>&#x2013;<lpage>413</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2018.2890674</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fonooni</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hellstr&#xf6;m</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Janlert</surname>
<given-names>L.-E.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Learning high-level behaviors from demonstration through semantic networks</article-title>,&#x201d; in <source>4th international conference on agents and artificial intelligence (ICAART)</source>, <fpage>419</fpage>&#x2013;<lpage>426</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fonooni</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hellstr&#xf6;m</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Janlert</surname>
<given-names>L.-E.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Priming as a means to reduce ambiguity in learning from demonstration</article-title>. <source>Int. J. Soc. Robot.</source> <volume>8</volume>, <fpage>5</fpage>&#x2013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1007/s12369-015-0292-0</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fonooni</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jevti&#x107;</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hellstr&#xf6;m</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Janlert</surname>
<given-names>L.-E.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Applying Ant Colony Optimization algorithms for high-level behavior learning and reproduction from demonstrations</article-title>. <source>Robot. Auton. Syst.</source> <volume>65</volume>, <fpage>24</fpage>&#x2013;<lpage>39</lpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2014.12.001</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>French</surname>
<given-names>K. D.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Goeddel</surname>
<given-names>E. M.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jenkins</surname>
<given-names>O. C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Super intendo: semantic robot programming from multiple demonstrations for taskable robots</article-title>. <source>Robot. Auton. Syst.</source> <volume>166</volume>, <fpage>104397</fpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2023.104397</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ban</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Sadoun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Obaidat</surname>
<given-names>M. S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>An efficient object navigation strategy for mobile robots based on semantic information</article-title>. <source>Electronics</source> <volume>11</volume>, <fpage>1136</fpage>. <pub-id pub-id-type="doi">10.3390/electronics11071136</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gustavsson</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Iovino</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Styrud</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Combining context awareness and planning to learn behavior trees from demonstration</article-title>,&#x201d; in <source>2022 31st IEEE international conference on robot and human interactive communication (RO-MAN)</source> (<publisher-loc>Napoli, Italy</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1153</fpage>&#x2013;<lpage>1160</lpage>. <pub-id pub-id-type="doi">10.1109/RO-MAN53752.2022.9900603</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hristov</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ramamoorthy</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Learning from demonstration with weakly supervised disentanglement</article-title>,&#x201d; in <source>9th international Conference on learning representations, ICLR</source>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hussein</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gaber</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Elyan</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Jayne</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Imitation learning: a survey of learning methods</article-title>. <source>ACM Comput. Surv.</source> <volume>50</volume>, <fpage>1</fpage>&#x2013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1145/3054912</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jankowski</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Racca</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Calinon</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>From key positions to optimal basis functions for probabilistic adaptive control</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>7</volume>, <fpage>3242</fpage>&#x2013;<lpage>3249</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2022.3146614</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kaelbling</surname>
<given-names>L. P.</given-names>
</name>
<name>
<surname>Oates</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hernandez</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Finney</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>Learning in worlds with objects</article-title>,&#x201d; in <source>Working notes of the AAAI stanford spring symposium on learning grounded representations</source>, <fpage>31</fpage>&#x2013;<lpage>36</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kartmann</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Asfour</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Semantic scene manipulation based on 3D spatial object relations and language instructions</article-title>,&#x201d; in <source>2020 IEEE-RAS 20th international conference on humanoid robots (humanoids)</source> (<publisher-loc>Munich, Germany</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>306</fpage>&#x2013;<lpage>313</lpage>. <pub-id pub-id-type="doi">10.1109/HUMANOIDS47582.2021.9555802</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kawasaki</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Takahashi</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Bottom-up action modeling via spatial factorization for serving food</article-title>. <source>Adv. Robot.</source> <volume>35</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1080/01691864.2021.1919548</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kollmitz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Koller</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Boedecker</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Burgard</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Learning human-aware robot navigation from physical interaction via inverse reinforcement learning</article-title>,&#x201d; in <source>2020 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>11025</fpage>&#x2013;<lpage>11031</lpage>. <pub-id pub-id-type="doi">10.1109/IROS45743.2020.9340865</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kretzschmar</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Spies</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sprunk</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Burgard</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Socially compliant mobile robot navigation via inverse reinforcement learning</article-title>. <source>Int. J. Robot. Res.</source> <volume>35</volume>, <fpage>1289</fpage>&#x2013;<lpage>1307</lpage>. <pub-id pub-id-type="doi">10.1177/0278364915619772</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kurenkov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Akgun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>An evaluation of GUI and kinesthetic teaching methods for constrained-keyframe skills</article-title>,&#x201d; in <source>2015 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-loc>Hamburg, Germany</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3608</fpage>&#x2013;<lpage>3613</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2015.7353881</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kwak</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Whang</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Jo</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Semantic grasping via a knowledge graph of robotic manipulation: a graph representation learning approach</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>7</volume>, <fpage>9397</fpage>&#x2013;<lpage>9404</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2022.3191194</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lioutikov</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Maeda</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Probabilistic segmentation applied to an assembly task</article-title>,&#x201d; in <source>2015 IEEE-RAS 15th international conference on humanoid robots (humanoids)</source> (<publisher-loc>Seoul, South Korea</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>533</fpage>&#x2013;<lpage>540</lpage>. <pub-id pub-id-type="doi">10.1109/HUMANOIDS.2015.7363584</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Louren&#xe7;o</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bobu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rojas</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Wahlberg</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Diagnosing and repairing feature representations under distribution shifts</article-title>,&#x201d; in <source>2023 62nd IEEE conference on decision and control (CDC)</source> (<publisher-loc>Singapore, Singapore</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3638</fpage>&#x2013;<lpage>3645</lpage>. <pub-id pub-id-type="doi">10.1109/CDC49753.2023.10383644</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Markley</surname>
<given-names>F. L.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Crassidis</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Oshman</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Averaging quaternions</article-title>. <source>J. Guid. Control Dyn.</source> <volume>30</volume>, <fpage>1193</fpage>&#x2013;<lpage>1197</lpage>. <pub-id pub-id-type="doi">10.2514/1.28949</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Moon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>B.-H.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Object-oriented semantic graph based natural question generation</article-title>,&#x201d; in <source>2020 IEEE international Conference on Robotics and automation (ICRA)</source> (<publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4892</fpage>&#x2013;<lpage>4898</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA40945.2020.9196563</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>M&#xfc;hlig</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hellbach</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Steil</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Goerick</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Task-level imitation learning using variance-based movement optimization</article-title>,&#x201d; in <source>2009 IEEE international conference on robotics and automation</source> (<publisher-loc>Kobe</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1177</fpage>&#x2013;<lpage>1184</lpage>. <pub-id pub-id-type="doi">10.1109/ROBOT.2009.5152439</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niekum</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Osentoski</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Konidaris</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chitta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Marthi</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Barto</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Learning grounded finite-state representations from unstructured demonstrations</article-title>. <source>Int. J. Robot. Res.</source> <volume>34</volume>, <fpage>131</fpage>&#x2013;<lpage>157</lpage>. <pub-id pub-id-type="doi">10.1177/0278364914554471</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Polden</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Larkin</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Van Duin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Norrish</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Recent progress on programming methods for industrial robots</article-title>. <source>Robot. Comput.-Integr. Manuf.</source> <volume>28</volume>, <fpage>87</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1016/j.rcim.2011.08.004</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pareek</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kesavadas</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>iART: learning from demonstration for assisted robotic therapy using LSTM</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>5</volume>, <fpage>477</fpage>&#x2013;<lpage>484</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2019.2961845</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Park</surname>
<given-names>F. C.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>B. J.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>Robot sensor calibration: solving AX&#x3d;XB on the Euclidean group</article-title>. <source>IEEE Trans. Robot. Autom.</source> <volume>10</volume>, <fpage>717</fpage>&#x2013;<lpage>721</lpage>. <pub-id pub-id-type="doi">10.1109/70.326576</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>P&#xe9;rez-D&#x2019;Arpino</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>C-LEARN: learning geometric constraints from demonstrations for multi-step manipulation in shared autonomy</article-title>,&#x201d; in <source>2017 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-loc>Singapore, Singapore</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4058</fpage>&#x2013;<lpage>4065</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2017.7989466</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>G&#xe4;rdenfors</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2000</year>). <source>Conceptual spaces: the geometry of thought</source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Building semantic grid maps for domestic robot navigation</article-title>. <source>Int. J. Adv. Robot. Syst.</source> <volume>17</volume>, <fpage>1729881419900066</fpage>. <pub-id pub-id-type="doi">10.1177/1729881419900066</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramirez-Amaro</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dean-Leon</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Bergner</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A semantic-based method for teaching industrial robots new tasks</article-title>. <source>KI - K&#xfc;nstl. Intell.</source> <volume>33</volume>, <fpage>117</fpage>&#x2013;<lpage>122</lpage>. <pub-id pub-id-type="doi">10.1007/s13218-019-00582-5</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ravichandar</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Polydoros</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Chernova</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Billard</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Recent advances in robot learning from demonstration</article-title>. <source>Annu. Rev. Control Robot. Auton. Syst.</source> <volume>3</volume>, <fpage>297</fpage>&#x2013;<lpage>330</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-control-100819-063206</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shek</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>B. Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Learning from physical human feedback: an object-centric one-shot adaptation method</article-title>,&#x201d; in <source>2023 IEEE international Conference on Robotics and automation (ICRA)</source> (<publisher-loc>London, United Kingdom</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>9910</fpage>&#x2013;<lpage>9916</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA48891.2023.10161416</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Steinmetz</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Nitsch</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Stulp</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Intuitive task-level programming by demonstration through semantic skill recognition</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>4</volume>, <fpage>3742</fpage>&#x2013;<lpage>3749</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2019.2928782</pub-id>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Inverse reinforcement learning-based time-dependent A&#x2a; planner for human-aware robot navigation with local vision</article-title>. <source>Adv. Robot.</source> <volume>34</volume>, <fpage>888</fpage>&#x2013;<lpage>901</lpage>. <pub-id pub-id-type="doi">10.1080/01691864.2020.1753569</pub-id>
</citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Villani</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pini</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Leali</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Secchi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fantuzzi</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Survey on human-robot interaction for robot programming in industrial applications</article-title>. <source>IFAC-Pap.</source> <volume>51</volume>, <fpage>66</fpage>&#x2013;<lpage>71</lpage>. <pub-id pub-id-type="doi">10.1016/j.ifacol.2018.08.236</pub-id>
</citation>
</ref>
<ref id="B57">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ye</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Alterovitz</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Demonstration-guided motion planning</article-title>,&#x201d; in <source>Robotics research</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Christensen</surname>
<given-names>H. I.</given-names>
</name>
<name>
<surname>Khatib</surname>
<given-names>O.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>291</fpage>&#x2013;<lpage>307</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-29363-9_17</pub-id>
</citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zanchettin</surname>
<given-names>A. M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Symbolic representation of what robots are taught in one demonstration</article-title>. <source>Robot. Auton. Syst.</source> <volume>166</volume>, <fpage>104452</fpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2023.104452</pub-id>
</citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Semantic grounding for long-term autonomy of mobile robots toward dynamic object search in home environments</article-title>. <source>IEEE Trans. Ind. Electron.</source> <volume>70</volume>, <fpage>1655</fpage>&#x2013;<lpage>1665</lpage>. <pub-id pub-id-type="doi">10.1109/TIE.2022.3159913</pub-id>
</citation>
</ref>
<ref id="B60">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ziaeetabar</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Aksoy</surname>
<given-names>E. E.</given-names>
</name>
<name>
<surname>W&#xf6;rg&#xf6;tter</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Tamosiunaite</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Semantic analysis of manipulation actions using spatial relations</article-title>,&#x201d; in <source>2017 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-loc>Singapore, Singapore</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4612</fpage>&#x2013;<lpage>4619</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2017.7989536</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>