<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" 'JATS-journalpublishing1-3-mathml3.dtd'>
<article article-type="research-article" dtd-version="1.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1660244</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2025.1660244</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Visuo-tactile feedback policies for terminal assembly facilitated by reinforcement learning</article-title>
<alt-title alt-title-type="left-running-head">Li et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2025.1660244">10.3389/frobt.2025.1660244</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Yuchao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3118951"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing&#x2013;original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jin</surname>
<given-names>Ziqi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3239696"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Jin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3239629"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Ma</surname>
<given-names>Daolin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2228868"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing&#x2013;review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>School of Ocean and Civil Engineering, Shanghai Jiao Tong University</institution>, <city>Shanghai</city>, <country country="CN">China</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>School of Mechanical Engineering, Shanghai Jiao Tong University</institution>, <city>Shanghai</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Daolin Ma, <email xlink:href="mailto:daolinma@sjtu.edu.cn">daolinma@sjtu.edu.cn</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-10-22">
<day>22</day>
<month>10</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1660244</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Li, Jin, Liu and Ma.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Li, Jin, Liu and Ma</copyright-holder>
<license>
<ali:license_ref start_date="2025-10-22">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Industrial terminal assembly tasks are often repetitive and involve handling components with tight tolerances that are susceptible to damage. Learning an effective terminal assembly policy in the real world is challenging, as collisions between parts and the environment can lead to slippage or part breakage. In this paper, we propose a safe reinforcement learning approach to develop a visuo-tactile assembly policy that is robust to variations in grasp poses. Our method minimizes collisions between the terminal head and terminal base by decomposing the assembly task into three distinct phases. In the first <italic>grasp</italic> phase, a vision-guided model is trained to pick the terminal head from an initial bin. In the second <italic>align</italic> phase, a tactile-based grasp pose estimation model is employed to align the terminal head with the terminal base. In the final <italic>assembly</italic> phase, a visuo-tactile policy is learned to precisely insert the terminal head into the terminal base. To ensure safe training, the robot leverages human demonstrations and interventions. Experimental results on PLC terminal assembly demonstrate that the proposed method achieves 100% successful insertions across 100 different initial end-effector and grasp poses, while imitation learning and online-RL policies yield only 9% and 0%, respectively.</p>
</abstract>
<kwd-group>
<kwd>visual perception</kwd>
<kwd>tactile sensing</kwd>
<kwd>multi-modal fusion</kwd>
<kwd>terminal assembly</kwd>
<kwd>reinforcement learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by National Natural Science Foundation of China (NSFC, Grant No. 12272220), Contemporary Amperex Technology Co., Limited (CATL) and SIEMENS AG. The funders were not involved in the study design, collection, analysis, interpretation of data, the writing of this article, or the decision to submit it for publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="3"/>
<equation-count count="3"/>
<ref-count count="52"/>
<page-count count="11"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Industrial Robotics and Automation</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Terminal assembly (<xref ref-type="bibr" rid="B31">McKee et al., 1985</xref>) is a precision manipulation task that involves part-to-part contact. Its four key sub-tasks&#x2014;part feeding, object reorientation, peg insertion, and terminal buckling&#x2014;have been widely investigated (<xref ref-type="bibr" rid="B31">McKee et al., 1985</xref>; <xref ref-type="bibr" rid="B11">Goldberg, 1993</xref>; <xref ref-type="bibr" rid="B26">Lozano-P&#xe9;rez, 1986</xref>; <xref ref-type="bibr" rid="B27">Lozano-Perez et al., 1984</xref>; <xref ref-type="bibr" rid="B34">Natarajan, 1989</xref>). Early research primarily focused on mechanical design aspects (<xref ref-type="bibr" rid="B26">Lozano-P&#xe9;rez, 1986</xref>; <xref ref-type="bibr" rid="B34">Natarajan, 1989</xref>) and motion planning strategies (<xref ref-type="bibr" rid="B11">Goldberg, 1993</xref>; <xref ref-type="bibr" rid="B27">Lozano-Perez et al., 1984</xref>; <xref ref-type="bibr" rid="B37">Qiao et al., 1995</xref>). With the aid of Computer-Aided Design (CAD), the assembly sequence can be pre-defined in simulation using accurate pose information (<xref ref-type="bibr" rid="B5">De Mello and Sanderson, 1989</xref>), enabling robots to plan the required actions for executing the assembly (<xref ref-type="bibr" rid="B19">Koga et al., 2022</xref>). Recently, reinforcement learning (RL)-based approaches have demonstrated potential in handling assembly tasks involving parts with complex geometries (<xref ref-type="bibr" rid="B47">Wen et al., 2022</xref>; <xref ref-type="bibr" rid="B23">Lian et al., 2021</xref>). However, RL remains challenging due to the requirement for frequent human inputs during learning (<xref ref-type="bibr" rid="B29">Luo et al., 2021</xref>) or high-precision sensors for collecting training data (<xref ref-type="bibr" rid="B47">Wen et al., 2022</xref>). 
Meanwhile, because the terminal head is irregularly shaped and easily damaged, there is also a need for a safe training and data collection method for learning assembly tasks.</p>
<p>Another challenge in terminal assembly tasks is that the precise initial pose of the terminal is often unknown. Since the grasped object is frequently visually occluded by the gripper, tactile sensing provides a more effective means for grasp pose estimation (<xref ref-type="bibr" rid="B35">Okumura et al., 2022</xref>; <xref ref-type="bibr" rid="B4">Dang et al., 2023</xref>). Although recent advances have demonstrated improved simulation accuracy for industrial insertion tasks (<xref ref-type="bibr" rid="B33">Narang et al., 2022</xref>), and successful Sim2Real transfer has been achieved for tactile-based insertion tasks (<xref ref-type="bibr" rid="B17">Kelestemur et al., 2022</xref>; <xref ref-type="bibr" rid="B46">Wang et al., 2022</xref>), simulating soft contacts between tactile sensors and objects with complex geometries remains an open challenge (<xref ref-type="bibr" rid="B45">Wang et al., 2021</xref>). This issue often hinders real-world transfer, as accurate object models are rarely publicly available. Additionally, a major obstacle in applying reinforcement learning (RL) to real-world terminal assembly tasks involving tactile feedback is the frequent slippage of parts caused by environmental collisions and the inherently smooth surface of the tactile sensor&#x2019;s gel pad. Such slippage makes it difficult for RL methods to succeed without human intervention or the use of a dedicated pose estimation algorithm to detect and correct misalignments.</p>
<p>In this work, we present a novel method to safely learn visuo-tactile feedback policies in the real world for terminal assembly tasks under grasp pose uncertainties, with inexpensive off-the-shelf sensors. Our approach draws on tactile and visual feedback to deal with the uncertainty of grasp pose, and on a safe RL training procedure to minimize damage during the training phase. We use Sample-Efficient Robotic reinforcement Learning (SERL) (<xref ref-type="bibr" rid="B30">Luo et al., 2024</xref>), a software suite that provides a well-designed foundation for robotic RL, to develop a data collection and training pipeline that minimizes collision between the part and its environment.</p>
<p>The whole pipeline can be divided into three steps: First, Training Reward Classifier: Labeling visual and tactile images from human instruction instances to train a reward classifier to decide when to give policy rewards throughout the RL training process. Second, Recording Demonstrations: To accelerate training, record a predetermined number of human-operated robot demonstrations to finish terminal assembly. This will serve as a demo buffer for RL. Third, Policy Training: Using the trained reward classifier and recorded demonstrations to complete the task training (during training, human interventions can be added to avoid collisions and speed up the training).</p>
<p>The main contributions of this paper are as follows: the development of a policy for complex terminal assembly in real-world scenarios, which leverages visual and tactile information through reinforcement learning and can be acquired in less than 60 min; the introduction of a safe exploratory strategy for reinforcement learning, accompanied by a secure data collection methodology grounded in a designated manual remote operation technique; and the presentation of experimental findings that indicate the policy attains a success rate of 100 out of 100 in Programmable Logic Controller (PLC) terminal assembly, thereby surpassing two baseline approaches that recorded success rates of 0 out of 100 and 9 out of 100, respectively.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>For many years, terminal assembly has been an essential part of robotics. The parts&#x2019; fragility, the moderate force during terminal buckling, the occlusions caused by the robot gripper, the grasp uncertainty from the acquisition process and its collision with the environment, and the precision required to control the robot for insertion render the task challenging. Early work approached this problem using CAD information to infer desired assembly sequences (<xref ref-type="bibr" rid="B5">De Mello and Sanderson, 1989</xref>) and generating designs of part feeders based on object geometry (<xref ref-type="bibr" rid="B34">Natarajan, 1989</xref>). Other work approached the problem from an algorithmic design perspective, with a focus on developing motion planning strategies for peg insertion (<xref ref-type="bibr" rid="B26">Lozano-P&#xe9;rez, 1986</xref>; <xref ref-type="bibr" rid="B37">Qiao et al., 1995</xref>).</p>
<p>Recently, learning-based methods have shown success on this task. This includes learning assembly policies with a physical robot via Sim2Real transfer (<xref ref-type="bibr" rid="B16">Johannink et al., 2019</xref>), online adaptation with meta-learning (<xref ref-type="bibr" rid="B41">Schoettler et al., 2020b</xref>; <xref ref-type="bibr" rid="B51">Zhao et al., 2022</xref>), reinforcement learning (<xref ref-type="bibr" rid="B29">Luo et al., 2021</xref>; <xref ref-type="bibr" rid="B40">Schoettler et al., 2020a</xref>), self-supervised data collection with impedance control (<xref ref-type="bibr" rid="B42">Spector and Di Castro, 2021</xref>), accurate state estimation (<xref ref-type="bibr" rid="B47">Wen et al., 2022</xref>), or decomposing the assembly algorithm into a residual policy that relies on conventional feedback control (<xref ref-type="bibr" rid="B16">Johannink et al., 2019</xref>). These approaches assume that the parts are grasped with a fixed pose. To overcome this assumption, Wen et al. (<xref ref-type="bibr" rid="B47">Wen et al., 2022</xref>) perform accurate pose estimation and motion tracking with a high-precision depth camera and use a behavioral cloning algorithm to insert the part. Spector et al. (<xref ref-type="bibr" rid="B42">Spector and Di Castro, 2021</xref>; <xref ref-type="bibr" rid="B43">Spector et al., 2022</xref>) proposed InsertionNet for industrial assembly, which requires contact between the part and the environment to occur during data collection, a process that is expensive and often impractical for fragile parts. <xref ref-type="bibr" rid="B36">Ozalp et al. (2024)</xref> made advancements in deep RL and inverse RL for robotic manipulation. In comparison, we use inexpensive tactile sensors and a safe human-guided data collection and RL procedure that does not require such contact.</p>
<p>In systems using only visual perception, grasped parts are often visually occluded by the gripper, and changes in environment light can affect the accuracy of visual recognition. However, tactile perception is not affected by these factors: the camera of the tactile sensor is placed inside the body, so the collected tactile images will not be blocked by itself or environmental objects; the light source for tactile images is a built-in LED strip, so the image brightness, color, etc. are also not affected by environment light. Meanwhile, tactile images contain rich physical information such as object geometric features, contact force, contact deformation, and displacement. Based on this information, the system can achieve more precise contact control. Therefore, tactile feedback can be an alternative sensing modality for grasp pose estimation. Recent work uses tactile images from vision-based tactile sensors such as GelSight (<xref ref-type="bibr" rid="B49">Yuan et al., 2017</xref>), DIGIT (<xref ref-type="bibr" rid="B21">Lambeta et al., 2020</xref>) and GelSlim 3.0 (<xref ref-type="bibr" rid="B44">Taylor et al., 2022</xref>) to estimate the relative pose and 3D motion field between grasped objects and tactile grippers. Meanwhile, many new types of tactile fingers (DexiTac (<xref ref-type="bibr" rid="B28">Lu et al., 2024</xref>)) and tactile sensors (Evetac (<xref ref-type="bibr" rid="B10">Funk et al., 2024</xref>)) are being applied in robotic operations. Li et al. (<xref ref-type="bibr" rid="B22">Li et al., 2014</xref>) use GelSight sensors, BRISK features and RANSAC to estimate grasp pose. GelSight produces high-quality 3D tactile images and can determine depth imprint, which improves feature detection by isolating the object from the background. DIGIT, a more affordable tactile sensor, provides a 2D RGB image but not the light incident direction (to generate the depth image). <xref ref-type="bibr" rid="B25">Liu et al. 
(2024)</xref> develop a method to reconstruct the 3-D tactile motion field in real time, which can provide rich tactile information (such as contact force) and serve as the foundation for many downstream tasks. <xref ref-type="bibr" rid="B17">Kelestemur et al. (2022)</xref> generate tactile image data in simulation for pose estimation of bottle caps, but simulating contact and physical interaction between tactile sensors and objects with more intricate geometry is still challenging (<xref ref-type="bibr" rid="B45">Wang et al., 2021</xref>). In this work, we feed tactile images from a real-world PLC terminal into the reinforcement learning process as part of the observation. Through analysis of the contact tactile information, these images enable the policy to precisely locate the terminal base and thereby minimize the contact force needed for terminal buckling.</p>
<p>Most prior work on tight tolerance assembly tasks (<xref ref-type="bibr" rid="B47">Wen et al., 2022</xref>; <xref ref-type="bibr" rid="B22">Li et al., 2014</xref>; <xref ref-type="bibr" rid="B6">Fan et al., 2019</xref>; <xref ref-type="bibr" rid="B7">Florence et al., 2022</xref>; <xref ref-type="bibr" rid="B48">Wu et al., 2025</xref>; <xref ref-type="bibr" rid="B24">Lin et al., 2024</xref>) leverages a single modality, such as vision, tactile, or force-torque, limiting the accuracy of the system due to occlusion, perspective effect, and sensory inaccuracy. Multi-modal systems have been explored to improve the robustness of automated insertion. <xref ref-type="bibr" rid="B42">Spector and Di Castro (2021)</xref>, <xref ref-type="bibr" rid="B43">Spector et al. (2022)</xref> use RGB cameras and a force-torque sensor for learning contact and impedance control. <xref ref-type="bibr" rid="B3">Chaudhury et al. (2022)</xref> couple vision and tactile data to perform localization and pose estimation, and demonstrate that vision helps with disambiguating tactile signals for objects without distinctive features. <xref ref-type="bibr" rid="B15">Ichiwara et al. (2022)</xref> leverage tactile and vision for deformable bag manipulation by performing auto-regressive prediction. Hansen et al. (<xref ref-type="bibr" rid="B13">Hansen et al., 2022</xref>) use a contact-gated tactile, vision and proprioceptive observation to train reinforcement learning policies. <xref ref-type="bibr" rid="B35">Okumura et al. (2022)</xref> also tackle the problem of grasp pose uncertainty for insertion by using Newtonian Variational Autoencoders to combine camera observations and tactile images. <xref ref-type="bibr" rid="B14">Hao et al. (2025)</xref>, <xref ref-type="bibr" rid="B52">Zhao et al. (2024)</xref> and <xref ref-type="bibr" rid="B50">Zhang et al. 
(2025)</xref> combined tactile information with large language models, achieving robotic arm manipulation of articulated objects and preference learning for insertion manipulation, respectively. They demonstrate results for USB insertion accounting for grasp pose uncertainty in one translation direction. In this work, we address terminal grasping, path planning, and terminal buckling as the whole reinforcement learning task. As the observation for the RL policy, we combine two wrist camera images, one side camera image, and two tactile gripper images into visual-tactile multi-modal information. In parallel, a human intervention procedure is designed to guarantee safe exploration for the policy. Our policy is able to handle both grasp pose rotation and translation uncertainty for PLC terminal assembly.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Problem statement and preliminaries</title>
<p>Overview: We consider a terminal assembly task for a 7-DoF robot with a parallel-jaw gripper and tactile sensors on both jaws. The end-effector has two wrist-mounted RGBD cameras, and one RGB side-camera is configured to capture the entire assembly scenario. The objective is to learn a policy that can robustly insert the terminal head into the terminal base when the part&#x2019;s pose within the gripper is unknown, while minimizing head-base collisions through human guidance during training. <xref ref-type="fig" rid="F1">Figure 1A</xref> shows the experimental setup.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>
<bold>(A)</bold> An overview of the terminal assembly task is shown in the figure. The goal is to grasp the terminal head from the placement tray and guide the robot to the terminal base. Two RGBD cameras on the wrist and one RGB side camera are used to observe the environment. The final step is to insert the terminal head clamped by the tactile sensors onto the terminal base using visual-tactile feedback. <bold>(B,C)</bold> Exhibition of the components and specifics of each segment of the terminal.</p>
</caption>
<graphic xlink:href="frobt-12-1660244-g001.tif">
<alt-text content-type="machine-generated">Panel (A) displays a robotic arm with wrist cameras, tactile sensors, terminal base, and terminal head. Panel (B) shows a device with a labeled DC input section. Panel (C) highlights a connector with pin measurements and numbered sections.</alt-text>
</graphic>
</fig>
<p>Details of the assembled terminal: As seen in <xref ref-type="fig" rid="F1">Figures 1B,C</xref>, our work accomplished the PLC terminal assembly. The terminal base and the terminal head are the two components that make up the hardware. Three barbed elastic latches and ten parallel-positioned pins make up the major mating components of the terminal head. The terminal base mating area is partially enlarged in the upper right corner, where the base&#x2019;s inner wall has three guide grooves that match the three spring clips, and the base&#x2019;s bottom has insertion holes that match the pins. The main challenge of this work is correctly inserting the pins into the holes and snapping the three spring clips into their respective guiding grooves without causing any damage to the pins, such as bending or breaking them. Therefore, we use tactile sensing and manual intervention to minimize collision forces during the assembly process to ensure the safety of the terminal hardware.</p>
<p>Robotic Reinforcement Learning: Robotic reinforcement learning tasks can be defined via a Markov Decision Process (MDP) <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="script">A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the state observation (e.g., the combination of the current environmental image, tactile image, and end-effector position), <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the action (e.g., the desired end-effector pose), <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a distribution over initial states, <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the unknown and potentially stochastic transition probabilities that depend on the system dynamics, and <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi mathvariant="script">S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi mathvariant="script">A</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the reward function, which encodes the task. An optimal policy <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is one that maximizes the cumulative expected value of the reward, i.e., <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where the expectation is taken with respect to the initial state distribution, transition probabilities, and policy <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>While the specification of the RL task is concise and simple, turning real-world robotic learning problems into RL problems requires care. First, the sample efficiency of the algorithm for learning <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is paramount: when the learning must take place in the real world, every minute and hour of training comes at a cost. Sample efficiency can be improved by using effective off-policy RL algorithms (<xref ref-type="bibr" rid="B20">Konda and Tsitsiklis, 1999</xref>; <xref ref-type="bibr" rid="B12">Haarnoja et al., 2018</xref>; <xref ref-type="bibr" rid="B9">Fujimoto et al., 2018</xref>), but it can also be accelerated by incorporating prior data and demonstrations (<xref ref-type="bibr" rid="B39">Rajeswaran et al., 2017</xref>; <xref ref-type="bibr" rid="B1">Ball et al., 2023</xref>; <xref ref-type="bibr" rid="B32">Nair et al., 2020</xref>), which is important to achieve the fastest training times. Beyond <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> optimization, robotic RL has to figure out reward functions from image observations and automate initial state resets. Particularly in contact-rich tasks, the controller layer interfaces MDP actions to low-level robot controllers, necessitating safety and precision so that the RL algorithm can experiment with random actions during training.</p>
</sec>
<sec sec-type="methods" id="s4">
<label>4</label>
<title>Methods</title>
<p>In this section, we introduce our visuo-tactile feedback policies with the assistance of human intervention to address the terminal assembly problem. The overview of our method is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overview of the learned three-phase assembly policy: <bold>(A)</bold> The vision guided policy <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> estimates the position of the terminal head and grasps it at an initial pose. <bold>(B)</bold> The tactile guided policy <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> estimates the grasp pose using the tactile image and aligns the z-axis of the terminal head with the insertion axis. <bold>(C)</bold> A vision-tactile multi-modal guided policy <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is used to assemble the terminal head and the terminal base. <bold>(D)</bold> Following the insertion of the elastic latches, a specific procedure is executed to insert the pins, and ultimately, the entire terminal assembly is successfully completed.</p>
</caption>
<graphic xlink:href="frobt-12-1660244-g002.tif">
<alt-text content-type="machine-generated">A multi-panel illustration of a robotic assembly process. Panel A shows vision-guided grasp with a robotic arm and object detection. Panel B displays tactile-guided alignment with tactile sensor data. Panel C illustrates vision and tactile-guided assembly with combined observations. Panel D shows the completed assembly process with a control procedure diagram. Each panel integrates robotic mechanisms and sensor data processing for precise manipulation.</alt-text>
</graphic>
</fig>
<sec id="s4-1">
<label>4.1</label>
<title>Real-world RL for terminal grasp and assembly</title>
<sec id="s4-1-1">
<label>4.1.1</label>
<title>Fundamental RL algorithm</title>
<p>For the reinforcement learning method to be used in terminal assembly, there are two requirements: it must (1) be effective and capable of making several gradient updates per time step, and (2) readily integrate prior data and then improve with further training. In pursuit of this objective, we expand upon the recently proposed RLPD algorithm (<xref ref-type="bibr" rid="B1">Ball et al., 2023</xref>), which has demonstrated remarkable outcomes in sample-efficient robotic learning. RLPD is an off-policy actor-critic reinforcement learning algorithm that builds on the success of temporal difference algorithms such as soft actor-critic (<xref ref-type="bibr" rid="B12">Haarnoja et al., 2018</xref>), but introduces some significant changes to satisfy the requirements above. Three main improvements are made by RLPD: (i) high update-to-data ratio training (UTD); (ii) symmetric sampling between on-policy and prior data, where half of each batch comes from the online replay buffer and half from prior data; and (iii) layer-norm regularization during training. In order to accelerate learning, this technique can either start from scratch or leverage prior data (e.g., demonstrations). Each step of the algorithm updates the parameters of a parametric Q-function <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and actor <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> according to the gradient of their respective loss functions (<xref ref-type="disp-formula" rid="e1">Equations 1</xref>, <xref ref-type="disp-formula" rid="e2">2</xref>):<disp-formula id="e1">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi mathvariant="script">H</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a target network, and the actor loss uses entropy regularization with an adaptively adjusted weight <inline-formula id="inf18">
<mml:math id="m20">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Every update step employs a sample-based approximation of each expectation, with half of the samples drawn from the replay buffer and the other half from the prior data (e.g., demonstrations). For efficient learning, multiple update steps are performed per time step in the environment, which is referred to as the update-to-data (UTD) ratio. Regularizing the critic with layer normalization enables higher UTD ratios and more effective training (<xref ref-type="bibr" rid="B1">Ball et al., 2023</xref>).</p>
<p>In this work, <inline-formula id="inf19">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are trained based on RLPD. And the three improvements of RLPD have also demonstrated their advantages in handling task-specific challenges in our experiments: (i) High UTD ratio: Our training shows that a UTD ratio of 20 reduced wall-clock training time by 47% compared to a UTD ratio of 5 (a common baseline in off-policy RL). This acceleration is critical for real-world assembly, where hardware access is constrained; (ii) Symmetric sampling: Replaying training data (without modifying hardware interactions) revealed that removing symmetric sampling (using 100% online data) increased Q-function loss variance by 63%&#x2014;indicating unstable learning from contact-driven data fluctuations. In contrast, symmetric sampling maintained loss variance <inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:mo>&#x2264;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>5% across epochs; (iii) Layer normalization: Omitting layer normalization caused the policy to diverge in 32% of training trials (vs. 0% with normalization), as it failed to adapt to sudden tactile signal shifts (e.g., from no contact to hard contact with the terminal base).</p>
</sec>
<sec id="s4-1-2">
<label>4.1.2</label>
<title>Classifier-based reward specification</title>
<p>Reward functions are difficult to specify by hand when learning with image observations, as the robot typically requires some sort of perception system just to determine if the task was performed successfully. While some tasks can accommodate hand-specified rewards based on the location of the end effector (under the assumption that the object is held rigidly in the gripper), most tasks require rewards to be deduced from images. In this case, the reward function can be provided by a binary classifier that takes in the state observation <inline-formula id="inf22">
<mml:math id="m24">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and outputs the probability of a binary &#x201c;event&#x201d; <inline-formula id="inf23">
<mml:math id="m25">
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, corresponding to successful completion. The reward is then given by <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>This classifier can be trained either using hand specified positive and negative examples, or via an adversarial method called VICE (<xref ref-type="bibr" rid="B8">Fu et al., 2018</xref>). The latter addresses a reward exploitation problem that can arise when learning with classifier based rewards, and removes the need for negative examples in the classifier training set: when the RL algorithm optimizes the reward <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, it can potentially discover &#x201c;adversarial&#x201d; states that fool the classifier <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to erroneously output high probabilities. VICE addresses this issue by adding all states visited by the policy into the training set for the classifier with negative labels, and updating the classifier after each iteration. In this way, the RL process is analogous to a generative adversarial network (GAN), with the policy acting as the generator and the reward classifier acting as the discriminator. We trained corresponding classifiers for <inline-formula id="inf27">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf28">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in this work. We label the visual image observation for <inline-formula id="inf29">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as a positive example <inline-formula id="inf30">
<mml:math id="m32">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>200</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> when the gripper successfully grabs the terminal head in the initial bin, and the others as negative examples <inline-formula id="inf31">
<mml:math id="m33">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>800</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. In the case of <inline-formula id="inf32">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the visual-tactile multi-modal observation is set as a positive example <inline-formula id="inf33">
<mml:math id="m35">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>600</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> when the terminal head&#x2019;s elastic latches get inserted into the terminal base; the other situations are set as negative examples <inline-formula id="inf34">
<mml:math id="m36">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2400</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. The rationale for the <inline-formula id="inf35">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> ratio lies in the fact that, through our repeated experiments, classifiers trained on datasets adhering to this ratio yield the highest classification accuracy.</p>
</sec>
<sec id="s4-1-3">
<label>4.1.3</label>
<title>Actor and learner nodes</title>
<p>In order to decouple inferring actions and updating policies, this work incorporates alternatives for training and acting in tandem, as seen in <xref ref-type="fig" rid="F3">Figure 3</xref>. In sample-efficient real-world learning tasks with large UTD ratios, we discovered that this was advantageous. Our policy reduces the overall wall-clock time spent training in the real world while maintaining the control frequency at a fixed rate, which is essential for tasks requiring instant feedback and reactions, like deformable objects and contact-rich manipulations (e.g., terminal assembly). This is achieved by separating the actor and learner on two separate threads.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Policy training and real-world robot architecture. The system consists of three parallel processes: the actor, which chooses actions; the learner node, which runs the training code; and the robot environment, which executes the actions from the actor and contributes data back to the learner.</p>
</caption>
<graphic xlink:href="frobt-12-1660244-g003.tif">
<alt-text content-type="machine-generated">Flowchart depicting a parallel training synchronization system with Actor and Learner Nodes. The Actor Node generates actions and receives transition tuples from the Robot Environment, which includes a gym, robot controller, and sensors. The Learner Node processes these via learner threads, queue, and a replay buffer. The system periodically synchronizes policies. Arrows indicate data flow labeled as &#x22;Actions&#x22; and &#x22;Transition Tuples.&#x22;</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Supervised learning for tactile guided alignment</title>
<p>Data Collection: The terminal head is fixed in the initial bin throughout data collection. We explore grasp pose variations in 3-DoF (<inline-formula id="inf36">
<mml:math id="m38">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> translation and <inline-formula id="inf37">
<mml:math id="m39">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-axis rotation <inline-formula id="inf38">
<mml:math id="m40">
<mml:mrow>
<mml:mi mathvariant="italic">roll</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <xref ref-type="fig" rid="F2">Figure 2A</xref> left). We perform uniform random sampling over the range <inline-formula id="inf39">
<mml:math id="m41">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>6</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mi mathvariant="normal">mm</mml:mi>
</mml:math>
</inline-formula>, <inline-formula id="inf40">
<mml:math id="m42">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>7</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mi mathvariant="normal">mm</mml:mi>
</mml:math>
</inline-formula>, <inline-formula id="inf41">
<mml:math id="m43">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mi mathvariant="normal">rad</mml:mi>
</mml:math>
</inline-formula> for <inline-formula id="inf42">
<mml:math id="m44">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">roll</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, with 12, 10 and 60 samples respectively. The robot closes the gripper with a force of <inline-formula id="inf43">
<mml:math id="m45">
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> at each of the sampled poses and records the pair of tactile image readings and <inline-formula id="inf44">
<mml:math id="m46">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">roll</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. We collect 7,200 pairs of tactile images (<inline-formula id="inf45">
<mml:math id="m47">
<mml:mrow>
<mml:mn>700</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>400</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels, RGB) with the XENSE G1-WS vision-based tactile sensor as data points in 300 min.</p>
<p>Alignment Policy: We adopted RegNet 3.2 GF (<xref ref-type="bibr" rid="B38">Radosavovic et al., 2020</xref>) as the backbone of the policy network and replaced its last layer with a linear layer producing 3 outputs. Using the aforementioned data&#x2014;comprising pairs of tactile images (<xref ref-type="fig" rid="F2">Figure 2B</xref>, <inline-formula id="inf46">
<mml:math id="m48">
<mml:mrow>
<mml:mn>700</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>800</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> pixels, RGB) corresponding to grasp poses of the PLC terminal (<xref ref-type="fig" rid="F1">Figure 1C</xref>)&#x2014;we trained an alignment policy <inline-formula id="inf47">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> that outputs the desired end-effector displacement <inline-formula id="inf48">
<mml:math id="m50">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">roll</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> to align the terminal head with the terminal base (<xref ref-type="fig" rid="F2">Figure 2B</xref>) given a tactile image. Tactile image augmentation was performed by randomly jittering brightness and contrast within the range <inline-formula id="inf49">
<mml:math id="m51">
<mml:mrow>
<mml:mi mathvariant="script">U</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0.8</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1.2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>; the jitter range settings were influenced to a certain extent by the geometric features of the grasped terminal head. Regarding hyperparameters, we used a batch size of 128, an initial learning rate of 1e-3 with a decay factor of 0.99 every 100 gradient steps, mean squared error as the loss function, and the Adam optimizer (<xref ref-type="bibr" rid="B18">Kingma, 2014</xref>). These hyperparameters represent optimal values determined through multiple experiments based on the collected raw data and are task-adaptable rather than universal, requiring further adjustment when using different tactile sensors or grasping different objects in future work.</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Impedance controller for terminal assembly</title>
<p>During the experiment, we found that the choice of controllers can heavily affect the final performance. This is more pronounced for contact-rich manipulation. In this work (<xref ref-type="fig" rid="F2">Figures 2C,D</xref>), an overly stiff controller might bend the fragile pins and make insertion difficult, while an overly compliant controller might struggle to move the object into position quickly.</p>
<p>A typical setup for robotic RL employs a two-layered control hierarchy, where an RL policy produces setpoint actions at a much lower frequency than the downstream real-time controller. The RL controller can set targets for the low-level controller, but such targets may lead to physically undesirable consequences&#x2014;especially in contact-rich manipulation tasks&#x2014;if not regulated by a robust low-level control mechanism. To this end, the impedance controller is integrated into this hierarchy as a core component, with its framework encompassing a spring-damper-based force objective and a critical error-bounding safety constraint. A typical impedance control objective for this controller (<xref ref-type="disp-formula" rid="e3">Equation 3</xref>) is<disp-formula id="e3">
<mml:math id="m52">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mo>&#x307;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ff</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">cor</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf50">
<mml:math id="m53">
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf51">
<mml:math id="m54">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the measured pose of the end-effector, and <inline-formula id="inf52">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the target pose computed by the upstream controller. Here, <inline-formula id="inf53">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ff</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the feed-forward force (used to compensate for static loads like gravity), and <inline-formula id="inf54">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">cor</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the Coriolis force (to mitigate dynamic disturbances from robot motion). This force objective is then converted into joint space torques by multiplying with the Jacobian transpose, offset by nullspace torques to maintain stable joint behavior. By design, the controller acts as a spring-damper system around the equilibrium set by <inline-formula id="inf55">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf56">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (stiffness coefficient) governs the response to position deviations, while <inline-formula id="inf57">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (damping coefficient) smooths motion to avoid oscillations. As described above, this system will yield large forces if <inline-formula id="inf58">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is far away from the current pose, which can lead to a hard collision or damage when the arm is in contact with objects (e.g., during PCB insertion). Therefore, it&#x2019;s crucial to constrain the interaction force generated by it. However, directly reducing <inline-formula id="inf59">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf60">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> will hurt the controller&#x2019;s positional accuracy. Thus, we bound <inline-formula id="inf61">
<mml:math id="m64">
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> so that <inline-formula id="inf62">
<mml:math id="m65">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2264;</mml:mo>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (a predefined safety threshold), and the generated force from the spring-damper system will be bounded to <inline-formula id="inf63">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf64">
<mml:math id="m67">
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the control frequency of the low-level controller. This error-bounding step completes the impedance controller framework, ensuring it balances precision and safety for real-world robotic RL tasks.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experiments</title>
<p>In this section, we introduce the experimental setup of the assembly task and the evaluation of the proposed methods.</p>
<sec id="s5-1">
<label>5.1</label>
<title>Experiment setup</title>
<p>We consider a terminal assembly task using a Franka Emika Panda Robot (7-DoF), equipped with a parallel-jaw gripper with XENSE G1-WS vision-based tactile sensors (used in AgiBot World Colosseo (<xref ref-type="bibr" rid="B2">Bu et al., 2025</xref>)) mounted on both jaws. The G1-WS sensor, independently developed by our laboratory, captures RGB tactile images with a fixed resolution of 700<inline-formula id="inf65">
<mml:math id="m68">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 400 pixels&#x2014;matching the sampling resolution of commercial GelSight (<xref ref-type="bibr" rid="B49">Yuan et al., 2017</xref>) mini sensors&#x2014;and offers advantages including a lower cost ($300) compared to GelSight mini ($500), a larger sensing area (17.5 (H)<inline-formula id="inf66">
<mml:math id="m69">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 29.5 (V) mm) than GelSight mini (18.6 (H)<inline-formula id="inf67">
<mml:math id="m70">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 14.3 (V) mm), and a wedge-shaped structure that adapts to diverse assembly environments. For the alignment policy training (4.2), paired tactile images from both gripper jaws were concatenated horizontally to form a single 700<inline-formula id="inf68">
<mml:math id="m71">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 800 pixel input, ensuring simultaneous capture of contact information from both sides of the terminal head.</p>
<p>The end effector is equipped with two wrist-mounted Intel RealSense D435i RGB-D depth cameras, selected for their high-quality 1,280<inline-formula id="inf69">
<mml:math id="m72">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 720 RGB imaging at up to 90 fps&#x2014;ensuring clear, temporally consistent visual data for dynamic manipulation scenarios. Time synchronization between visual and tactile data was achieved via two steps: (1) Hardware triggering: The D435i cameras and G1-WS tactile sensors were connected to a common GPIO trigger module, ensuring all sensors initiate sampling within a 1 ms time window; (2) Software timestamping: Each sensor frame (visual/tactile) was tagged with a high-precision system timestamp (resolution: 100<inline-formula id="inf70">
<mml:math id="m73">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> s) via Robot Operating System (ROS) topics. The D435i&#x2019;s 90 fps sampling frequency was downsampled to 30 fps (matching the G1-WS&#x2019;s 30 Hz rate) by selecting the visual frame with the timestamp closest to each tactile frame&#x2014;resulting in a maximum synchronization error of &#x3c;5 ms, which is negligible for terminal assembly tasks. This setup guarantees consistency between multi-modal observations.</p>
<p>The D435i&#x2019;s compact form factor minimizes interference with the gripper and assembly components, while its robust SDK (compatible with ROS and Python) facilitates seamless integration into our custom control pipeline. It also delivers reliable performance under varying lighting conditions, including low-light environments, ensuring stable data quality throughout experiments. Additionally, a jieruiweitong DF100 RGB side-camera is configured to capture the entire assembly scene (<xref ref-type="fig" rid="F1">Figure 1</xref>), chosen for its 1,280<inline-formula id="inf71">
<mml:math id="m74">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 720 resolution, 30 Hz sampling rate, and cost-effectiveness ($20).</p>
<p>At the beginning of each training and evaluation episode, the initial end effector pose is sampled uniformly <inline-formula id="inf72">
<mml:math id="m75">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> from a starting region <inline-formula id="inf73">
<mml:math id="m76">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: <inline-formula id="inf74">
<mml:math id="m77">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mtext>&#x2009;cm</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mtext>&#x2009;cm</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mtext>&#x2009;cm</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">roll</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mtext>&#x2009;rad</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>. Meanwhile, we initialize RL training from 30 teleoperated demonstrations (<xref ref-type="sec" rid="s4-1-1">Section 4.1.1</xref>) using a Joystick (BTP-A1N3S). All training was done on a single Nvidia RTX 4090 GPU.</p>
</sec>
<sec id="s5-2">
<label>5.2</label>
<title>Experimental procedure</title>
<p>At the beginning of each test experiment, the end effector is set to the initial pose sampled from <inline-formula id="inf75">
<mml:math id="m78">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="fig" rid="F2">Figure 2A</xref> left). From this starting pose, the robot first executes the grasp policy <inline-formula id="inf76">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to visually servo to and grasp the terminal head&#x2014;leveraging RGB-D data from the D435i cameras for precise localization of the terminal head in the initial bin. During the removal of the terminal head, minor jitter introduced by <inline-formula id="inf77">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> may lead to a collision between the terminal head and the initial bin, thereby causing an error in the grasping posture. Specifically, the gripper remains vertically aligned downward, whereas the terminal head exhibits misalignment with the receptacle in both translational and rotational dimensions (<xref ref-type="fig" rid="F2">Figure 2A</xref> right).</p>
<p>Then the robot activates the align policy <inline-formula id="inf78">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which processes tactile images from the G1-WS sensors to estimate the terminal head&#x2019;s relative pose (y/z translation and roll rotation) and outputs corrective movements to align the terminal head&#x2019;s insertion axis with the terminal base (<xref ref-type="fig" rid="F2">Figure 2B</xref>). The G1-WS&#x2019;s large sensing area and high-resolution imaging ensure accurate pose estimation, while its wedge-shaped design avoids interference with the gripper during alignment.</p>
<p>After the alignment, the vision-tactile guided assembly policy <inline-formula id="inf79">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is executed to insert the elastic latches (<xref ref-type="fig" rid="F2">Figure 2C</xref>), fusing D435i visual data (for environmental context) and G1-WS tactile feedback (for contact detection). Due to the structural redundancy and ductility of the assembled PLC terminal, once all elastic latches are properly inserted, a simple vertical downward force applied to the terminal head is sufficient to ensure complete insertion of all pins. Accordingly, we developed an open-loop control program to execute the final pin insertion process (<xref ref-type="fig" rid="F2">Figure 2D</xref>). The robot then resets to the next initial sampled pose, waiting for the next test.</p>
<p>During the policy training and testing process, human intervention was triggered by a hybrid mechanism combining manual visual observation and automatic force sensing, with clearly defined termination conditions: (i) Successful termination: The robot successfully grasps the terminal head <inline-formula id="inf80">
<mml:math id="m83">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>grasp binary classifier output</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and completes the assembly after adjusting the grasping pose <inline-formula id="inf81">
<mml:math id="m84">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>assemble binary classifier output</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. (ii) Grasp failure intervention: Triggered when the <inline-formula id="inf82">
<mml:math id="m85">
<mml:mrow>
<mml:mtext>grasp binary classifier output</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for 5 consecutive seconds (indicating unstable grasp). Intervention was initiated via Joystick by the experimenter to manually re-grasp until the <inline-formula id="inf83">
<mml:math id="m86">
<mml:mrow>
<mml:mtext>grasp binary classifier output</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, after which the task terminates. (iii) Deviation/collision intervention: Triggered by two complementary cues: (a) Manual visual observation: the experimenter initiated intervention upon visually detecting the terminal head deviating from the terminal base or colliding with non-target components; (b) Automatic force sensing: The system automatically paused motion and prompted intervention if the EE force-torque sensor detected a collision force <inline-formula id="inf84">
<mml:math id="m87">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 30 N. Upon intervention, the experimenter manually completed assembly until the <inline-formula id="inf85">
<mml:math id="m88">
<mml:mrow>
<mml:mtext>assemble binary classifier output</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, then the task terminates. Notably, in both conditions (ii) and (iii), the data collected during manual intervention is stored as expert demonstration data into the replay buffers of <inline-formula id="inf86">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">grasp</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf87">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> respectively, to guide and accelerate policy training.</p>
</sec>
<sec id="s5-3">
<label>5.3</label>
<title>Comparison and ablation studies</title>
<p>Examine the function and significance of the RLPD algorithm: As outlined in <xref ref-type="sec" rid="s4-1-1">Section 4.1.1</xref>, the most distinctive characteristic of the RLPD algorithm lies in its integration of human prior demonstrations to guide the learning process, which effectively reduces both training time and sample complexity. To assess the necessity of these demonstrations, we compare our approach with the Twin Delayed Deep Deterministic Policy Gradient (TD3), an off-policy Actor-Critic algorithm derived from DDPG. TD3 belongs to the class of online reinforcement learning algorithms that require continuous interaction with the environment and rely solely on trial-and-error learning to discover optimal policies, without incorporating human demonstrations. The comparison is conducted under identical environmental settings: (1) Exploration noise: Gaussian noise with standard deviation &#x3d; 0.1 (applied to end-effector pose commands); (2) Learning rate: 1e-3 for actor/critic networks (Adam optimizer); (3) Training epochs: 200 epochs (1,000 steps per epoch); (4) Network architecture: Same 3-layer actor/critic structure (consistent with RLPD&#x2019;s base design).</p>
<p>Furthermore, to demonstrate that expert demonstrations alone are insufficient for task completion, we also evaluate a behavioral cloning (BC) baseline trained on 150 high-quality expert teleoperated demonstrations. This dataset size approximately matches the total amount of data stored in the RLPD replay buffer at convergence. To ensure fair comparison: (1) Network architecture: BC used the same RegNet 3.2 GF backbone as RLPD&#x2019;s alignment policy <inline-formula id="inf88">
<mml:math id="m91">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, with an output layer predicting end-effector poses; (2) Training epochs: 200 epochs (matching RLPD), batch size &#x3d; 128. It is important to note that this BC baseline utilizes five times more demonstration data than the number of demonstrations required by our method. Meanwhile, to intuitively verify the role of &#x201c;human prior demonstrations&#x201d; in the RLPD algorithm, we replaced the demo buffer with a subset of replay buffer data in one training session to isolate and examine the function of human demonstrations.</p>
<p>We report the results in <xref ref-type="table" rid="T1">Table 1</xref>, and show example executions in <xref ref-type="fig" rid="F4">Figure 4</xref>. Training the TD3 policy in the physical environment resulted in divergence across all conducted training trials. In each case, the terminal head collided with the terminal base during the training of <inline-formula id="inf89">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, causing significant changes to the relative grasp pose or inflicting damage to the pins and the tactile sensor gel pad. Such issues cannot be directly corrected due to the absence of a reliable recovery procedure that can systematically restore the grasp pose without human demonstrations. Our policies significantly outperform BC baselines, even when trained with five times fewer demonstrations than BC. This indicates that relying solely on demonstrations is insufficient for achieving optimal performance. In addition to achieving up to a tenfold improvement in success rate over BC methods, our approach also reduces training time by up to twofold. Removing real-time human intervention data from the buffer leads to a 68-percentage-point drop in success rate (from 100% to 32%), confirming the buffer&#x2019;s role in addressing rare failure modes (<xref ref-type="table" rid="T1">Table 1</xref>, RLPD (w/o demo)). We also observed from the aforementioned experiments that the terminal head rotation and translation estimated based on tactile images <inline-formula id="inf90">
<mml:math id="m93">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> exhibit a high degree of accuracy (see <xref ref-type="table" rid="T2">Table 2</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Results suggest that (1) frequent slippage and rotations of the terminal head caused by collisions with the terminal base lead to failure in training TD3, (2) BC trained solely on 150 human demonstrations is insufficient to produce an accurate assembly policy, and (3) human demonstrations play an important role in improving training efficiency and policy success rate. Our approach outperforms all baseline policies.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Algorithms</th>
<th align="center">&#x23; Of demos</th>
<th align="center">Env input</th>
<th align="center">Training time</th>
<th align="center">Success/Total</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">TD3</td>
<td align="center">0</td>
<td align="center">Yes</td>
<td align="center">285 min</td>
<td align="center">0/100</td>
</tr>
<tr>
<td align="center">BC</td>
<td align="center">150</td>
<td align="center">No</td>
<td align="center">105 min</td>
<td align="center">9/100</td>
</tr>
<tr>
<td align="center">RLPD (w/o demo)</td>
<td align="center">0</td>
<td align="center">Yes</td>
<td align="center">265 min</td>
<td align="center">32/100</td>
</tr>
<tr>
<td align="center">RLPD (Ours)</td>
<td align="center">30</td>
<td align="center">Yes</td>
<td align="center">55 min</td>
<td align="center">100/100</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Illustration of the robot performing terminal assembly with our method. The green box indicates a state where the robot receives classifier reward for completing the task.</p>
</caption>
<graphic xlink:href="frobt-12-1660244-g004.tif">
<alt-text content-type="machine-generated">Robotic arm performing a task above a flat surface with mounted components, including a small structure and electronic parts. The arm's movement is shown in a sequence of four images, demonstrating its range of motion.</alt-text>
</graphic>
</fig>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Mean and standard deviation of the error in estimating the relative grasp pose <inline-formula id="inf91">
<mml:math id="m94">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">roll</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of the terminal head using the tactile-based pose estimation policy <inline-formula id="inf92">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, evaluated over 100 sampled initial end effector poses.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Error Dimension</th>
<th align="center">
<inline-formula id="inf93">
<mml:math id="m96">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf94">
<mml:math id="m97">
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf95">
<mml:math id="m98">
<mml:mrow>
<mml:mi mathvariant="italic">roll</mml:mi>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>rad</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Mean Error</td>
<td align="center">8.63e-2</td>
<td align="center">1.28e-1</td>
<td align="center">5.76e-3</td>
</tr>
<tr>
<td align="center">Standard Deviation</td>
<td align="center">4.28e-3</td>
<td align="center">6.13e-2</td>
<td align="center">4.23e-3</td>
</tr>
<tr>
<td align="center">Success Threshold (ME)</td>
<td align="center">1.50e-1</td>
<td align="center">2.00e-1</td>
<td align="center">1.80e-2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Exploring Utility of Tactile and Vision Information: We study the relative benefits of using tactile and visual information for the assembly policy <inline-formula id="inf96">
<mml:math id="m99">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">assemble</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. We test three different approaches: (1) a Tactile Only approach (<xref ref-type="fig" rid="F2">Figure 2C</xref>, the lower part of Visuo-Tactile Observation), (2) a Vision Only approach (<xref ref-type="fig" rid="F2">Figure 2C</xref>, the upper part of Visuo-Tactile Observation), and (3) a Combined approach (Ours). We evaluate all three approaches following the same procedure as in <xref ref-type="sec" rid="s5-2">Section 5.2</xref> and report the results in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Ablation study comparing the single-modal Tactile Only and Vision Only approaches with a Combined two-modal approach that leverages both tactile and visual information.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Observation</th>
<th align="center">Training time</th>
<th align="center">Success/Total</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Tactile Only</td>
<td align="center">195 min</td>
<td align="center">23/100</td>
</tr>
<tr>
<td align="center">Vision Only</td>
<td align="center">60 min</td>
<td align="center">1/100</td>
</tr>
<tr>
<td align="center">Vision &#x2b; Tactile</td>
<td align="center">55 min</td>
<td align="center">100/100</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The Tactile Only model succeeded in 23 of 100 assembly attempts. However, its training time was more than three times that of the other two models. This is because, across much of the exploration range, no contact occurred between the terminal head and terminal base, resulting in static tactile sensor images. Consequently, a significant portion of the training process involved the policy searching for the position of the terminal base. These findings suggest that visual observation is essential for estimating the approximate location of the terminal base, enabling the policy to actively reduce the exploration space and accelerate learning. In contrast, the Vision Only model exhibited faster convergence during training but performed poorly in completing the assembly task, achieving only one success in 100 attempts. This limitation stems from the absence of fine-grained tactile feedback regarding contact events, highlighting the necessity of tactile sensing for millimeter-level positional estimation in contact-rich tasks. The multi-modal model, which integrates both tactile and visual inputs, outperforms either single-modal approach by combining tactile-based terminal head position prediction with vision-based implicit estimation of environmental states. This synergy demonstrates that the integration of tactile and visual observations effectively reduces uncertainties inherent in assembly tasks.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s6">
<label>6</label>
<title>Discussion</title>
<p>In conclusion, we propose an effective and safe methodology for acquiring a visuo-tactile insertion policy within real-world reinforcement learning (RL) environments characterized by unknown component positions and grasping configurations. This is achieved by leveraging human demonstrations to accelerate the training process while maintaining the safety of the components, alongside the implementation of a structured three-phase assembly framework that delineates the task into distinct stages&#x2014;grasping, alignment, and insertion&#x2014;facilitated by integrated tactile and visual feedback.</p>
<sec id="s6-1">
<label>6.1</label>
<title>Limitations</title>
<p>Although our results are promising, several limitations of the proposed approach remain. First, the generalizability of our method has yet to be validated across various assembly tasks, particularly those involving objects with more intricate geometric properties (e.g., non-prismatic components with curved mating surfaces) or scenarios where the physical dimensions significantly deviate from the scale of the tactile sensor (e.g., micro-assembly tasks with parts <inline-formula id="inf97">
<mml:math id="m100">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5 mm or large components <inline-formula id="inf98">
<mml:math id="m101">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 50 mm). The current tactile pose estimation policy <inline-formula id="inf99">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tac2pos</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is trained specifically on PLC terminals, and its performance degrades when applied to parts with distinct contact patterns (e.g., smooth metallic vs. textured plastic surfaces). Second, components composed of different materials may necessitate the application of distinct pose estimation algorithms: for example, slippery materials (e.g., Teflon-coated terminals) introduce slippage between the gripper and part, which the current tactile model does not explicitly account for. Third, during the collection of human demonstrations and the training phase, the unique characteristics of the assembled programmable logic controller (PLC) in this study require a human operator to manually detach the terminal head following each successful assembly to reset the environment. This manual intervention not only extends the training duration (adding 10 s per trial) but also introduces variability due to inconsistencies in human execution (e.g., varying detachment forces that alter the part placement in the initial bin).</p>
</sec>
<sec id="s6-2">
<label>6.2</label>
<title>Future work</title>
<p>To address these limitations, future research should focus on three main directions. First, generalizing the proposed methodology to encompass assembly tasks involving objects with diverse shapes, materials, and dimensions: this will involve developing few-shot tactile pose estimation models that adapt to new parts with minimal retraining data, as well as integrating material property estimation (e.g., friction coefficient) from tactile images to handle slippage&#x2014;directly addressing the need for multi-material terminal adaptation in industrial scenarios. Specifically, we aim to extend the current PLC terminal-focused framework to metallic, Teflon-coated, and composite-material terminals, where varying surface properties (e.g., friction coefficients ranging from 0.2 to 0.6) require adaptive tactile signal interpretation and grasp force adjustment. Second, the development of an automated reset learning framework tailored specifically for terminal insertion and extraction processes: this framework could leverage the existing <inline-formula id="inf100">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>tac</mml:mtext>
<mml:mn>2</mml:mn>
<mml:mtext>pos</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> policy to detect successful assembly, followed by a learned &#x201c;extraction policy&#x201d; that uses tactile feedback to safely detach the terminal head without human intervention&#x2014;significantly improving the efficiency and reliability of such systems. Concurrently, we will investigate batch assembly efficiency optimization by integrating real-time sensor drift compensation (e.g., calibrating tactile image brightness and depth accuracy across 100&#x2b; consecutive assembly cycles) and adaptive RL policy updates to mitigate performance fluctuations induced by environmental wear (e.g., gripper fatigue) or component batch variations. Third, optimizing the multi-modal policy for edge deployment: techniques such as model quantization and knowledge distillation will be explored to reduce the computational footprint of the RegNet backbone and RLPD-based policy, enabling real-time inference on embedded GPUs. Additionally, future work will investigate the integration of foundation models for visual-tactile fusion, which could eliminate the need for task-specific classifiers by leveraging pre-trained knowledge of object interactions. Finally, validating the method in industrial factory settings with variable lighting, vibration, and part tolerances will be critical to demonstrating its practical applicability&#x2014;with a focus on validating multi-material adaptation and batch efficiency improvements in real-world production lines.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>YL: Formal Analysis, Writing &#x2013; original draft, Methodology, Data curation, Validation, Investigation, Visualization, Writing &#x2013; review and editing. ZJ: Writing &#x2013; review and editing, Data curation. JL: Writing &#x2013; review and editing, Methodology, Software. DM: Resources, Funding acquisition, Writing &#x2013; review and editing, Supervision, Project administration.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2686736/overview">Rajkumar Muthusamy</ext-link>, Dubai Future Foundation, United Arab Emirates</p>
</fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2947013/overview">Chuanfei Hu</ext-link>, Southeast University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3086380/overview">Zhongpan Zhu</ext-link>, University of Shanghai for Science and Technology, China</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ball</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kostrikov</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Efficient online reinforcement learning with offline data</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>), <fpage>1577</fpage>&#x2013;<lpage>1594</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Agibot world colosseo: a large-scale manipulation platform for scalable and intelligent embodied systems</article-title>. <comment>arXiv preprint arXiv:2503.06669</comment>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chaudhury</surname>
<given-names>A. N.</given-names>
</name>
<name>
<surname>Man</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Atkeson</surname>
<given-names>C. G.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Using collocated vision and tactile sensors for visual servoing and localization</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>7</volume>, <fpage>3427</fpage>&#x2013;<lpage>3434</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2022.3146565</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Dang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Fusing vision and force: a framework of reinforcement learning for elastic peg-in-hole assembly</article-title>,&#x201d; in <source>2023 WRC symposium on advanced robotics and automation (WRC SARA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>De Mello</surname>
<given-names>L. H.</given-names>
</name>
<name>
<surname>Sanderson</surname>
<given-names>A. C.</given-names>
</name>
</person-group> (<year>1989</year>). &#x201c;<article-title>A correct and complete algorithm for the generation of mechanical assembly sequences</article-title>,&#x201d; in <source>1989 IEEE international conference on robotics and automation</source> (<publisher-name>IEEE Computer Society</publisher-name>), <fpage>56</fpage>&#x2013;<lpage>57</lpage>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tomizuka</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>A learning framework for high precision industrial assembly</article-title>,&#x201d; in <source>2019 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>811</fpage>&#x2013;<lpage>817</lpage>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Florence</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lynch</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ramirez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Wahid</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Downs</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Implicit behavioral cloning</article-title>,&#x201d; in <source>Conference on robot learning</source> (<publisher-name>PMLR</publisher-name>), <fpage>158</fpage>&#x2013;<lpage>168</lpage>.</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ghosh</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Variational inverse control with events: a general framework for data-driven reward definition</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>31</volume>. <pub-id pub-id-type="doi">10.48550/arXiv.1805.11686</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Fujimoto</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hoof</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Meger</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Addressing function approximation error in actor-critic methods</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1587</fpage>&#x2013;<lpage>1596</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Funk</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Helmut</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Chalvatzaki</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Calandra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Evetac: an event-based optical tactile sensor for robotic manipulation</article-title>. <source>IEEE Trans. Robotics</source> <volume>40</volume>, <fpage>3812</fpage>&#x2013;<lpage>3832</lpage>. <pub-id pub-id-type="doi">10.1109/tro.2024.3428430</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goldberg</surname>
<given-names>K. Y.</given-names>
</name>
</person-group> (<year>1993</year>). <article-title>Orienting polygonal parts without sensors</article-title>. <source>Algorithmica</source> <volume>10</volume>, <fpage>201</fpage>&#x2013;<lpage>225</lpage>. <pub-id pub-id-type="doi">10.1007/bf01891840</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Haarnoja</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Soft actor-critic: off-policy maximum entropy deep reinforcement learning with a stochastic actor</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1861</fpage>&#x2013;<lpage>1870</lpage>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Hansen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hogan</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Rivkin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Meger</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jenkin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dudek</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Visuotactile-rl: learning multimodal manipulation policies with deep reinforcement learning</article-title>,&#x201d; in <source>2022 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>8298</fpage>&#x2013;<lpage>8304</lpage>.</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>TLA: tactile-language-action model for contact-rich manipulation</article-title>. <source>arXiv Prepr. arXiv:2503.08548</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2503.08548</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ichiwara</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ito</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yamamoto</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mori</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ogata</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Contact-rich manipulation of a flexible object based on deep predictive learning using vision and tactility</article-title>,&#x201d; in <source>2022 international conference on robotics and automation (ICRA)</source>, <fpage>5375</fpage>&#x2013;<lpage>5381</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA46639.2022.9811940</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Johannink</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bahl</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nair</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Loskyll</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Residual reinforcement learning for robot control</article-title>,&#x201d; in <source>2019 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>6023</fpage>&#x2013;<lpage>6029</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kelestemur</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Platt</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Padir</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Tactile pose estimation and policy learning for unknown object manipulation</article-title>. <source>arXiv Prepr. arXiv:2203.10685</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2203.10685</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname>
<given-names>D. P.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Adam: a method for stochastic optimization</article-title>. <comment>arXiv preprint arXiv:1412.6980</comment>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Koga</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kerrick</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chitta</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>On CAD informed adaptive robotic assembly</article-title>,&#x201d; in <source>2022 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>10207</fpage>&#x2013;<lpage>10214</lpage>.</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Konda</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Tsitsiklis</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Actor-critic algorithms</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>12</volume>. <pub-id pub-id-type="doi">10.1137/S0363012901385691</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lambeta</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>P.-W.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Maloon</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Most</surname>
<given-names>V. R.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Digit: a novel design for a low-cost compact high-resolution tactile sensor with application to in-hand manipulation</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>5</volume>, <fpage>3838</fpage>&#x2013;<lpage>3845</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2020.2977257</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Platt</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ten Pas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Roscup</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>M. A.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). &#x201c;<article-title>Localization and manipulation of small parts using gelsight tactile sensing</article-title>,&#x201d; in <source>2014 IEEE/RSJ international conference on intelligent robots and systems</source> (<publisher-name>IEEE</publisher-name>), <fpage>3988</fpage>&#x2013;<lpage>3993</lpage>.</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lian</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kelch</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Holz</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Norton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schaal</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Benchmarking off-the-shelf solutions to robotic assembly tasks</article-title>,&#x201d; in <conf-name>2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>1046</fpage>&#x2013;<lpage>1053</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Corcodel</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Generalize by touching: tactile ensemble skill transfer for robotic furniture assembly</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>9227</fpage>&#x2013;<lpage>9233</lpage>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Real-time reconstruction of 3D tactile motion field <italic>via</italic> multi-task learning</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>73</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/tim.2024.3398136</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lozano-P&#xe9;rez</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>1986</year>). &#x201c;<article-title>Motion planning and the design of orienting devices for vibratory part feeders</article-title>,&#x201d; in <source>IEEE journal of robotics and automation</source> (<publisher-name>MIT AI Laboratory</publisher-name>).</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lozano-P&#xe9;rez</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Mason</surname>
<given-names>M. T.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>R. H.</given-names>
</name>
</person-group> (<year>1984</year>). <article-title>Automatic synthesis of fine-motion strategies for robots</article-title>. <source>Int. J. Robotics Res.</source> <volume>3</volume>, <fpage>3</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1177/027836498400300101</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yue</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lepora</surname>
<given-names>N. F.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Dexitac: soft dexterous tactile gripping</article-title>. <source>IEEE/ASME Trans. Mechatronics</source> <volume>30</volume>, <fpage>333</fpage>&#x2013;<lpage>344</lpage>. <pub-id pub-id-type="doi">10.1109/tmech.2024.3384432</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sushkov</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Pevceviciute</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Lian</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Vecerik</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Robust multi-modal policies for industrial assembly <italic>via</italic> reinforcement learning and demonstrations: a large-scale study</article-title>. <comment>arXiv preprint arXiv:2103.11512</comment>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>Y. L.</given-names>
</name>
<name>
<surname>Berg</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). &#x201c;<article-title>Serl: a software suite for sample-efficient robotic reinforcement learning</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>16961</fpage>&#x2013;<lpage>16969</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>McKee</surname>
<given-names>K. E.</given-names>
</name>
</person-group> (<year>1985</year>). <article-title>Automatic assembly by G. Boothroyd, C. Poli and L.E. Murch, Marcel Dekker, New York, 378 pp., 1982 ($45.00)</article-title>, <source>Robotica</source>, <publisher-name>Marcel Dekker</publisher-name>, <publisher-loc>New York</publisher-loc>, <volume>3</volume>, <fpage>195</fpage>&#x2013;<lpage>196</lpage>. <pub-id pub-id-type="doi">10.1017/S0263574700009255</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nair</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dalal</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Accelerating online reinforcement learning with offline datasets</article-title>. <source>arXiv Prepr. arXiv:2006.09359</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2006.09359</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Narang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Storey</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Akinola</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Macklin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Reist</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wawrzyniak</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Factory: fast contact for robotic assembly</article-title>. <comment>arXiv preprint arXiv:2205.03532</comment>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Natarajan</surname>
<given-names>B. K.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>Some paradigms for the automated design of parts feeders</article-title>. <source>Int. J. Robotics Res.</source> <volume>8</volume>, <fpage>98</fpage>&#x2013;<lpage>109</lpage>. <pub-id pub-id-type="doi">10.1177/027836498900800607</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Okumura</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Nishio</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Taniguchi</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Tactile-sensitive newtonianvae for high-accuracy industrial connector insertion</article-title>,&#x201d; in <conf-name>2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>4625</fpage>&#x2013;<lpage>4631</lpage>.</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ozalp</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ucar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Guzelis</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Advancements in deep reinforcement learning and inverse reinforcement learning for robotic manipulation: toward trustworthy, interpretable, and explainable artificial intelligence</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>51840</fpage>&#x2013;<lpage>51858</lpage>. <pub-id pub-id-type="doi">10.1109/access.2024.3385426</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Dalay</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Parkin</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1995</year>). <article-title>Fine motion strategies for robotic peg-hole insertion</article-title>. <source>Proc. Institution Mech. Eng. Part C J. Mech. Eng. Sci.</source> <volume>209</volume>, <fpage>429</fpage>&#x2013;<lpage>448</lpage>. <pub-id pub-id-type="doi">10.1243/pime_proc_1995_209_173_02</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Radosavovic</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kosaraju</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Designing network design spaces</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>10428</fpage>&#x2013;<lpage>10436</lpage>.</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rajeswaran</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Vezzani</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Todorov</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Learning complex dexterous manipulation with deep reinforcement learning and demonstrations</article-title>. <source>arXiv Prepr. arXiv:1709.10087</source>. <pub-id pub-id-type="doi">10.15607/RSS.2018.XIV.049</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Schoettler</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Nair</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bahl</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ojea</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Solowjow</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2020a</year>). &#x201c;<article-title>Deep reinforcement learning for industrial insertion tasks with visual inputs and natural rewards</article-title>,&#x201d; in <source>2020 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>5548</fpage>&#x2013;<lpage>5555</lpage>.</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Schoettler</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Nair</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ojea</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Solowjow</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020b</year>). &#x201c;<article-title>Meta-reinforcement learning for robotic industrial insertion tasks</article-title>,&#x201d; in <source>2020 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>9728</fpage>&#x2013;<lpage>9735</lpage>.</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Spector</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Di Castro</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Insertionnet - a scalable solution for insertion</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>6</volume>, <fpage>5509</fpage>&#x2013;<lpage>5516</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2021.3076971</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Spector</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Tchuiev</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Di Castro</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Insertionnet 2.0: minimal contact multi-step insertion using multimodal multiview sensory input</article-title>,&#x201d; in <source>2022 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>6330</fpage>&#x2013;<lpage>6336</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Taylor</surname>
<given-names>I. H.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rodriguez</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Gelslim 3.0: high-resolution measurement of shape, force and slip in a compact tactile-sensing finger</article-title>,&#x201d; in <source>2022 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>10781</fpage>&#x2013;<lpage>10787</lpage>.</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Elastic tactile simulation towards tactile-visual perception</article-title>,&#x201d; in <source>Proceedings of the 29th ACM international conference on multimedia</source>, <fpage>2690</fpage>&#x2013;<lpage>2698</lpage>.</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lambeta</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>P.-W.</given-names>
</name>
<name>
<surname>Calandra</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Tacto: a fast, flexible, and open-source simulator for high-resolution vision-based tactile sensors</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>7</volume>, <fpage>3930</fpage>&#x2013;<lpage>3937</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2022.3146945</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lian</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bekris</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Schaal</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>You only demonstrate once: category-level manipulation from single visual demonstration</article-title>. <comment>arXiv preprint arXiv:2201.12716</comment>
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bing</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). &#x201c;<article-title>Tacdiffusion: force-domain diffusion policy for precise tactile manipulation</article-title>,&#x201d; in <conf-name>2025 IEEE International Conference on Robotics and Automation (ICRA)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>11831</fpage>&#x2013;<lpage>11837</lpage>.</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yuan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Adelson</surname>
<given-names>E. H.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Gelsight: high-resolution robot tactile sensors for estimating geometry and force</article-title>. <source>Sensors</source> <volume>17</volume>, <fpage>2762</fpage>. <pub-id pub-id-type="doi">10.3390/s17122762</pub-id>
<pub-id pub-id-type="pmid">29186053</pub-id>
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Vtla: vision-tactile-language-action model with preference learning for insertion manipulation</article-title>. <source>arXiv Prepr. arXiv:2505.09577</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2505.09577</pub-id>
</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>T. Z.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sushkov</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Pevceviciute</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Heess</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Scholz</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Offline meta-reinforcement learning for industrial insertion</article-title>,&#x201d; in <source>2022 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>6386</fpage>&#x2013;<lpage>6393</lpage>.</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ruan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Tac-man: tactile-informed prior-free manipulation of articulated objects</article-title>. <source>IEEE Trans. Robotics</source> <volume>41</volume>, <fpage>538</fpage>&#x2013;<lpage>557</lpage>. <pub-id pub-id-type="doi">10.1109/tro.2024.3508134</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>