<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1478016</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2024.1478016</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Robotics and AI</subject>
<subj-group>
<subject>Technology and Code</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>WearMoCap: multimodal pose tracking for ubiquitous robot control using a smartwatch</article-title>
<alt-title alt-title-type="left-running-head">Weigend et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2024.1478016">10.3389/frobt.2024.1478016</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Weigend</surname>
<given-names>Fabian C.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2794196/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kumar</surname>
<given-names>Neelesh</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2755151/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Aran</surname>
<given-names>Oya</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ben Amor</surname>
<given-names>Heni</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2900253/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Interactive Robotics Laboratory</institution>, <institution>School of Computing and Augmented Intelligence (SCAI)</institution>, <institution>Arizona State University (ASU)</institution>, <addr-line>Tempe</addr-line>, <addr-line>AZ</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Corporate Functions-R&#x26;D</institution>, <institution>Procter and Gamble</institution>, <addr-line>Mason</addr-line>, <addr-line>OH</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2647045/overview">Anany Dwivedi</ext-link>, University of Waikato, New Zealand</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/536510/overview">Maria Pozzi</ext-link>, University of Siena, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1342563/overview">Alessandro Carf&#xec;</ext-link>, University of Genoa, Italy</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Fabian C. Weigend, <email>fweigend@asu.edu</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1478016</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>11</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Weigend, Kumar, Aran and Ben Amor.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Weigend, Kumar, Aran and Ben Amor</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>We present WearMoCap, an open-source library to track the human pose from smartwatch sensor data and to leverage pose predictions for ubiquitous robot control. WearMoCap operates in three modes: 1) a Watch Only mode, which uses a smartwatch only, 2) a novel Upper Arm mode, which utilizes the smartphone strapped onto the upper arm, and 3) a Pocket mode, which determines body orientation from a smartphone in any pocket. We evaluate all modes on large-scale datasets consisting of recordings from up to 8 human subjects using a range of consumer-grade devices. Further, we discuss real-robot applications of underlying works and evaluate WearMoCap in handover and teleoperation tasks, resulting in performances that are within 2 cm of the accuracy of the gold-standard motion capture system. Our Upper Arm mode provides the most accurate wrist position estimates with a Root Mean Squared prediction error of 6.79 cm. To evaluate WearMoCap in more scenarios and investigate strategies to mitigate sensor drift, we publish the WearMoCap system with thorough documentation as open source. The system is designed to foster future research in smartwatch-based motion capture for robotics applications where ubiquity matters. <ext-link ext-link-type="uri" xlink:href="http://www.github.com/wearable-motion-capture">www.github.com/wearable-motion-capture</ext-link>.</p>
</abstract>
<kwd-group>
<kwd>motion capture</kwd>
<kwd>human-robot interaction</kwd>
<kwd>teleoperation</kwd>
<kwd>smartwatch</kwd>
<kwd>wearables</kwd>
<kwd>drone control</kwd>
<kwd>IMU motion capture</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Human-Robot Interaction</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Tracking and estimating the human pose is essential for applications in teleoperation (<xref ref-type="bibr" rid="B7">Hauser et al., 2024</xref>), imitation learning (<xref ref-type="bibr" rid="B3">Fu et al., 2024</xref>), and human-robot collaboration (<xref ref-type="bibr" rid="B22">Robinson et al., 2023</xref>). To date, camera-based approaches are the gold standard for capturing human position and motion (<xref ref-type="bibr" rid="B4">Desmarais et al., 2021</xref>; <xref ref-type="bibr" rid="B22">Robinson et al., 2023</xref>). While purely optical motion capture solutions provide a high degree of accuracy, they are also subject to line-of-sight issues, which typically confines their use to controlled environments (<xref ref-type="bibr" rid="B3">Fu et al., 2024</xref>; <xref ref-type="bibr" rid="B2">Darvish et al., 2023</xref>). This requirement of controlled environments is even more prominent in human pose estimation advances in Virtual Reality (VR), and Mixed Reality methods (<xref ref-type="bibr" rid="B28">Walker et al., 2023</xref>), which typically require the user to wear VR headsets, or heavily rely on camera-based tracking.</p>
<p>The most prominent alternatives to optical solutions are based on Inertial Measurement Unit (IMU) sensors (<xref ref-type="bibr" rid="B19">Noh et al., 2024</xref>; <xref ref-type="bibr" rid="B8">Hindle et al., 2021</xref>). These methods employ customized IMU-based solutions (<xref ref-type="bibr" rid="B20">Prayudi and Kim, 2012</xref>; <xref ref-type="bibr" rid="B1">Beange et al., 2018</xref>; <xref ref-type="bibr" rid="B13">Li et al., 2021</xref>) on low-cost wearable embedded systems (<xref ref-type="bibr" rid="B21">Raghavendra et al., 2017</xref>), possibly in fusion with optical methods for enhanced accuracy (<xref ref-type="bibr" rid="B16">Malleson et al., 2017</xref>; <xref ref-type="bibr" rid="B24">Shin et al., 2023</xref>). Unlike optical methods, IMUs do not require a direct line of sight because they are directly attached to the user&#x2019;s body. Commercial IMU motion capture systems incorporate up to 17 IMUs, enabling highly accurate non-optical human pose estimation (<xref ref-type="bibr" rid="B23">Roetenberg et al., 2009</xref>). Configurations with fewer sensors benefit from advances in deep-learning to obtain reliable lower-fidelity human poses (<xref ref-type="bibr" rid="B9">Huang et al., 2018</xref>). However, IMU-based motion capture systems typically require specialized IMU units and calibration procedures, thereby hindering their portability and applicability for inexperienced users (<xref ref-type="bibr" rid="B9">Huang et al., 2018</xref>; <xref ref-type="bibr" rid="B23">Roetenberg et al., 2009</xref>).</p>
<p>With the constantly growing popularity of consumer wearables, IMU-based motion capture from smartwatch and smartphone data offers perhaps the most ubiquitous solution (<xref ref-type="bibr" rid="B12">Lee and Joo, 2024</xref>). The recent IMUPoser (<xref ref-type="bibr" rid="B17">Mollyn et al., 2023</xref>) and SmartPoser (<xref ref-type="bibr" rid="B5">DeVrio et al., 2023</xref>) demonstrate that, even though consumer wearables motion capture may be less accurate than their optical and specialized IMU-based counterparts, these solutions are attractive because users tend to have these devices on them most of the time, enabling pose tracking at any time and anywhere.</p>
<p>Despite these advances in ubiquitous pose tracking, smartwatch applications in robotics often merely utilize roll, pitch, yaw and gesture-based control (<xref ref-type="bibr" rid="B26">Villani et al., 2020a</xref>), or on-body sensors for cognitive stress and alertness (<xref ref-type="bibr" rid="B11">Lee et al., 2015</xref>; <xref ref-type="bibr" rid="B27">Villani et al., 2020b</xref>). We have recently demonstrated the opportunities of motion capture from smartwatches for ubiquitous robot control (<xref ref-type="bibr" rid="B33">Weigend et al., 2023b</xref>; <xref ref-type="bibr" rid="B32">2024</xref>). Under a fixed-body-orientation constraint, we showed that a single smartwatch facilitates teleoperation tasks (<xref ref-type="bibr" rid="B33">Weigend et al., 2023b</xref>). The additional sensor data from a smartphone in the pocket allows for tracking body orientation as well (<xref ref-type="bibr" rid="B32">Weigend et al., 2024</xref>; <xref ref-type="bibr" rid="B31">Weigend et al., 2023a</xref>). To foster future research in ubiquitous motion capture for robotics, in this work, we present WearMoCap&#x2014;a comprehensive wearables-based motion capture system to unify and augment previous approaches in one system. As depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>, WearMoCap has three modes of operation for different levels of precision and portability. Improving on previous works, we benchmark WearMoCap extensively on three large-scale datasets, and show successful demonstration on multiple real-world robotics tasks.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Our WearMoCap system features three modes: <italic>Watch Only</italic> requires a single smartwatch only. For <italic>Upper Arm</italic> we use a common fitness arm strap for the connected smartphone. The <italic>Pocket</italic> mode tracks the arm pose and uses the phone to determine changes in body orientation. We evaluate all modes in real-robot tasks, i.e., teleoperation, intervention <bold>(A)</bold>, handovers <bold>(B)</bold> and drone piloting <bold>(C)</bold>.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g001.tif"/>
</fig>
<p>We publish WearMoCap as an open-source library, together with extensive documentation, as well as all our training and test data. Specifically, our contributions are:<list list-type="simple">
<list-item>
<p>&#x2022; We unify previous and new pose tracking modalities, visualizations, and robot interfaces in one system under the name WearMoCap.</p>
</list-item>
<list-item>
<p>&#x2022; We introduce a more precise <italic>Upper Arm</italic> pose tracking mode using an off-the-shelf fitness strap.</p>
</list-item>
<list-item>
<p>&#x2022; We evaluate each system modality on large-scale datasets from a range of consumer devices, up to 8 human subjects, and by comparing them in real-robot tasks.</p>
</list-item>
</list>
</p>
<p>Overall, we envisage this paper to be a streamlined framework for wearable motion capture with three modes, intended to facilitate data collection and future research into human-robot interaction through smartwatch and smartphone motion capture.</p>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methods</title>
<p>This section introduces the system architecture and operation. <xref ref-type="sec" rid="s2-1">Section 2.1</xref> covers system modules and formalizes the data flow. <xref ref-type="sec" rid="s2-2">Section 2.2</xref> describes calibration procedures, followed by the methodology for each pose prediction mode described in <xref ref-type="sec" rid="s2-3">Section 2.3</xref>. Finally, <xref ref-type="sec" rid="s2-4">Section 2.4</xref> covers additional control modalities that we use for our evaluation on real-robot tasks. Each section defines our contributions and additions to the methodology of previous works.</p>
<sec id="s2-1">
<title>2.1 System overview and architecture</title>
<p>WearMoCap streams sensor data from smartwatches and phones, and computes pose estimates using them for robot control. As depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>, the system operates in three modes: 1) The <italic>Watch Only</italic> mode produces arm pose estimates using the sensor data of a single smartwatch. 2) The <italic>Upper Arm</italic> mode further employs a smartphone strapped to the upper arm. The combined sensor data of watch and phone allow for more precise arm pose estimates. 3) The <italic>Pocket</italic> mode requires the user to wear the watch on their wrist and place the phone in any of their pockets. This allows for tracking both the body orientation and arm pose. While the Watch Only mode is based on <xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref> and the Pocket mode on <xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref>, the Upper Arm mode is introduced by this paper.</p>
<p>WearMoCap unites all three modes in one framework. To ensure that users can deploy and switch between WearMoCap functionalities easily, we developed WearMoCap as a modular system (<xref ref-type="fig" rid="F2">Figure 2</xref>). The system consists of the following components: i) apps to stream sensor data to a remote machine, ii) a pose estimation module to transform received sensor data into poses, iii) a visualization module that renders pose estimates and distributions using a 3D avatar, and iv) an interface to the Robot Operating System (ROS) for robot control. The apps are written in Kotlin and require Wear OS and Android OS. Pose estimation and the ROS interface are written in Python, and the visualization utilizes Unity3D and C&#x23; scripts. The communication between modules is facilitated using UDP messages. The only exceptions are robot control, which uses a ROS topic, and communication from the watch to the phone app, which is realized via Bluetooth.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>A schematic of the data streams between modules. The pipeline from smartwatch to visualization defaults to UDP. The pose estimation module can publish to the Robot Operating System (ROS).</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g002.tif"/>
</fig>
<p>The user initiates the data stream by pressing a button on the watch app. Messages from the watch app, <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, comprise:<disp-formula id="equ1">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,init</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>init</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>with <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>27</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the time since the last message. The timestamp <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:mi mathvariant="bold-italic">t</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> contains the current hour, minute, second and nanosecond. The virtual rotation vector sensor <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> by Android and Wear OS provides a global orientation quaternion <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Angular velocities are provided by the gyroscope <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Additionally, we integrate linear acceleration measurements <inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b1;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> over <inline-formula id="inf9">
<mml:math id="m10">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to obtain velocities <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:mi mathvariant="bold-italic">v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The value <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the atmospheric pressure sensor and the measurements <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b3;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are readings from the gravity sensor. The <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,init</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>init</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are saved orientation and pressure readings from the calibration (<xref ref-type="sec" rid="s2-2">Section 2.2</xref>).</p>
<p>In the Upper Arm and Pocket modes, the watch streams <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to the phone via Bluetooth. The phone then augments received messages with its own sensor data, and forwards the combined message <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to the host machine, where:<disp-formula id="equ2">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p,init</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>with <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>53</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>The pose estimation module receives <inline-formula id="inf18">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf19">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and computes pose estimates. To this end, it calibrates orientation values according to the procedure presented in <xref ref-type="sec" rid="s2-2">Section 2.2</xref> and makes predictions according to the corresponding mode methodology in <xref ref-type="sec" rid="s2-3">Section 2.3</xref>. Then, it outputs a message summarizing the pose <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>est</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as<disp-formula id="equ3">
<mml:math id="m23">
<mml:mrow>
<mml:mstyle stretchy="false">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>est</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ha</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ha</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>hand</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>,</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>lower&#x2009;&#x2009;arm</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>,</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>upper&#x2009;&#x2009;arm</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>,</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>hip</mml:mtext>
</mml:mrow>
</mml:munder>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x22a4;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mstyle>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula> with <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>est</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>25</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, quaternions <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and origin positions <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The pose estimation module can either record <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>est</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to a file, send them to the visualization module, or, publish to a ROS topic for robot control.</p>
<p>The reference frame for all final positions is relative to the hip origin. For estimating joint positions through forward kinematics, we use default arm lengths and shoulder offsets. As shown in <xref ref-type="fig" rid="F3">Figure 3</xref>, the default left shoulder origin relative to the hip was set to X: -17.01 cm, Y: 43.1 cm, Z: -0.67 cm, which was determined as an average from our first three human subjects. Moreover, the default upper arm and lower arm lengths were set to 26 cm and 22 cm, respectively. These settings worked well for all our experiments but developers can easily adjust the defaults in the <monospace>bone_map.py</monospace> script in our repository.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>
<bold>(A)</bold> Possible wrist and elbow positions around the shoulder lie on spheres with the radii of our standard upper arm length 26 cm <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and lower arm length 22 cm <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. <bold>(B)</bold> We provide joint positions relative to the hip origin. If the forward-facing direction is not constrained, all possible shoulder positions lie on a circle. The shoulder offset <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>sh</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the hip is X: -17.01 cm, Y: 43.1 cm, Z: -0.67 cm.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g003.tif"/>
</fig>
<p>A local WiFi connection is sufficient to establish the connections between the devices; there is no requirement for internet connectivity. The device synchronization is maintained as follows: First, the watch sends its data to the phone, along with the associated timestamps. The phone maintains a queue to collect the timestamped data from the watch, and then collects its own sensor data at the fastest rate possible. Once the phone completes the collection of a new array of its sensor values, it processes the data in the queue from the watch. The phone integrates the watch data over time and aligns it with its own data. This way, the final output from the phone contains the most recent phone sensor data along with the integrated watch data, accurately matched to the corresponding time points.</p>
</sec>
<sec id="s2-2">
<title>2.2 Calibration</title>
<p>Motion capture requires a set of transformations to bring body joints and IMUs into the same reference frame. Traditionally, this involves calibration procedures like standing in a T-Pose (<xref ref-type="bibr" rid="B23">Roetenberg et al., 2009</xref>; <xref ref-type="bibr" rid="B17">Mollyn et al., 2023</xref>). We implement a seamless calibration pose for each mode, asking the user to hold a respective pose (as depicted in <xref ref-type="fig" rid="F4">Figure 4</xref>) for one second.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>All modes start with a one-second calibration pose. For Watch Only and Pocket mode the user holds their arm parallel to the hip. In Upper Arm mode the user stretches their arm forward.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g004.tif"/>
</fig>
<p>For the Watch Only and Pocket modes, the user starts streaming with the watch app while holding their lower arm parallel to the chest and hip. The watch verifies this position using the gravity and magnetometer sensors. Then, it records the initial watch orientation sensor reading <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,init</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, such that the pose estimation from then on computes the calibrated orientation as<disp-formula id="equ4">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,cal</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,init</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Further, the watch records the initial atmospheric pressure <inline-formula id="inf29">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>init</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, so that we can compute the relative atmospheric pressure:<disp-formula id="equ5">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>cal</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>init</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The calibration for the phone data operates similarly. In the Pocket mode, the phone orientation <inline-formula id="inf30">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p,cal</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is calibrated in the same way as the watch orientation <inline-formula id="inf31">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,cal</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> because the hip forward direction aligns with the watch forward direction (<xref ref-type="fig" rid="F4">Figure 4</xref> on the right). In the Upper Arm mode, the user stretches their arm forward to put the upper arm into a known position relative to the lower arm and hip (<xref ref-type="fig" rid="F4">Figure 4</xref> in the middle). Calibrating the phone orientation in this position allows aligning <inline-formula id="inf32">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p,cal</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with the upper arm orientation and hence remains unaffected by varying body proportions. <xref ref-type="fig" rid="F4">Figure 4</xref> depicts the result: In the start pose, the calibrated device orientations equate to identity quaternions, i.e., no rotation.</p>
<p>We describe the details of the calibration process along with the average duration for each mode in the following subsection.</p>
<sec id="s2-2-1">
<title>2.2.1 Watch Only</title>
<p>The user has to hold the watch in a calibration pose as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. The watch uses the gravity sensor to assess if it is positioned with its screen parallel to the ground. If the z-value of the gravity sensor is <inline-formula id="inf33">
<mml:math id="m38">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>9.75 <inline-formula id="inf34">
<mml:math id="m39">
<mml:mrow>
<mml:mtext>m</mml:mtext>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (perfect orientation would be the gravity constant 9.81 <inline-formula id="inf35">
<mml:math id="m40">
<mml:mrow>
<mml:mtext>m</mml:mtext>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>), the watch indicates that it is ready to calibrate. The user can then initiate the calibration by triggering the start button. The app collects the watch orientation and atmospheric pressure sensor values for 100 ms and averages them. These measurements serve as the calibration values and future measurements are set relative to this initial average. Therefore, the calibration procedure requires the user to bring the watch into the correct position and collects 100 ms of data. The procedure is typically finished in 1 s.</p>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Upper Arm</title>
<p>For this calibration procedure, the user has to complete two steps. Both are depicted in <xref ref-type="fig" rid="F4">Figure 4</xref>. Step 1 is the same as Watch Only: If the z-value of the gravity sensor is <inline-formula id="inf36">
<mml:math id="m41">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>9.75 <inline-formula id="inf37">
<mml:math id="m42">
<mml:mrow>
<mml:mtext>m</mml:mtext>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, the watch indicates that it is ready to calibrate. Upon button trigger, the app collects 100 ms of orientation measurements and saves the average as the initial pose orientation. Subsequently, the watch vibrates to signal the user to stretch their arm forward. The watch then keeps track of orientation changes. As soon as the <italic>z</italic>-axis of the gravity sensor is <inline-formula id="inf38">
<mml:math id="m43">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>9.75 <inline-formula id="inf39">
<mml:math id="m44">
<mml:mrow>
<mml:mtext>m</mml:mtext>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> again and the global y-orientation changed by more than <inline-formula id="inf40">
<mml:math id="m45">
<mml:mrow>
<mml:mn>80</mml:mn>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, the watch sends a message to the phone. Upon receiving the message, the phone collects its own global orientation for 1,000 ms. The average is the phone orientation calibration and future orientations are estimated relative to the calibration value. Altogether, the user has to stand in two poses and the devices collect data for 1,100 ms. The procedure is typically finished in about 2&#x2013;3 s.</p>
</sec>
<sec id="s2-2-3">
<title>2.2.3 Pocket</title>
<p>The user places the smartphone in their pocket. The user holds the watch in front of their body as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. Once the z-value of the gravity sensor is <inline-formula id="inf41">
<mml:math id="m46">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>9.75 <inline-formula id="inf42">
<mml:math id="m47">
<mml:mrow>
<mml:mtext>m</mml:mtext>
<mml:mo>/</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>s</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, the watch indicates that it is ready to calibrate. The watch collects orientation and pressure for 100 ms, then immediately sends a message to the phone, and the phone records its own orientation for 100 ms. Recorded orientations serve as calibration measures. Typically, this procedure is completed within 2 s.</p>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Pose estimation in motion capture modes</title>
<p>This section outlines the pose estimation methodology for the three motion capture modes. All three modes employ neural network-based approaches with stochastic forward passes to obtain a distribution of solutions (<xref ref-type="bibr" rid="B6">Gal and Ghahramani, 2016</xref>). In <xref ref-type="fig" rid="F5">Figure 5</xref>, possible solutions are depicted as small cubes colored according to their distance from the mean. Wide distributions are indicative of unergonomic arm poses or fast jittering motions.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Stochastic forward passes produce ensembles of possible arm poses. Individual predicted wrist positions are shown as dots, colored based on their distance from the ensemble mean&#x2014;green indicates closer proximity to the mean, while red signifies greater deviation. High variance within the ensemble reflects high uncertainty, which might occur in unergonomic poses or during rapid movements. The true wrist position is indicated as ground truth (GT).</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g005.tif"/>
</fig>
<sec id="s2-3-1">
<title>2.3.1 Watch only</title>
<p>For the Watch Only mode, we employ the derived optimal neural network architecture from <xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref>. An LSTM estimates the lower arm orientation <inline-formula id="inf43">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and upper arm orientation <inline-formula id="inf44">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from a sequence of watch sensor data <inline-formula id="inf45">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with calibrated orientation and pressure. The output message <inline-formula id="inf46">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>est</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> sets the estimated hand orientation <inline-formula id="inf47">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ha</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> equal to <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and subsequently, we derive positional values through forward kinematics by assuming an approximate lower arm length of 22 cm and upper arm length of 26 cm. The Watch Only mode requires a constant forward-facing direction, i.e., the hip orientation estimate <inline-formula id="inf49">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is constant and arm pose tracking is stable as long as the user does not change their forward-facing direction after calibration. While the general inputs and targets are the same as in <xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref>, we use slightly altered hyperparameters: Our LSTM has 2 hidden layers with 256 neurons each and we use a sequence length of 12.</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Upper arm</title>
<p>The previous Watch Only mode infers the upper arm orientation from the smartwatch sensor data only. This is sparse data for arm pose predictions. Therefore, we now introduce the additional Upper Arm mode, which facilitates more sensor data to infer the entire arm pose by placing the smartphone directly on the upper arm. As described earlier, the user can use an off-the-shelf fitness strap. We use an LSTM to predict <inline-formula id="inf50">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf51">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the last four combined watch and phone sensor data <inline-formula id="inf52">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> readings. Similar to the Watch Only mode, we estimate positions through forward kinematics with default arm lengths of 22 cm for the lower arm and upper arm length of 26 cm. We determined our hyperparameters through grid search. The best result was achieved with three LSTM layers of 128 neurons applying a dropout of 0.2 on the last one. Further, a sequence length of 4, batch size of 32 and learning rate of 0.0015 led to the best results. Our loss function was the L1 loss and we used the Adam optimizer.</p>
<p>With this mode, after calibration, the user is free to turn around. However, this mode does not provide body-orientation estimates, which means the lower and upper arm orientations <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> capture the correct arm pose in any forward-facing direction but the hip orientation estimate <inline-formula id="inf55">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is constant.</p>
</sec>
<sec id="s2-3-3">
<title>2.3.3 Pocket</title>
<p>This mode is based on <xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref> and uses a Differentiable Ensemble Kalman Filter to update an ensemble of states from previous estimates and the watch and phone sensor data <inline-formula id="inf56">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Each ensemble member describes the orientation of the lower arm <inline-formula id="inf57">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, upper arm <inline-formula id="inf58">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the rotation around the up-axis of the hip <inline-formula id="inf59">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. This allows us to compile the pose estimation <inline-formula id="inf60">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>est</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and determine joint positions <inline-formula id="inf61">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ha</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf62">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf63">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> through forward kinematics. We retained the hyperparameter settings of <xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref> but trained the filter anew on the larger dataset that we compiled for this work.</p>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 Additional control modalities</title>
<p>For teleoperation tasks that involve advanced gripper control (see <xref ref-type="sec" rid="s3-4">Section 3.4</xref>), we stream microphone data to issue voice commands. This is done by transcribing the recorded audio signal into voice commands utilizing the Google Cloud speech-to-text service<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>. We also implement two positional control modalities (A and B in <xref ref-type="fig" rid="F7">Figure 7</xref>). Voice commands were used in our previous works (<xref ref-type="bibr" rid="B33">Weigend et al., 2023b</xref>; <xref ref-type="bibr" rid="B32">2024</xref>) and Modality A was utilized in <xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref>, while Modality B was proposed in <xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref>. Typically, users expect to control the robot with their hand position. Therefore, both of our control modalities translate wrist/hand positions into control commands, e. g., end-effector positions.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>We use two control modalities to determine end-effector positions. Modality <bold>(A)</bold> leverages forward kinematics with default arm lengths to return the wrist origin relative to the hip. Modality <bold>(B)</bold> estimates the wrist origin projected onto the sagittal plane.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g007.tif"/>
</fig>
<p>With Modality A, we determine the wrist position relative to the hip origin. This is then directly translated to the end-effector position relative to its base. Modality B requires the dynamic hip orientation estimates <inline-formula id="inf64">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in Pocket mode. Here, the local forward direction (Z) aligns with the sagittal plane (red) given by the current hip orientation. The projected wrist coordinates define the end-effector position on that plane.</p>
<p>The main difference between Modality A and B is the reduction in interacting degrees of freedom to reduce potential compounding errors. With Modality A, the end-effector X-position is determined by the complete kinematic chain <inline-formula id="inf65">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf66">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and then <inline-formula id="inf67">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. In contrast, Modality B determines the target X-position through the hip orientation <inline-formula id="inf68">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and then the projected distance and elevation of the wrist. This reduces potential compounding errors but makes it more difficult to adjust the X-position without affecting Y and Z-positions. Therefore, Modality B is more suitable for circular control motions with the user at the center. On the other hand, Modality A is more suitable for situations where the user has a more constant forward facing direction. The evaluation of both control modalities on real-robot tasks is discussed in <xref ref-type="sec" rid="s3-4">Section 3.4</xref>.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>We evaluate the performance of WearMoCap in real-robot tasks and on large-scale datasets from multiple studies and across multiple devices (smartwatches and smartphones). The first <xref ref-type="sec" rid="s3-1">Section 3.1</xref> covers the composition of our training and test datasets. <xref ref-type="sec" rid="s3-2">Section 3.2</xref> details prediction performance on our test datasets and compares it to related work, followed by <xref ref-type="sec" rid="s3-4">Section 3.4</xref>, which describes the evaluation on four real-robot tasks and concludes by summarizing results and limitations.</p>
<sec id="s3-1">
<title>3.1 Composition of datasets</title>
<p>We composed a large-scale dataset by merging datasets collected from previous studies (<xref ref-type="bibr" rid="B33">Weigend et al., 2023b</xref>; <xref ref-type="bibr" rid="B32">2024</xref>), and augmenting them with data collected for this study. We employed the following devices for data collection: smartwatches&#x2014;Fossil Gen 6 Men&#x2019;s, and Samsung Galaxy Watch 5 40 mm version (RM900) and 45 mm version (RM910); smartphones&#x2014;OnePlus N100, TCL 40XL and Samsung Galaxy A23G. Out of these, only Samsung Galaxy A23G and Samsung Galaxy Watch 5 were used in the datasets from previous studies (<xref ref-type="bibr" rid="B33">Weigend et al., 2023b</xref>; <xref ref-type="bibr" rid="B32">Weigend et al., 2024</xref>). The rest are new to this study. The OS version on the Samsung Watches was WearOS 4, which is based on Android 13. The Fossil Gen 6 had WearOS 3 based on Android 11. The sampling frequency of newer phones such as Samsung A23 is 90 Hz, while phones such as OnePlus N100 transmit data at 60 Hz sampling frequency. Since our model input includes delta time, the model is able to account for fluctuations and differences in frequency. For all previous and new datasets, the ground truth was obtained with the optical motion capture system OptiTrack (<xref ref-type="bibr" rid="B18">Nagym&#xe1;t&#xe9; and Kiss, 2018</xref>). The OptiTrack motion capture environment featured 12 cameras, which were calibrated before data collection. Human subjects wore a 25-marker-upper-body suit along with the smartwatch on their left wrist and phone on upper arm or in pocket. We collected lower arm, upper arm, and hip orientations with time stamps. The system recorded poses at 120 Hz. In post processing, we matched WearMoCap data with the OptiTrack pose closest in time. All human subjects (8 Males; Mean age: 25 <inline-formula id="inf69">
<mml:math id="m74">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3) provided written informed consent approved by the institutional review board (IRB) of ASU under the ID STUDY00017558. The recruitment criteria for the subjects were as outlined in the IRB: English-speaking adults between the ages of 18 and 70 with no current physical impairments that affect arm or body movements.</p>
<p>To collect data for the Watch Only mode, we asked subjects to perform single-arm movements under a constant forward-facing constraint. We combined this data with data from <xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref>, which resulted in a dataset with 0.6 M observations. Here, each observation refers to a collected data row.</p>
<p>For the Upper Arm mode, we asked 5 subjects to perform similar movements as above, but with a phone strapped on to their upper arm. For the Upper Arm mode, we did not enforce a constant forward direction. Additionally, subjects were encouraged to occasionally perform teleoperation-typical motions, such as moving the wrist slowly in a straight line. We showed demonstrations of writing English letters on an imaginary plane as examples of such motions. However, subjects were not strictly instructed to perform these movements and some chose not to or forgot. Therefore, not all recordings contained these teleoperation-typical movements. This resulted in a dataset with 0.4 M observations.</p>
<p>For the Pocket mode, subjects had to keep a smartphone in any of their pockets. For data collection, subjects were free to move their arm in any direction and without the forward-facing constraint. Further, the pose estimation in Pocket mode only requires the orientation sensor data <inline-formula id="inf70">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the phone (<xref ref-type="bibr" rid="B32">Weigend et al., 2024</xref>). This allowed us to retrospectively simulate phone-in-pocket data for collected Watch Only and Upper Arm data using the ground truth hip orientation <inline-formula id="inf71">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as an approximate calibrated phone orientation. All data combined compiled a dataset of 0.9 M observations.</p>
<p>Both the Upper Arm and Pocket modes do not restrict body orientation, which allowed us to augment the data. This was done by rotating <inline-formula id="inf72">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>la</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>ua</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>hi</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as well as <inline-formula id="inf73">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,cal</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf74">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>p,cal</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> around the global <italic>Y</italic>-axis. The global rotation is possible because all other sensor readings in <inline-formula id="inf75">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>w,p</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are in the local device reference frame and, therefore, unaffected by changes in global Y-axis-rotation. We augment the data for the Upper Arm and Pocket modes two times by rotating around a random Y-angle. The dataset composition details for each mode are summarized in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Compiled dataset attributes for each WearMoCap mode.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="right">Mode</th>
<th align="center">Data</th>
<th align="center">Augm.</th>
<th align="center">&#x23;Subj.</th>
<th align="center">Devices</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Watch Only</td>
<td align="center">0.6 M</td>
<td align="center">-</td>
<td align="center">7</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">Upper Arm</td>
<td align="center">0.4 M</td>
<td align="center">1.2 M</td>
<td align="center">5</td>
<td align="center">
<inline-formula id="inf76">
<mml:math id="m81">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Pocket</td>
<td align="center">0.9 M</td>
<td align="center">2.6 M</td>
<td align="center">8</td>
<td align="center">
<inline-formula id="inf77">
<mml:math id="m82">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The column Augm. indicates the dataset volume post augmentation, &#x23;Subj. indicates the number of subjects data was collected from, and Devices indicates the number of distinct devices data was collected with. <inline-formula id="inf78">
<mml:math id="m83">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> stands for three smartwatches and three smartphones.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>For all the datasets, we provided the subjects with verbal instructions and brief demonstrations of motions that covered the position space well, and asked the subjects to perform them. We confirmed the variability of their motions by inspecting the 3D plots of their movement trajectories, which revealed that the data covers the position space. An example overview of all participants&#x2019; combined wrist positions is depicted in <xref ref-type="fig" rid="F8">Figure 8</xref>. Our training and test data includes recording sessions of up to 10 min duration. The mean duration and other statistics such as number of sessions, average number of observations, etc. can be found in <xref ref-type="table" rid="T2">Table 2</xref>. Five of the subjects that were used to collect data in previous studies <xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref>, <xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref> were used again to collect new data in this study.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Using wrist positions as an example, this figure shows that our collected data covers the space of possible arm poses (position space). Dots are data points representing wrist positions relative to the hip obtained from the motion capture system, which also show differing arm lengths. Data points are colored according to the sum of their coordinate magnitudes. Left: Data points collected under the fixed forward-facing constraint. Right: Wrist positions collected without the fixed body orientation constraint form a full sphere.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g008.tif"/>
</fig>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Statistics of dataset incorporated from previous works, and additional data collected in this work.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Data source</th>
<th align="center">Mean time (s)</th>
<th align="center">Sum time (s)</th>
<th align="center">Mean &#x23; obs.</th>
<th align="center">Sum &#x23; obs.</th>
<th align="center">&#x23; Sessions</th>
<th align="center">&#x23; Subj.</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<xref ref-type="bibr" rid="B33">Weigend et al. (2023b)</xref>
</td>
<td align="center">
<inline-formula id="inf79">
<mml:math id="m84">
<mml:mrow>
<mml:mn>227</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>47</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">3,855</td>
<td align="center">
<inline-formula id="inf80">
<mml:math id="m85">
<mml:mrow>
<mml:mn>17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> k</td>
<td align="center">287 k</td>
<td align="center">17</td>
<td align="center">6</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref>&#x2a;</td>
<td align="center">
<inline-formula id="inf81">
<mml:math id="m86">
<mml:mrow>
<mml:mn>500</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">5,501</td>
<td align="center">
<inline-formula id="inf82">
<mml:math id="m87">
<mml:mrow>
<mml:mn>17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> k</td>
<td align="center">185 k</td>
<td align="center">11</td>
<td align="center">4</td>
</tr>
<tr>
<td align="left">(New) Cnst. body orient.&#x2a;&#x2a;</td>
<td align="center">
<inline-formula id="inf83">
<mml:math id="m88">
<mml:mrow>
<mml:mn>409</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>116</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">5,323</td>
<td align="center">
<inline-formula id="inf84">
<mml:math id="m89">
<mml:mrow>
<mml:mn>24</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> k</td>
<td align="center">305 k</td>
<td align="center">13</td>
<td align="center">5</td>
</tr>
<tr>
<td align="left">(New) Free body orient.&#x2a;&#x2a;&#x2a;</td>
<td align="center">
<inline-formula id="inf85">
<mml:math id="m90">
<mml:mrow>
<mml:mn>378</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>76</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1,515</td>
<td align="center">
<inline-formula id="inf86">
<mml:math id="m91">
<mml:mrow>
<mml:mn>26</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> k</td>
<td align="center">103 k</td>
<td align="center">4</td>
<td align="center">3</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The first two rows represent previous studies. The bottom two rows represent new data collected in this study where subjects were asked to perform movements with constant forward-facing body orientation (Cnst. body orient.) and with free body orientation (Free body orient.). The asterisks indicate the modes for which the data was utilized (&#x2a; Pocket Mode only; &#x2a;&#x2a; All modes; &#x2a;&#x2a;&#x2a; Upper arm and Pocket Mode).</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-2">
<title>3.2 Model accuracy</title>
<p>We employed our dataset to assess WearMoCap performance in two ways: all-subjects validation and leave-one-out validation. For the all-subjects validation, we utilized <inline-formula id="inf87">
<mml:math id="m92">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of each subject&#x2019;s data for training, reserving the remaining portion for testing. We train five models with randomly initialized weights and report the average error. We consider these results to be indicative of performance within controlled settings where the model can be fine-tuned on a known population. In contrast, the leave-one-out validation involves a cross-validation approach, where we systematically reserved all the data from one subject at a time for testing while training the model on the data from the remaining subjects. The leave-one-out performance measures the ability of the model to generalize to new subjects and is, hence more suitable to assess performance in real-world applications. Our results are summarized in <xref ref-type="fig" rid="F6">Figure 6</xref> and in <xref ref-type="table" rid="T3">Table 3</xref> we compare against the state-of-the-art baseline methods wherever applicable.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Model performance for each WearMoCap mode and comparison to baselines.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Watch Only baseline</th>
<th rowspan="2" align="center">Evaluation</th>
<th rowspan="2" align="center">Metric</th>
<th colspan="2" align="center">Wrist (cm)</th>
<th colspan="2" align="center">Elbow (cm)</th>
<th align="center">Hip <inline-formula id="inf88">
<mml:math id="m93">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th align="center">Theirs</th>
<th align="center">Ours</th>
<th align="center">Theirs</th>
<th align="center">Ours</th>
<th align="center">Ours</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">
<xref ref-type="bibr" rid="B14">Liu et al. (2023)</xref>
</td>
<td align="center">All</td>
<td align="center">MAE</td>
<td align="center">10.93</td>
<td align="center">
<inline-formula id="inf89">
<mml:math id="m94">
<mml:mrow>
<mml:mn>10.82</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.04</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">-</td>
<td align="center">
<inline-formula id="inf90">
<mml:math id="m95">
<mml:mrow>
<mml:mn>9.45</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.08</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>.A</td>
<td align="center">1out</td>
<td align="center">MAE</td>
<td align="center">8.5</td>
<td align="center">
<inline-formula id="inf91">
<mml:math id="m96">
<mml:mrow>
<mml:mn>12.17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.03</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">8.5</td>
<td align="center">
<inline-formula id="inf92">
<mml:math id="m97">
<mml:mrow>
<mml:mn>10.09</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.73</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center">
<xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>.B</td>
<td align="center">1out</td>
<td align="center">MAE</td>
<td align="center">15</td>
<td align="center">
<inline-formula id="inf93">
<mml:math id="m98">
<mml:mrow>
<mml:mn>12.17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.03</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">11.5</td>
<td align="center">
<inline-formula id="inf94">
<mml:math id="m99">
<mml:mrow>
<mml:mn>10.09</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.73</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">-</td>
</tr>
<tr>
<td colspan="8" align="left">Upper Arm</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B10">Joukov et al. (2017)</xref>
</td>
<td align="center">All</td>
<td align="center">RMSE</td>
<td align="center">
<inline-formula id="inf95">
<mml:math id="m100">
<mml:mrow>
<mml:mn>6.9</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2.7</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf96">
<mml:math id="m101">
<mml:mrow>
<mml:mn>6.79</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.57</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf97">
<mml:math id="m102">
<mml:mrow>
<mml:mn>5.2</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2.6</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf98">
<mml:math id="m103">
<mml:mrow>
<mml:mn>4.24</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.31</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">-</td>
</tr>
<tr>
<td colspan="8" align="left">Pocket</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B5">DeVrio et al. (2023)</xref>
</td>
<td align="center">1out</td>
<td align="center">MAE</td>
<td align="center">
<inline-formula id="inf99">
<mml:math id="m104">
<mml:mrow>
<mml:mn>15.1</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.42</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf100">
<mml:math id="m105">
<mml:mrow>
<mml:mn>11.4</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.87</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf101">
<mml:math id="m106">
<mml:mrow>
<mml:mn>10.0</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.9</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf102">
<mml:math id="m107">
<mml:mrow>
<mml:mn>10.01</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.81</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf103">
<mml:math id="m108">
<mml:mrow>
<mml:mn>4.17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The type of evaluation chosen by each baseline is characterized by the Evaluation and Metric columns. Abbreviations stand for: trained on data from all subjects (All), leave-one-out (1out), Mean Absolute Error (MAE), and Root Mean Squared Error (RMSE). We reported standard deviations where available.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<sec id="s3-2-1">
<title>3.2.1 Watch only</title>
<p>As depicted in <xref ref-type="fig" rid="F6">Figure 6</xref> (Left), we trained seven distinct models for the Watch Only leave-one-out validation corresponding to seven different subjects. On average, the predicted wrist positions deviated by <inline-formula id="inf104">
<mml:math id="m109">
<mml:mrow>
<mml:mn>12.17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.03</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm and elbow positions by <inline-formula id="inf105">
<mml:math id="m110">
<mml:mrow>
<mml:mn>10.09</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.73</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm. In the all-subjects validation, our model achieved slightly better prediction errors with <inline-formula id="inf106">
<mml:math id="m111">
<mml:mrow>
<mml:mn>10.82</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.04</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm for wrist and <inline-formula id="inf107">
<mml:math id="m112">
<mml:mrow>
<mml:mn>9.45</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.08</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm for elbow positions. In <xref ref-type="table" rid="T3">Table 3</xref>, we show that these results do not deviate strongly from the works of <xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>; <xref ref-type="bibr" rid="B14">Liu et al. (2023)</xref>, which also estimated the arm pose from a single smartwatch on the wrist. The authors of <xref ref-type="bibr" rid="B14">Liu et al. (2023)</xref> evaluated their method using data from all subjects in the training and test set. Their method is able to estimate the wrist position in any forward-facing direction; however, they require inference in the same environment where the training data was collected. In our work, we enforce a constant forward-facing direction but allow for inference to be performed anywhere. The authors of <xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref> evaluated their method using leave-one-out validation against two ground truth measures&#x2013;the first using two IMUs (denoted as <xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>. A in <xref ref-type="table" rid="T3">Table 3</xref>) and the second from a Kinect sensor (<xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>. B). Their approach, akin to our Watch Only Mode, necessitates users to maintain a constant forward-facing direction. Our leave-one-out prediction error falls between the reported errors of <xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>. A and <xref ref-type="bibr" rid="B30">Wei et al. (2021)</xref>.B.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Euclidean Mean Absolute Error (MAE) of wrist and elbow position estimates in leave-one-subject-out cross validations. Specifically, we trained on data from all the subjects except the Nth subject, and tested on the Nth subject.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g006.tif"/>
</fig>
</sec>
<sec id="s3-2-2">
<title>3.2.2 Upper arm</title>
<p>Similar to our Upper Arm mode, <xref ref-type="bibr" rid="B10">Joukov et al. (2017)</xref> proposes the use of one IMU on the lower arm and the second on the upper arm. Their evaluation is based on all-subjects validation and uses RMSE as the performance measure. <xref ref-type="table" rid="T3">Table 3</xref> shows that our errors of <inline-formula id="inf108">
<mml:math id="m113">
<mml:mrow>
<mml:mn>6.79</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.57</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm for wrist and <inline-formula id="inf109">
<mml:math id="m114">
<mml:mrow>
<mml:mn>4.24</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.31</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm for elbow positions are similar to those reported by <xref ref-type="bibr" rid="B10">Joukov et al. (2017)</xref>, despite our mode being evaluated across multiple commercial devices and a wider range of motions. <xref ref-type="fig" rid="F6">Figure 6</xref> summarizes our leave-one-out validation results, where prediction errors were slightly higher with MAEs of <inline-formula id="inf110">
<mml:math id="m115">
<mml:mrow>
<mml:mn>7.93</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.68</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm for wrist and <inline-formula id="inf111">
<mml:math id="m116">
<mml:mrow>
<mml:mn>6.23</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.28</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm for elbow positions.</p>
</sec>
<sec id="s3-2-3">
<title>3.2.3 Pocket</title>
<p>Similar to our Pocket mode, the authors of <xref ref-type="bibr" rid="B5">DeVrio et al. (2023)</xref> also leveraged data from a smartwatch and additional sensor data from a smartphone placed in the pocket. The authors conducted a leave-one-out evaluation. A comparison of WearMoCap to their reported results is shown in <xref ref-type="table" rid="T3">Table 3</xref>, and the results here are also comparable. With an average wrist error of <inline-formula id="inf112">
<mml:math id="m117">
<mml:mrow>
<mml:mn>11.4</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.87</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm, WearMoCap appears to be more accurate for the wrist on our data, but marginally less accurate for the elbow with an error of <inline-formula id="inf113">
<mml:math id="m118">
<mml:mrow>
<mml:mn>10.01</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.81</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm. Further, our method provides an additional hip orientation estimate with an average error of <inline-formula id="inf114">
<mml:math id="m119">
<mml:mrow>
<mml:mn>4.17</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.5</mml:mn>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>All discussed methods are real-time capable. Our most computationally demanding mode is the Pocket mode, which achieves inference speeds of <inline-formula id="inf115">
<mml:math id="m120">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>62 Hz on a system equipped with an Intel<sup>&#xae;</sup> Xeon(R) W-2125 CPU and NVIDIA GeForce RTX 2080 Ti.</p>
</sec>
</sec>
<sec id="s3-3">
<title>3.3 Sensitivity analysis</title>
<p>To determine the relative importance of each input feature to our models, we conducted a sensitivity analysis where we left each sensor out, one at a time, in the Watch-Only mode. We noted the effect on model performance for the prediction of Hand and Elbow positions in <xref ref-type="table" rid="T4">Table 4</xref>. The results show that leaving out the global orientation harms the performance the most, followed by gyroscope and accelerometer. While leaving out the atmospheric pressure sensor did not affect the accuracy significantly, we retained the sensor in our data.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Model performance after removing individual sensors for sensitivity analysis.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="right">Prediction</th>
<th align="center">All</th>
<th align="center">No gyro</th>
<th align="center">No acc (vel, grav)</th>
<th align="center">No orientation</th>
<th align="center">No pressure</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="right">Hand</td>
<td align="center">10.82<inline-formula id="inf116">
<mml:math id="m121">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.04</td>
<td align="center">11.06<inline-formula id="inf117">
<mml:math id="m122">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.13</td>
<td align="center">11.30<inline-formula id="inf118">
<mml:math id="m123">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.10</td>
<td align="center">19.26<inline-formula id="inf119">
<mml:math id="m124">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.13</td>
<td align="center">10.76<inline-formula id="inf120">
<mml:math id="m125">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.09</td>
</tr>
<tr>
<td align="right">Elbow</td>
<td align="center">9.19<inline-formula id="inf121">
<mml:math id="m126">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.08</td>
<td align="center">9.45<inline-formula id="inf122">
<mml:math id="m127">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.08</td>
<td align="center">9.53<inline-formula id="inf123">
<mml:math id="m128">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.07</td>
<td align="center">12.41<inline-formula id="inf124">
<mml:math id="m129">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.06</td>
<td align="center">9.17<inline-formula id="inf125">
<mml:math id="m130">
<mml:mrow>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>0.06</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>For every condition, we trained 5 networks with randomly initialized weights, utilizing 75% of the data of every participant for training and 25% for testing. All numbers are in cm and are averaged over the 5 random networks. Results are shown for Watch Only mode.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-4">
<title>3.4 Real-robot tasks</title>
<p>To assess the practical use of WearMoCap in robotics, we evaluate its application in four human-robot experiments, namely, Handover, Intervention, Teleoperation, and Drone Piloting tasks. The Handover and Intervention tasks were conducted for this work under the ASU IRB ID STUDY00018521. The Teleoperation and Drone Piloting tasks were conducted in <xref ref-type="bibr" rid="B32">Weigend et al. (2024)</xref> under the ASU IRB ID STUDY00018450. We picked these tasks such that our evaluation covers the three WearMoCap pose tracking modes Watch Only, Upper Arm, Pocket and control Modalities A and B with at least two experiments each. <xref ref-type="sec" rid="s3-4-5">Section 3.4.5</xref> discusses the results and compares them to the user performance with the OptiTrack system where possible. OptiTrack provides sub-millimeter accurate tracking and is therefore utilized as our state-of-the-art baseline (<xref ref-type="bibr" rid="B18">Nagym&#xe1;t&#xe9; and Kiss, 2018</xref>; <xref ref-type="bibr" rid="B25">Topley and Richards, 2020</xref>). All human subjects (9 Males; 1 Female; Mean age: 25 <inline-formula id="inf126">
<mml:math id="m131">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 3) provided written consent. 4 human subjects performed all the robotic tasks, 1 subject performed teleoperation and drone tasks, 1 subject performed drone and intervention tasks, and the remaining performed only the drone task. While one subject had prior experience with drone piloting, none of the other subjects had any prior experience with any robotic tasks.</p>
<sec id="s3-4-1">
<title>3.4.1 Handover</title>
<p>In the Handover Task, an arm robot picks up an object from the table and hands it over to a human subject at a given location. Subjects sat on a rotating chair at a fixed location in front of a Universal Robot 5 (UR5). To do this task successfully, the robot must correctly track the human hand position. To this end, we provide the robot with the relative chair position, approximate sitting height, and arm lengths, such that it can estimate handover positions relative to its base.</p>
<p>As depicted in Step 1 on the left of <xref ref-type="fig" rid="F9">Figure 9</xref>, the tabletop area between the robot and the subject was divided into three areas. We ask subjects to perform handovers in each of these areas to ensure a range of diverse poses. With the subject&#x2019;s hand in one of these areas, the subject performed two handovers&#x2014;once with the hand at a low height and once with the hand at a higher height. The subjects then repeated this task for all the other areas at random. The subject&#x2019;s orientation was fixed for Watch Only mode, but for the other two modes, they could change their orientation by rotating the chair.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Handover Task: Human subjects used WearMoCap to perform handovers with a UR5. A voice command completed the handover and the robot let go of the cube. Intervention Task: The UR5 sorted the green and red cubes into the bin. Human subjects use WearMoCap to interrupt the robot and to place the green cube at a target location.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g009.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F9">Figure 9</xref> summarizes the steps for each handover task. From the initial setup (Step 1), the subject raised their arm in one of the three locations at random (Step 2). Then, the robot picked up the green cube (Step 3). Given the known chair position and subject&#x2019;s sitting height, we tracked the hand position of the subject using WearMoCap. The robot moved the cube toward the tracked hand position (Step 4). The subject then issued a voice command (Step 5) after which the robot released the cube (Step 6). Depending upon the accuracy of hand tracking, the subject had to move their hand by a certain &#x201c;handover distance&#x201d; to grab the cube.</p>
<p>Four human subjects performed 24 tasks each, comprising six handovers with Watch Only, Upper Arm, Pocket modes and with OptiTrack. We randomized the order of tracking modes to eliminate potential biases or learning effects. We computed the handover distance, which is the difference between the hand position and the cube at the time the participant triggered the voice command (Step 5). To compute the handover distance, we located the center of the user&#x2019;s wrist and the center of the cube using OptiTrack markers on both. Then we took the Euclidean difference between the two. We also computed the handover time, which is the time that it takes for the robot to move toward the hand and complete the handover task (from Step 2 to Step 5).</p>
</sec>
<sec id="s3-4-2">
<title>3.4.2 Intervention</title>
<p>In the Intervention Task, the human subject interrupts the robot during its routine when it makes a mistake, and performs corrective action. For this task, a UR5 robot was supposed to autonomously pick up a colored cube (green or red) and drop it at target locations of the same color. However, the robot was not trained to correctly place green cubes. As depicted in Step 1 on the right of <xref ref-type="fig" rid="F9">Figure 9</xref>, the human subject stood in front of the robot and there were three possible target locations for the green cube. Whenever the robot picked up a green cube, the subject stopped the robot with a voice command and made it place the cube at the correct location.</p>
<p>
<xref ref-type="fig" rid="F9">Figure 9</xref> summarizes the steps. The subject watched the robot (Step 2) and stopped it with a voice command from dropping a green cube at the red location (Step 3). Then, the subject instructed the robot to mirror their arm motion, i.e., move the robot end-effector in the same way as the subject&#x2019;s wrist movement (Step 4). The WearMoCap algorithm, in conjunction with control Modality A (<xref ref-type="fig" rid="F7">Figure 7</xref>), tracked the hand position and converted it into end-effector coordinates to control the robot (Step 5). The subject then issued another voice command (&#x201c;Open the gripper&#x201d;) to complete placing the cube at the correct location (Step 6).</p>
<p>Five subjects performed this task for each of the three green target locations and with each WearMoCap mode at random. The performance was evaluated with respect to the placement distance, which is the distance between the position of the placed green cube and the center of the target location. This was measured using OptiTrack. We also computed the task completion time, which is the time that elapsed between issuing the &#x201c;Follow me&#x201d; command and the &#x201c;Open the gripper&#x201d; command.</p>
</sec>
<sec id="s3-4-3">
<title>3.4.3 Teleoperation</title>
<p>As depicted on the left in <xref ref-type="fig" rid="F10">Figure 10</xref>, subjects controlled a UR5 to pick and place cubes from a remote location through a live camera feed on their smartphone. This was done as follows: the subject initiated the task with a &#x201c;Follow me&#x201d; voice command, which started the hand tracking. The subject maneuvered the robot end-effector toward the cube to be picked up. The subject then issued a &#x201c;Close&#x201d; voice command to grab the cube. Then, the subject maneuvered the robot end-effector to the target location and dropped the cube with &#x201c;Open&#x201d; voice command. We employed the Pocket mode of WearMoCap, in conjunction with control Modality B (<xref ref-type="fig" rid="F7">Figure 7</xref>), to estimate the end-effector position for robot control. This combination allowed the subject to control the robot through changes in their body orientation, i.e., the robot turned left (right) whenever the subject turned left (right).</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Teleoperation Task: Human subjects used WearMoCap to pick and place a cube. They were entirely removed from the robot and only watched the end-effector through a camera feed on the phone. Drone Piloting Task: Human subjects piloted a drone to three target locations in random order. Each location was marked with an AprilTag. If a tag was recognized through the drone camera feed the target was reached.</p>
</caption>
<graphic xlink:href="frobt-11-1478016-g010.tif"/>
</fig>
<p>This task was performed by five subjects for six different configurations of pick-up and target locations of the cube. For one instance using OptiTrack and two instances using WearMoCap, the task execution failed because the subject knocked over the cube. For all successful completions of the tasks, we computed the placement accuracy, which is the distance between the placed cube and the target location, as measured by OptiTrack. We also computed the task completion time, which is the time elapsed between issuing the &#x201c;Follow me&#x201d; and &#x201c;Open&#x201d; commands.</p>
</sec>
<sec id="s3-4-4">
<title>3.4.4 Drone piloting</title>
<p>In this task, subjects used motion capture to fly a commercial Parrot Bebop 2 drone to three target locations. Drone control via traditional remotes is hard to master while control through body motions can be more intuitive for inexperienced pilots <xref ref-type="bibr" rid="B15">Macchini et al. (2020)</xref>. As shown on the right in <xref ref-type="fig" rid="F10">Figure 10</xref>, with the subject at the center of a field, the three target locations were at distances of 4 m, 5 m, and 6 m in three directions. The targets were colored cardboard sheets with AprilTags (<xref ref-type="bibr" rid="B29">Wang and Olson, 2016</xref>). Subjects were instructed to fly the drone above these targets in a randomized order. A target was considered to be reached when its corresponding AprilTag ID was recognized through the downward-facing drone camera. The subjects controlled the drone with WearMoCap in Pocket mode, utilizing control Modality B. The drone used GPS and internal IMUs to follow the control commands in a stable trajectory.</p>
<p>Ten human subjects performed the task two times each: first, with WearMoCap and then with the original remote called SkyController. The performance was measured using drone piloting time, which is the time the drone took from reaching the first target until reaching the third target.</p>
</sec>
<sec id="s3-4-5">
<title>3.4.5 Results summary</title>
<p>We summarize the objective task metrics in <xref ref-type="table" rid="T5">Table 5</xref>. For each task, we compared the performance of WearMoCap against the baseline control methods.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Summarized robot tasks results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Task</th>
<th align="right">Method</th>
<th align="center">Dist. (cm)</th>
<th align="center">Time (s)</th>
<th align="center">Trials</th>
<th align="center">Modality</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="4" align="center">Handover</td>
<td align="center">OptiTrack</td>
<td align="center">6.8 &#xb1; 1.6</td>
<td align="center">9.2 &#xb1; 3.2</td>
<td align="center">24</td>
<td align="center">A</td>
</tr>
<tr>
<td align="center">Watch Only</td>
<td align="center">&#x2b;4.5 &#xb1; 9.7</td>
<td align="center">&#x2b;3.3 &#xb1; 8.1</td>
<td align="center">24</td>
<td align="center">A</td>
</tr>
<tr>
<td align="center">Pocket</td>
<td align="center">&#x2b;2.2 &#xb1; 3.6</td>
<td align="center">&#x2b;3.4 &#xb1; 6.3</td>
<td align="center">24</td>
<td align="center">A</td>
</tr>
<tr>
<td align="center">Upper Arm</td>
<td align="center">&#x2b;1.9 &#xb1; 3.7</td>
<td align="center">&#x2b;0.5 &#xb1; 5.2</td>
<td align="center">24</td>
<td align="center">A</td>
</tr>
<tr>
<td rowspan="4" align="center">Intervent.</td>
<td align="center">OptiTrack</td>
<td align="center">2.4 &#xb1; 1.5</td>
<td align="center">17.5 &#xb1; 4.5</td>
<td align="center">15</td>
<td align="center">A</td>
</tr>
<tr>
<td align="center">Watch Only</td>
<td align="center">&#x2b;5.2 &#xb1; 6.0</td>
<td align="center">&#x2b;10.5 &#xb1; 7.8</td>
<td align="center">15</td>
<td align="center">A</td>
</tr>
<tr>
<td align="center">Pocket</td>
<td align="center">&#x2b;2.9 &#xb1; 4.3</td>
<td align="center">&#x2b;11.4 &#xb1; 10.7</td>
<td align="center">15</td>
<td align="center">A</td>
</tr>
<tr>
<td align="center">Upper Arm</td>
<td align="center">&#x2b;1.7 &#xb1; 5.2</td>
<td align="center">&#x2b;4.0 &#xb1; 5.6</td>
<td align="center">15</td>
<td align="center">A</td>
</tr>
<tr>
<td rowspan="2" align="center">Tele.</td>
<td align="center">OptiTrack</td>
<td align="center">4.5 &#xb1; 2.9</td>
<td align="center">59.8 &#xb1; 16.5</td>
<td align="center">29</td>
<td align="center">B</td>
</tr>
<tr>
<td align="center">Pocket</td>
<td align="center">&#x2b;1.8 &#xb1; 6.7</td>
<td align="center">&#x2b;13.6 &#xb1; 28.9</td>
<td align="center">28</td>
<td align="center">B</td>
</tr>
<tr>
<td rowspan="2" align="center">Drone</td>
<td align="center">SkyController</td>
<td align="center">-</td>
<td align="center">59.7 &#xb1; 27.8</td>
<td align="center">10</td>
<td align="center">B</td>
</tr>
<tr>
<td align="center">Pocket</td>
<td align="center">-</td>
<td align="center">&#x2212;19.2 &#xb1; 24.16</td>
<td align="center">10</td>
<td align="center">B</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Distance errors and time differences are denoted in relation to the baseline. For example, the handover distance in Watch Only mode was on average <inline-formula id="inf127">
<mml:math id="m132">
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>4.5</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>9.7</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm larger than when performing the same task with OptiTrack for motion capture. The Modality column indicates the utilized control modality from <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The Handover and Intervention tasks investigate all WearMoCap pose estimation modes Watch Only, Upper Arm, and Pocket when using control Modality A and compare to OptiTrack as the baseline method. Expectedly, the Watch Only mode is more error-prone than its counterparts, evidenced by its higher handover distance (&#x2b;4.5 cm) and intervention placement distance (&#x2b;5.2 cm). The Upper Arm mode is the most accurate with an increase below &#x2b;2 cm in both tasks. These results are consistent with the evaluation on test data in <xref ref-type="sec" rid="s3-2">Section 3.2</xref>. It is also noteworthy that the Pocket mode outperformed the Watch Only mode in our distance metric. This is because it offers an additional degree of freedom to fine-tune positioning. However, due to this additional degree of freedom, the Pocket mode also incurred longer task completion times, because subjects had to balance changes in arm motion with changes in body orientation.</p>
<p>The Teleoperation and Drone tasks applied control Modality B, which relies on body orientation estimates in Pocket mode. Pocket mode with Modality B was highly accurate in terms of distance metric, with an increase of only <inline-formula id="inf128">
<mml:math id="m133">
<mml:mrow>
<mml:mn>1.8</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>6.7</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> cm from the baseline OptiTrack for teleoperation. As in previous tasks, control through body orientation caused an increase in the completion times when compared to OptiTrack. However, when comparing to the SkyController remote control operation with non-expert drone pilots, WearMoCap incurred significantly shorter task completion times (<inline-formula id="inf129">
<mml:math id="m134">
<mml:mrow>
<mml:mn>19.2</mml:mn>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>24.16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> s). This finding is limited to our specific drone task but still complements the finding of <xref ref-type="bibr" rid="B15">Macchini et al. (2020)</xref> that motion capture control can be more intuitive for inexperienced pilots.</p>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>Reflecting on our presented results, this section discusses WearMoCap in detail: <xref ref-type="sec" rid="s4-1">Section 4.1</xref> contrasts all three WearMoCap modalities with their benefits and limitations. <xref ref-type="sec" rid="s4-2">Section 4.2</xref> discusses the broader significance of our framework, its limitations, and future work. <xref ref-type="sec" rid="s4-3">Section 4.3</xref> concludes this paper.</p>
<sec id="s4-1">
<title>4.1 Modality trade-offs</title>
<p>Given the observed differences in model accuracy on test data, and varying real-robot task performance for each WearMoCap mode, we discuss the following trade-offs for their application.</p>
<sec id="s4-1-1">
<title>4.1.1 Watch only</title>
<p>Using only a smartwatch is the most convenient in terms of availability and setup, but the real-robot task results demonstrate a considerable increase in placement deviations and completion times in contrast to other modes. The applicability of the Watch Only mode depends on the task. If the application requires high-fidelity teleoperation control to perform pick-and-place tasks, the prediction deviations of about 10 cm are too large to be practical. Even though users were able to complete the Intervention task in Watch Only mode, the teleoperation required patience and users were not in full control. On the contrary, in a handover task, the human can compensate for the final centimeters by reaching. In such lower-fidelity applications, being able to replace an optical motion capture system with a single smartwatch is promising for future work.</p>
</sec>
<sec id="s4-1-2">
<title>4.1.2 Upper arm</title>
<p>While an upper-arm fitness strap is widely used and available, it adds an extra step compared to the other two modes. Nevertheless, the increase in accuracy of arm pose tracking with two IMUs has previously been assessed in <xref ref-type="bibr" rid="B34">Yang et al. (2016)</xref>; <xref ref-type="bibr" rid="B10">Joukov et al. (2017)</xref>, and is confirmed by our results. Out of all WearMoCap modes, the Upper Arm mode is the most accurate on the test data and incurs the smallest deviations in our real-robot task completion times and placement accuracy compared to baselines. The relatively small placement deviations of below 2 cm suggest that this mode can be a viable alternative to robot control through motion capture from OptiTrack or Virtual Reality hardware when ease-of-setup is a concern and ubiquity matters.</p>
</sec>
<sec id="s4-1-3">
<title>4.1.3 Pocket</title>
<p>The Pocket mode allows for the most seamless experience because users simply put the phone in their pocket and are free to turn their body. This is in contrast to the Watch Only mode, where users have to maintain a constant forward-facing direction. Our Handover and Intervention real-robot tasks indicate that the additional tracking of body orientation enables users to exert more precise control. However, this mode is less precise than the arm pose estimates in the Upper Arm mode. The Pocket mode, therefore, balances the precision and convenience of the other two modes.</p>
</sec>
</sec>
<sec id="s4-2">
<title>4.2 Significance and limitations</title>
<p>WearMoCap enables motion capture from smartwatches and smartphones. Apart from the atmospheric pressure sensor and microphone data, collected measurements are identical to those provided by other IMU devices designed for motion capture purposes, e.g., Movella&#x2019;s XSens Suite (<xref ref-type="bibr" rid="B23">Roetenberg et al., 2009</xref>). The significant difference between WearMoCap and established IMU solutions like XSens lies in the ubiquity and familiarity of smart devices for the average user. Smartphones and smartwatches are more widespread than customized IMU units, and a large population is familiar with starting and using apps on Android OS. While our motion capture methodology would perform equally well with customized IMUs (<xref ref-type="bibr" rid="B20">Prayudi and Kim, 2012</xref>; <xref ref-type="bibr" rid="B1">Beange et al., 2018</xref>; <xref ref-type="bibr" rid="B13">Li et al., 2021</xref>), it is the ubiquity of smart devices that makes WearMoCap attractive for future research into low-barrier robot control interfaces.</p>
<p>A limitation of WearMoCap is that, because of their reliance on IMUs, the global orientation estimates of smartwatches and smartphones can be subject to sensor drift. While the virtual orientation sensors of Android or Wear OS are robust to short-lived disturbances, e.g., moving a magnet past the device, slower long-term shifts can cause considerable offsets. The Android OS estimates device orientations through sensor fusion from accelerometer, magnetometer, and gyroscope using an Extended Kalman filter. Gyroscope drift is compensated by the gravity estimate from the accelerometer and the magnetic North from the magnetometer. As a result, the orientation is mostly subject to drift around the yaw axis due to shifts in the measured magnetic North. Our training and test data includes recording sessions of up to 10 min duration. Further, during the real-robot tasks, pose estimations typically stayed robust for 15 min or longer, but we had to ask subjects to recalibrate in about 10% of the instances. To mitigate sensor drift during longer sessions, a promising direction for future work involves utilizing our employed stochastic forward passes, which result in widening solution distributions when unrealistic changes or unergonomic angles occur (also depicted in <xref ref-type="fig" rid="F5">Figure 5</xref>). This way of recognizing unergonomic or impossible angles from wide distributions can help mitigate sensor drift by automatically triggering recalibration.</p>
<p>Another source of drift is the sensor-to-segment misalignment, i.e., if the watch is loosely worn and slips post-calibration, we expect the tracking accuracy to be affected. In our experiments, we fitted the subjects with tightly strapped watches and phones to minimize this issue. However, in the future, we can look at better understanding the impact of sensor-to-segment misalignment and adopt techniques to correct it.</p>
<p>A further potential limitation common to phone-based apps is that a major Operating System (OS) update, e.g., Android 12 to 13, could break our application if it is not updated properly to handle the OS change. However, some of our older tested devices, e.g., the OnePlus N100, do not receive long-term support anymore and will not undergo major updates in the future. It is unlikely WearMoCap will break on such older devices. Android OS updates for newer devices are rolled out slowly. To handle these updates in the long run, we have enabled the Issue Tracking function in the GitHub repository.</p>
<p>Another limitation is that our method assumes default arm lengths. While this is representative of the population that we tested with, unusually long or short arm lengths might adversely affect the tracking performance. Future work will investigate the effects of large variations in anthropometry. We publish WearMoCap as open source with this work to facilitate such future investigations. Lastly, we expect that we can improve the tracking performance by adding more subjects with varied motions and differing limb lengths.</p>
</sec>
<sec id="s4-3">
<title>4.3 Conclusion</title>
<p>This work presented WearMoCap, an extensively documented open-source library for ubiquitous motion capture and robot control from a smartwatch and smartphone. It features three motion capture modes: Watch Only requires the least setup; Upper Arm is the most precise; and Pocket is the most flexible. We benchmarked these modes on large-scale datasets collected from experiments with multiple human subjects and devices. To evaluate their practical use, we demonstrated and discussed their application in four real-robot tasks. Results show that, when chosen for the appropriate task, WearMoCap serves as a ubiquitous and viable alternative to the costly state-of-the-art motion capture systems. Future work involves evaluating the applicability of WearMoCap in more scenarios and implementing strategies for mitigating sensor drift. To this end, the WearMoCap library is published as open source together with step-by-step instructions and all training and test data.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="s6">
<title>Ethics statement</title>
<p>The studies involving humans were approved by The Institutional Review Board of Arizona State University. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study. Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>FCW: Writing&#x2013;original draft, Conceptualization, Methodology, Project administration. NK: Conceptualization, Writing&#x2013;review and editing. OA: Writing&#x2013;review and editing. HB: Conceptualization, Supervision, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research, authorship, and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>NK and OA were employed by the Corporate Functions-R&#x26;D, Procter and Gamble.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2024.1478016/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2024.1478016/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Video1.mp4" id="SM1" mimetype="application/mp4" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://cloud.google.com/speech-to-text">https://cloud.google.com/speech-to-text</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Beange</surname>
<given-names>K. H.</given-names>
</name>
<name>
<surname>Chan</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Graham</surname>
<given-names>R. B.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Evaluation of wearable imu performance for orientation estimation and motion tracking</article-title>,&#x201d; in <source>2018 IEEE international symposium on medical measurements and applications (MeMeA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Darvish</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Penco</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cisneros</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Pratt</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yoshida</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Teleoperation of humanoid robots: a survey</article-title>. <source>IEEE Trans. Robotics</source> <volume>39</volume>, <fpage>1706</fpage>&#x2013;<lpage>1727</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2023.3236952</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Desmarais</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mottet</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Slangen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Montesinos</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A review of 3d human pose estimation algorithms for markerless motion capture</article-title>. <source>Comput. Vis. Image Underst.</source> <volume>212</volume>, <fpage>103275</fpage>. <pub-id pub-id-type="doi">10.1016/j.cviu.2021.103275</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>DeVrio</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mollyn</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Harrison</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Smartposer: arm pose estimation with a smartphone and smartwatch using uwb and imu data</article-title>,&#x201d; in <source>Proceedings of the 36th annual ACM symposium on user interface software and technology</source> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>UIST &#x2019;23</publisher-name>). <pub-id pub-id-type="doi">10.1145/3586183.3606821</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wetzstein</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Humanplus: humanoid shadowing and imitation from humans</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2406.10454">https://arxiv.org/abs/2406.10454</ext-link> (Accessed July 26, 2024)</comment>.</citation>
</ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gal</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ghahramani</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Dropout as a bayesian approximation: representing model uncertainty in deep learning</article-title>,&#x201d; in <conf-name>Proceedings of the 33rd International Conference on International Conference on Machine Learning - Volume 48</conf-name> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>JMLR.org, ICML</publisher-name>), <volume>16</volume>, <fpage>1050</fpage>&#x2013;<lpage>1059</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hauser</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Watson</surname>
<given-names>E. N.</given-names>
</name>
<name>
<surname>Bae</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bankston</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Behnke</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Borgia</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Analysis and perspectives on the ana avatar xprize competition</article-title>. <source>Int. J. Soc. Robotics</source>. <pub-id pub-id-type="doi">10.1007/s12369-023-01095-w</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hindle</surname>
<given-names>B. R.</given-names>
</name>
<name>
<surname>Keogh</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>Lorimer</surname>
<given-names>A. V.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Inertial-based human motion capture: a technical summary of current processing methodologies for spatiotemporal and kinematic measures</article-title>. <source>Appl. Bionics Biomechanics</source> <volume>2021</volume>, <fpage>6628320</fpage>. <pub-id pub-id-type="doi">10.1155/2021/6628320</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kaufmann</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Aksan</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Black</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Hilliges</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Pons-Moll</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep inertial poser: learning to reconstruct human pose from sparse inertial measurements in real time</article-title>. <source>ACM Trans. Graph. (TOG)</source> <volume>37</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1145/3272127.3275108</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Joukov</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>&#x106;esi&#x107;</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Westermann</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Markovi&#x107;</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kuli&#x107;</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Petrovi&#x107;</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Human motion estimation on lie groups using imu measurements</article-title>,&#x201d; in <source>2017 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1965</fpage>&#x2013;<lpage>1972</lpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>B.-G.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>B.-L.</given-names>
</name>
<name>
<surname>Chung</surname>
<given-names>W.-Y.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Smartwatch-based driver alertness monitoring with wearable motion and physiological sensor</article-title>,&#x201d; in <source>2015 37th annual international conference of the IEEE engineering in medicine and biology society (EMBC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>6126</fpage>&#x2013;<lpage>6129</lpage>.</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Joo</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Mocap everyone everywhere: lightweight motion capture with smartwatches and a head-mounted camera</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>1091</fpage>&#x2013;<lpage>1100</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Real-time human motion capture based on wearable inertial sensor networks</article-title>. <source>IEEE Internet Things J.</source> <volume>9</volume>, <fpage>8953</fpage>&#x2013;<lpage>8966</lpage>. <pub-id pub-id-type="doi">10.1109/jiot.2021.3119328</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chomsin</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Real-time tracking of smartwatch orientation and location by multitask learning</article-title>,&#x201d; in <conf-name>Proceedings of the 20th ACM Conference on Embedded Networked Sensor Systems</conf-name>, <conf-loc>New York, NY, United States</conf-loc> (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>Association for Computing Machinery, SenSys</publisher-name>), <fpage>120</fpage>&#x2013;<lpage>133</lpage>. <pub-id pub-id-type="doi">10.1145/3560905.3568548</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Macchini</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Havy</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Weber</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schiano</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Floreano</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Hand-worn haptic interface for drone teleoperation</article-title>,&#x201d; in <source>2020 IEEE international conference on robotics and automation (ICRA)</source>, <fpage>10212</fpage>&#x2013;<lpage>10218</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA40945.2020.9196664</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Malleson</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gilbert</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Trumble</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Collomosse</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hilton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Volino</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Real-time full-body motion capture from video and imus</article-title>,&#x201d; in <conf-name>2017 international conference on 3D vision (3DV)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>449</fpage>&#x2013;<lpage>457</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mollyn</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Arakawa</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Goel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Harrison</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ahuja</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Imuposer: full-body pose estimation using imus in phones, watches, and earbuds</article-title>,&#x201d; in <conf-name>Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (Association for Computing Machinery)</conf-name>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1145/3544548.3581392</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nagym&#xe1;t&#xe9;</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kiss</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Application of optitrack motion capture systems in human movement analysis: a systematic literature review</article-title>. <source>Recent Innovations Mechatronics</source> <volume>5</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.17667/riim.2018.1/13</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Noh</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yoon</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>A decade of progress in human motion recognition: a comprehensive survey from 2010 to 2020</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>5684</fpage>&#x2013;<lpage>5707</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3350338</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Prayudi</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Design and implementation of imu-based human arm motion capture system</article-title>,&#x201d; in <conf-name>2012 IEEE International conference on mechatronics and automation</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>670</fpage>&#x2013;<lpage>675</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Raghavendra</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Sachin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Srinivas</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Talasila</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Design and development of a real-time, low-cost imu based human motion capture system</article-title>,&#x201d; in <conf-name>Computing and Network Sustainability: Proceedings of IRSCNS 2016</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>155</fpage>&#x2013;<lpage>165</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Robinson</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Tidd</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Campbell</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kuli&#x107;</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Corke</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Robotic vision for human-robot interaction and collaboration: a survey and systematic review</article-title>. <source>J. Hum.-Robot Interact.</source> <volume>12</volume>, <fpage>1</fpage>&#x2013;<lpage>66</lpage>. <pub-id pub-id-type="doi">10.1145/3570731</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roetenberg</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Luinge</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Slycke</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Xsens mvn: full 6dof human motion tracking using miniature inertial sensors</article-title>. <source>Xsens Motion Technol. BV, Tech. Rep.</source> <volume>1</volume>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Halilaj</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Markerless motion tracking with noisy video and imu data</article-title>. <source>IEEE Trans. Biomed. Eng.</source> <volume>70</volume>, <fpage>3082</fpage>&#x2013;<lpage>3092</lpage>. <pub-id pub-id-type="doi">10.1109/tbme.2023.3275775</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Topley</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Richards</surname>
<given-names>J. G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A comparison of currently available optoelectronic motion capture systems</article-title>. <source>J. Biomechanics</source> <volume>106</volume>, <fpage>109820</fpage>. <pub-id pub-id-type="doi">10.1016/j.jbiomech.2020.109820</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Villani</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Capelli</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Secchi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fantuzzi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Sabattini</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020a</year>). <article-title>Humans interacting with multi-robot systems: a natural affect-based approach</article-title>. <source>Aut. Robots</source> <volume>44</volume>, <fpage>601</fpage>&#x2013;<lpage>616</lpage>. <pub-id pub-id-type="doi">10.1007/s10514-019-09889-6</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Villani</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Righi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sabattini</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Secchi</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020b</year>). <article-title>Wearable devices for the assessment of cognitive effort for human&#x2013;robot interaction</article-title>. <source>IEEE Sensors J.</source> <volume>20</volume>, <fpage>13047</fpage>&#x2013;<lpage>13056</lpage>. <pub-id pub-id-type="doi">10.1109/JSEN.2020.3001635</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Walker</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Phung</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Chakraborti</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Szafir</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Virtual, augmented, and mixed reality for human-robot interaction: a survey and virtual design element taxonomy</article-title>. <source>J. Hum.-Robot Interact.</source> <volume>12</volume>, <fpage>1</fpage>&#x2013;<lpage>39</lpage>. <pub-id pub-id-type="doi">10.1145/3597623</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Olson</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>AprilTag 2: efficient and robust fiducial detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/RSJ international conference on intelligent robots and systems (IROS)</source>, <fpage>4193</fpage>&#x2013;<lpage>4198</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kurita</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kuang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Real-time limb motion tracking with a single imu sensor for physical therapy exercises</article-title>,&#x201d; in <source>2021 43rd annual international conference of the IEEE engineering in medicine and biology society (EMBC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>7152</fpage>&#x2013;<lpage>7157</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Weigend</surname>
<given-names>F. C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Amor</surname>
<given-names>H. B.</given-names>
</name>
</person-group> (<year>2023a</year>). <article-title>Probabilistic differentiable filters enable ubiquitous robot control with smartwatches</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2309.06606">https://arxiv.org/abs/2309.06606</ext-link> (Accessed July 26, 2024)</comment>.</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Weigend</surname>
<given-names>F. C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Sonawani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Vasudevan</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Ben Amor</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>iRoCo: intuitive robot control from anywhere using a smartwatch</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source>, <fpage>17800</fpage>&#x2013;<lpage>17806</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA57147.2024.10610805</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Weigend</surname>
<given-names>F. C.</given-names>
</name>
<name>
<surname>Sonawani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Drolet</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Amor</surname>
<given-names>H. B.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>Anytime, anywhere: human arm pose from smartwatch data for ubiquitous robot control and teleoperation</article-title>. In <conf-name>IEEE/RSJ International Conference on Intelligent Robots and Systems IROS</conf-name>. <fpage>3811</fpage>&#x2013;<lpage>3818</lpage>. <pub-id pub-id-type="doi">10.1109/IROS55552.2023.10341624</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Neural learning enhanced teleoperation control of baxter robot using imu based motion capture</article-title>,&#x201d; in <conf-name>2016 22nd International Conference on Automation and Computing (ICAC)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>389</fpage>&#x2013;<lpage>394</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>