<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2025.1649870</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Imitation-relaxation reinforcement learning for sparse badminton strikes via dynamic trajectory generation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Yuan</surname> <given-names>Yanyan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2993604/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tao</surname> <given-names>Yucheng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Cheng</surname> <given-names>Shaowen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2576661/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Liang</surname> <given-names>Yanhong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Jin</surname> <given-names>Yongbin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2659138/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Hongtao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2591845/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Center for X-Mechanics, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>ZJU-Hangzhou Global Scientific and Technological Innovation Center, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>State Key Laboratory of Fluid Power and Mechatronic System, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Institute of Applied Mechanics, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Long Jin, Lanzhou University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Hongyin Zhang, Westlake University, China</p>
<p>Huang Ou-Yang, Lanzhou University, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Yongbin Jin <email>yongbinjin&#x00040;zju.edu.cn</email></corresp>
<corresp id="c002">Hongtao Wang <email>htw&#x00040;zju.edu.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>02</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>19</volume>
<elocation-id>1649870</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Yuan, Tao, Cheng, Liang, Jin and Wang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Yuan, Tao, Cheng, Liang, Jin and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Robotic racket sports provide exceptional benchmarks for evaluating dynamic motion control capabilities in robots. Due to the highly non-linear dynamics of the shuttlecock, the stringent demands on robots&#x00027; dynamic responses, and the convergence difficulties caused by sparse rewards in reinforcement learning, badminton strikes remain a formidable challenge for robot systems. To address these issues, this study proposes DTG-IRRL, a novel learning framework for badminton strikes that integrates imitation-relaxation reinforcement learning with dynamic trajectory generation. The framework demonstrates significantly improved training efficiency and performance, achieving faster convergence and twice the landing accuracy. Analysis of the reward function within a specific parameter space hyperplane intuitively reveals the convergence difficulties arising from the inherent sparsity of rewards in racket sports and demonstrates the framework&#x00027;s effectiveness in mitigating local and slow convergence. Implemented on hardware with zero-shot transfer, the framework achieves a 90% hitting rate and a 70% landing accuracy, enabling sustained human-robot rallies. Cross-platform validation using the UR5 robot demonstrates the framework&#x00027;s generalizability while highlighting the requirement for high dynamic performance of robotic arms in racket sports.</p></abstract>
<kwd-group>
<kwd>reinforcement learning</kwd>
<kwd>robotic badminton</kwd>
<kwd>sparse reward</kwd>
<kwd>nonlinear dynamics</kwd>
<kwd>state prediction</kwd>
<kwd>trajectory generation</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="5"/>
<equation-count count="23"/>
<ref-count count="37"/>
<page-count count="14"/>
<word-count count="8140"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Interceptive robotic ball sports, including table tennis (<xref ref-type="bibr" rid="B3">B&#x000FC;chler et al., 2022</xref>), badminton (<xref ref-type="bibr" rid="B20">Mori et al., 2019</xref>), and tennis (<xref ref-type="bibr" rid="B35">Zaidi et al., 2023</xref>; <xref ref-type="bibr" rid="B14">Hattori et al., 2020</xref>), have served as critical testbeds for evaluating the dynamic performance of robots&#x02014;tasks that remain challenging even for skilled human players. These sports typically involve three stages: (1) ball trajectory prediction, (2) hitting strategy decision, and (3) robotic arm motion control. Accurate ball trajectory prediction is crucial for successful interception, while the hitting decision determines the post-impact trajectory and landing position. Real-time motion control ensures the racket reaches the desired state for an effective strike. Collectively, these stages impose stringent demands on precise trajectory prediction, robust hitting decisions, real-time motion control, and high dynamic responsiveness, underscoring the inherent challenges of fast-paced racket sports.</p>
<p>Accurate trajectory prediction intuitively relies on precise dynamics models for ball sports [table tennis (<xref ref-type="bibr" rid="B36">Zhao et al., 2015</xref>; <xref ref-type="bibr" rid="B23">M&#x000FC;lling et al., 2010</xref>), badminton (<xref ref-type="bibr" rid="B31">Waghmare et al., 2016</xref>)]. However, uncertainties in physical parameters often lead to substantial prediction errors, particularly pronounced in badminton due to its significant aerodynamic drag effects and extended flight trajectories (<xref ref-type="bibr" rid="B6">Cohen et al., 2014</xref>; <xref ref-type="bibr" rid="B5">Cohen and Clanet, 2016</xref>). While Kalman Filter-based approaches have been adopted to enhance accuracy in table tennis (<xref ref-type="bibr" rid="B28">Tebbe et al., 2019</xref>, <xref ref-type="bibr" rid="B29">2021</xref>), tennis (<xref ref-type="bibr" rid="B35">Zaidi et al., 2023</xref>), badminton (<xref ref-type="bibr" rid="B37">Zhi et al., 2022</xref>; <xref ref-type="bibr" rid="B32">Yang, 2022</xref>), and other ball (<xref ref-type="bibr" rid="B33">Yu et al., 2023</xref>; <xref ref-type="bibr" rid="B15">Hsiao and Kao, 2023</xref>) sports, many approaches simplify aerodynamics by neglecting nonlinear forces (air drag force or Magnus force) or assuming constant aerodynamic coefficients. These simplifications are particularly inadequate for precise badminton trajectory prediction, whose unique feathered structure results in highly nonlinear dynamics and varying drag coefficient due to feather deformation, leading to pronounced velocity decay (initial-to-terminal ratio up to 17.5; <xref ref-type="bibr" rid="B6">Cohen et al., 2014</xref>), 5&#x02013;10 times greater than others. These distinctive characteristics severely complicate accurate state prediction, posing unique challenges, especially over longer horizons.</p>
<p>Successful ball-hitting in robotic racket sports necessitates accurate hitting decisions and real-time motion control. Early approaches utilized collision and trajectory prediction models for calculating the desired racket state (<xref ref-type="bibr" rid="B33">Yu et al., 2023</xref>; <xref ref-type="bibr" rid="B28">Tebbe et al., 2019</xref>) and trajectory optimization (<xref ref-type="bibr" rid="B33">Yu et al., 2023</xref>; <xref ref-type="bibr" rid="B22">M&#x000FC;ller et al., 2011</xref>) for motion control, which are inherently limited by model parameter accuracy&#x02014;a limitation particularly critical in the badminton task due to the shuttlecock&#x00027;s highly nonlinear dynamics and variable drag coefficient. Recent advancements have employed reinforcement learning (DDPG <xref ref-type="bibr" rid="B29">Tebbe et al., 2021</xref>; <xref ref-type="bibr" rid="B12">Gao et al., 2022</xref>, TD3 <xref ref-type="bibr" rid="B13">Gao and Zell, 2023</xref>) or evolutionary search (<xref ref-type="bibr" rid="B7">D&#x00027;Ambrosio et al., 2023</xref>; <xref ref-type="bibr" rid="B11">Gao et al., 2020</xref>) for hitting decisions, or have jointly learned hitting strategies and joint-level motion control (<xref ref-type="bibr" rid="B1">Abeyruwan et al., 2023</xref>; <xref ref-type="bibr" rid="B8">D&#x00027;Ambrosio et al., 2024</xref>), achieving diverse table tennis playing styles. However, RL approaches often rely on sparse reward functions where the agent rarely sees a reward signal with random exploration (<xref ref-type="bibr" rid="B24">Nair et al., 2018</xref>). This inherent sparsity often impedes efficient exploration and leads to convergence to suboptimal policies, thereby requiring considerable iterative sim-to-real training. 
Learning from Demonstration (LfD) (<xref ref-type="bibr" rid="B21">Muelling et al., 2010</xref>; <xref ref-type="bibr" rid="B4">Chen et al., 2021</xref>; <xref ref-type="bibr" rid="B2">Akrour et al., 2018</xref>; <xref ref-type="bibr" rid="B16">Huang et al., 2016</xref>) mitigates this by imitating human behaviors from expert demonstrations, but it typically requires expensive datasets, and its generalization is inherently limited by the provided demonstrations.</p>
<p>Beyond control strategies, the dynamic capabilities of the robotic arm are also crucial. Commercial collaborative robotic arms are commonly employed, particularly in table tennis (<xref ref-type="bibr" rid="B29">Tebbe et al., 2021</xref>; <xref ref-type="bibr" rid="B12">Gao et al., 2022</xref>; <xref ref-type="bibr" rid="B1">Abeyruwan et al., 2023</xref>; <xref ref-type="bibr" rid="B9">Ding et al., 2022</xref>), where the required racket speeds and motion ranges are relatively low. However, as the fastest racket-based projectile sport with recorded smash velocities exceeding 137 m/s (<xref ref-type="bibr" rid="B26">Records, 2014</xref>), badminton imposes substantially more stringent requirements on the dynamic performance of robotic arms&#x02014;particularly in joint velocity (&#x0003E;24 rad/s) and acceleration (&#x0003E;600 rad/s<sup>2</sup>) (<xref ref-type="bibr" rid="B25">Rambely and Osman, 2005</xref>) for competitive-level returns&#x02014;compared to other robotic ball sports. To address these challenges, <xref ref-type="bibr" rid="B20">Mori et al. (2019</xref>, <xref ref-type="bibr" rid="B19">2018</xref>) developed a lightweight, high-speed robotic arm with pneumatic actuators, achieving a hitting success rate of 69.7%. Meanwhile, <xref ref-type="bibr" rid="B34">Yuan et al. (2025)</xref> also highlighted the critical impact of dynamic performance for badminton robots.</p>
<p>To address the convergence challenges of RL due to sparse rewards and the trajectory prediction challenges posed by the highly nonlinear dynamics in badminton, this study proposes a learning framework (DTG-IRRL) and a robot-badminton system for sparse robotic badminton striking, which integrates imitation-relaxation reinforcement learning (IRRL; <xref ref-type="bibr" rid="B18">Jin et al., 2022</xref>) with dynamic trajectory generation (DTG). The DTG generates a feasible arm reference trajectory as an initial hitting strategy through the prediction results of the shuttlecock&#x00027;s hitting time and point using the initial 10 frames of ball state, analogous to feedforward control. Then, the IRRL stage trains the arm motion controller, leveraging the generated reference trajectory as imitation targets. Exploiting the unimodal characteristics of the imitation reward function and dynamic reference trajectory adjustment of DTG, this framework significantly mitigates the challenges of convergence to local optima and slow convergence due to sparse reward and improves landing accuracy compared to baseline methods.</p>
<p>The framework has been experimentally validated through hardware implementation with zero-shot transfer. This system includes a 180 Hz motion capture system with &#x000B1;0.02 mm spatial resolution (<xref ref-type="bibr" rid="B10">FZMotion, 2025</xref>), a 4-DOF robotic arm exhibiting 234 m/s<sup>2</sup> peak end-effector acceleration, and a ball launcher. The project is available on <ext-link ext-link-type="uri" xlink:href="https://stylite-y.github.io/DTG-IRRL-For-Badminton/">https://stylite-y.github.io/DTG-IRRL-For-Badminton/</ext-link>. The key contributions include:</p>
<list list-type="bullet">
<list-item><p>We propose a learning framework (DTG-IRRL) that integrates imitation-relaxation reinforcement learning with dynamic trajectory generation, achieving faster convergence speeds and a higher success rate. An analysis of the reward distribution in a specific hyperplane intuitively demonstrates the framework&#x00027;s effectiveness in mitigating local and slow convergence due to sparse rewards.</p></list-item>
<list-item><p>Hardware implementation on physical platforms with zero-shot transfer demonstrates a 90% hitting rate and 70% landing accuracy, while enabling sustained human-robot rallies exceeding six consecutive strokes, as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p></list-item>
<list-item><p>Through comparative evaluations on the UR5 and KirinArm robotic arms, we validate the framework&#x00027;s cross-platform generalization and reveal the impact of robot arm dynamic performance on high-speed ball motion.</p></list-item>
</list>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Multi-round human-robot rallies in badminton tasks. <bold>(A)</bold> depicts the player serving; <bold>(B)</bold> shows the robot returning; <bold>(C)</bold> illustrates the player rallying; <bold>(D)</bold> features the robot counter-rallying. The orange and blue dots show the trajectory of the ball during the rally. The yellow and orange lines are the configuration of the robotic arm and human&#x00027;s racket.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0001.tif">
<alt-text>Four-panel sequence showing a badminton rally between a human player and a robot. Panel (a) depicts the player serving, as indicated by the motion of orange balls and labeled &#x0201C;Player Serves.&#x0201D; Panel (b) shows the robot returning, with blue balls and labeled &#x0201C;Robot Returns.&#x0201D; Panel (c) illustrates the player rallying, with orange balls labeled &#x0201C;Player Rallies.&#x0201D; Panel (d) features the robot counter-rallying, with blue balls labeled &#x0201C;Robot Counter-rallies.&#x0201D;</alt-text>
</graphic>
</fig>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methodology</title>
<sec>
<title>2.1 Overview</title>
<p>This section details the proposed learning framework (DTG-IRRL) for real-time badminton striking, illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. The framework consists of two components: dynamic trajectory generation (DTG) for robotic arm&#x00027;s reference trajectory generation and imitation relaxation reinforcement learning for motor controller training. Inspired by prior work (<xref ref-type="bibr" rid="B18">Jin et al., 2022</xref>), exploiting the unimodal characteristics of the imitation reward function, the IRRL method can guide the policy toward rapid and efficient convergence to mitigate the convergence challenges caused by sparse rewards. However, it typically relies on fixed references, limiting its adaptability for highly dynamic tasks. DTG-IRRL addresses this by dynamically generating a trajectory using a supervised prediction network and a reference trajectory generation module. The motion controller is trained by leveraging the generated trajectory as mimic targets in the imitation phase. The controller is then fine-tuned via reinforcement learning using task-specific rewards in the relaxation phase.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Overview of the DTG-IRRL framework. Prediction Network (red box): Predicts hitting time and the point of the shuttlecock at the target plane based on initial 10-frame observations; Reference Trajectory Generation (purple box): generates the arm&#x00027;s reference trajectory using the prediction results; IRRL (blue box): trains the motion controller by tracking the generated reference trajectory based on the states of both the shuttlecock and the robotic arm.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0002.tif">
<alt-text>Diagram showing a control system architecture for dynamic trajectory generation (DTG) and imitation-relaxation reinforcement learning (IRRL). The DTG includes a predict network and reference trajectory generation. The IRRL consists of a motion controller interacting with a physical robot arm. Inputs and outputs are labeled with variables such as ball observations and motion commands. A graph illustrates the arm&#x00027;s movement on an XYZ axis.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>2.2 Dynamic trajectory generation (DTG)</title>
<sec>
<title>2.2.1 Hitting time and point prediction network</title>
<p>The distinctive feathered design of shuttlecocks results in a non-parabolic flight trajectory governed by Tartaglia&#x00027;s curve. The shuttlecock&#x00027;s motion is governed by a nonlinear dynamic system that incorporates gravitational and velocity-dependent aerodynamic drag forces (<xref ref-type="bibr" rid="B6">Cohen et al., 2014</xref>; <xref ref-type="bibr" rid="B5">Cohen and Clanet, 2016</xref>), as detailed in <xref ref-type="disp-formula" rid="E1">Equation 1</xref>. In simulation, the velocity-dependent air drag term (the second term on the right side of <xref ref-type="disp-formula" rid="E1">Equation 1</xref>) is applied to the shuttlecock at each step, in addition to gravity, to simulate the real-world dynamics. Here, <italic>M</italic>, &#x003C1;, <italic>C</italic><sub><italic>D</italic></sub>, <italic>R, U</italic> respectively denote shuttlecock mass, air density, drag coefficient, geometric radius, and speed magnitude, while <bold>g</bold>, <bold>U</bold> represent the gravitational acceleration vector and the instantaneous velocity vector.</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mstyle mathvariant="bold"><mml:mtext>U</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mstyle mathvariant="bold"><mml:mtext>g</mml:mtext></mml:mstyle><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mi>&#x003C1;</mml:mi><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mi>&#x003C0;</mml:mi><mml:msup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mi>U</mml:mi><mml:mstyle mathvariant="bold"><mml:mtext>U</mml:mtext></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Using initial 10-frame sequences of shuttlecock observations <inline-formula><mml:math id="M2"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mn>9</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, we predict the hitting point and time <inline-formula><mml:math id="M3"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> of the shuttlecock at user-defined hitting planes (y=0.25) based on the supervised learning network (<xref ref-type="fig" rid="F2">Figure 2</xref>), where <inline-formula><mml:math id="M4"><mml:msubsup><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x02219;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, and <inline-formula><mml:math id="M5"><mml:msub><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x02219;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denote the 3D position and linear velocity of the shuttlecock and the elapsed time since service at the i-th frame, respectively. The network outputs <bold>x</bold><sub><italic>t</italic><sub><italic>c</italic></sub></sub> and <italic>t</italic><sub><italic>c</italic></sub> denote the hitting point and time of the ball at the user-defined plane.</p>
<p>The training dataset was generated in simulation. During flight, the shuttlecock&#x00027;s unique feather structure causes its posture to flip, oscillate, and subsequently stabilize within approximately 130 ms after impact (<xref ref-type="bibr" rid="B6">Cohen et al., 2014</xref>), leading to variations in the drag coefficient. Therefore, we implement domain randomization to randomly sample the drag coefficient within [0.62, 0.69] during the initial 130 ms of flight for each simulation cycle and hold it constant thereafter, in both the prediction network training dataset collection and the RL training process, to capture the inherent nonlinear dynamics of the shuttlecock. The prediction network employs an MLP network with two hidden layers of 256 units each. Training minimizes the mean squared error (MSE) between the predicted and actual hitting point and time.</p>
</sec>
<sec>
<title>2.2.2 Reference trajectory generation</title>
<p>Simultaneously learning both hitting strategy and motion control using RL is challenging due to the sparse reward function. To mitigate this, we calculate the robotic arm&#x00027;s reference trajectory based on trajectory prediction results via reference trajectory generation, which serves as an initial hitting strategy. This strategy implicitly provides initial solutions for desired racket orientation, position, and hitting time, analogous to feedforward control. This initial guidance significantly improves training efficiency and accelerates convergence.</p>
<p>We employed a sigmoid-based trajectory (<xref ref-type="disp-formula" rid="E2">Equation 2</xref>) that guarantees C<sup>2</sup> continuity in joint space, preventing torque fluctuations. For the 4-DOF robotic arm, target joint angles &#x00398;<sup><italic>tar</italic></sup> &#x02208; &#x0211D;<sup>4</sup> can be computed via inverse kinematics from <inline-formula><mml:math id="M6"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, with the wrist pitch angle fixed for strike consistency. The reference trajectory is fully determined by three parameters: initial joint angles &#x00398;<sup><italic>init</italic></sup>, target angles &#x00398;<sup><italic>tar</italic></sup>, and hitting time <italic>t</italic><sub><italic>c</italic></sub>, shown in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>, where <italic>a</italic><sub><italic>i</italic></sub>, <italic>b</italic><sub><italic>i</italic></sub>, <italic>c</italic><sub><italic>i</italic></sub>, <italic>d</italic><sub><italic>i</italic></sub> are the parameters of the function.</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mtext>&#x02003;&#x000A0;if&#x000A0;t</mml:mtext><mml:mo>&#x02264;</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>02</mml:mn><mml:mtext class="textrm" mathvariant="normal">&#x000A0;s</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo 
stretchy="false">{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E3"><label>(3)</label><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>20</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
</sec>
<sec>
<title>2.3 Imitation-relaxation reinforcement learning (IRRL)</title>
<p>Leveraging the generated trajectory as imitation targets, the two-stage imitation-relaxation reinforcement learning framework (<xref ref-type="bibr" rid="B18">Jin et al., 2022</xref>) is employed to accelerate policy convergence and avoid local optima. In the imitation stage, the policy learns to mimic trajectories generated by the DTG module (shown in <xref ref-type="fig" rid="F2">Figure 2</xref>). In the relaxation stage, policies are refined through reward shaping to achieve precise striking and landing control.</p>
<sec>
<title>2.3.1 Training environment</title>
<p>To learn the badminton striking strategy, we model the task as a finite-horizon, discounted Markov Decision Process (MDP) <inline-formula><mml:math id="M9"><mml:mi mathvariant="script">M</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi mathvariant="script">S</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="script">A</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="script">P</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="script">R</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:math></inline-formula>, where <inline-formula><mml:math id="M10"><mml:mrow><mml:mi mathvariant="script">S</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="script">A</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003B3;</mml:mi></mml:mrow></mml:math></inline-formula> are the state space, action space, and discount factor, respectively, <inline-formula><mml:math id="M11"><mml:mrow><mml:mi mathvariant="script">P</mml:mi></mml:mrow></mml:math></inline-formula> denotes the state transition dynamics: <inline-formula><mml:math id="M12"><mml:mrow><mml:mi mathvariant="script">S</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi mathvariant="script">A</mml:mi><mml:mo>&#x02192;</mml:mo><mml:mi mathvariant="script">S</mml:mi></mml:mrow></mml:math></inline-formula>, and each transition is rewarded with a reward function <italic>r</italic>: <inline-formula><mml:math id="M13"><mml:mrow><mml:mi mathvariant="script">S</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi mathvariant="script">A</mml:mi><mml:mo>&#x02192;</mml:mo><mml:mi mathvariant="script">R</mml:mi></mml:mrow></mml:math></inline-formula>. We use MuJoCo as the physics engine for simulation. The policy, represented by an MLP network with two hidden layers of 512 units each, was trained with the PPO algorithm (<xref ref-type="bibr" rid="B27">Schulman et al., 2017</xref>) in the Stable Baselines3 package.</p>
</sec>
<sec>
<title>2.3.2 State and action</title>
<p>The state <italic>s</italic> is an 18-dimensional vector defined as <inline-formula><mml:math id="M14"><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x02219;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x02219;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, where <inline-formula><mml:math id="M15"><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x02219;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x02219;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> are the joint angles and velocities of the robotic arm, the ball position and linear velocity, and the previous action, respectively. The real-time position and velocity of the shuttlecock are received as feedback states to compensate for the prediction deviations caused by the nonlinear dynamics and variations in the drag coefficient. In simulation, the initial position and velocity ranges of the shuttlecock are as follows: <italic>x</italic> &#x02208; [&#x02212;0.12, &#x02212;0.14], <italic>y</italic> &#x02208; [3.8, 4.2], <italic>z</italic> &#x02208; [0.6, 0.8], <italic>V</italic><sub><italic>x</italic></sub> &#x02208; [&#x02212;0.8, 0.8], <italic>V</italic><sub><italic>y</italic></sub> &#x02208; [&#x02212;10, &#x02212;7.5], <italic>V</italic><sub><italic>z</italic></sub> &#x02208; [6.0, 7.5]. The action space <bold>a</bold> is the desired joint angle. To facilitate learning, we train the policy to infer the desired angle around the default angle of the robotic arm &#x003B8;<sub><italic>default</italic></sub>. Therefore, the desired joint angle can be calculated by</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>k</mml:mi><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>2.3.3 Imitation stage reward</title>
<p>For the imitation stage, the policy learns to track reference trajectories through a composite reward function consisting of the joint and velocity trajectory imitation term <italic>r</italic><sup><italic>m</italic></sup>, the action smoothness and power penalty term <italic>r</italic><sup>&#x003C4;</sup>, and the collision penalty <italic>r</italic><sup><italic>c</italic></sup>. Each reward is shaped using a Gaussian kernel and normalized to (0, 1), as shown in <xref ref-type="disp-formula" rid="E5">Equation 5</xref>. <bold>&#x003C9;</bold> &#x0003D; [&#x003C9;<sub><italic>m</italic></sub>, &#x003C9;<sub>&#x003C4;</sub>, &#x003C9;<sub><italic>c</italic></sub>, &#x003C9;<sub><italic>sp</italic></sub>] is the reward weight coefficient. For all undesirable interactions excluding robot-ball contact and ball-ground impact, <italic>r</italic><sup><italic>c</italic></sup> will return a negative value of &#x02212;1.</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M17"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mi>m</mml:mi></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mi>c</mml:mi></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mi>r</mml:mi><mml:mi>m</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:mn>1.75</mml:mn><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>/</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mi>m</mml:mi></mml:msub><mml:msup><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:mn>0.75</mml:mn><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover 
accent='true'><mml:mi>q</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>/</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mi>m</mml:mi></mml:msub><mml:msup><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mi>r</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:mn>2.0</mml:mn><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003C4;</mml:mi><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>/</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003C4;</mml:mi></mml:mstyle><mml:mi>m</mml:mi></mml:msub><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>q</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mi>m</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:msup><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msup><mml:mo>+</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mn>1.0</mml:mn><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mo stretchy="false">&#x02016;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' 
mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:msup><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:mn>1.0</mml:mn><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>0.5</mml:mn><mml:mo stretchy="false">&#x02016;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x02212;</mml:mo><mml:mn>2</mml:mn><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:msup><mml:mo stretchy="false">&#x02016;</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mi>r</mml:mi><mml:mi>c</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>if&#x000A0;no&#x000A0;collision</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>otherwise</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>2.3.4 Relaxation stage reward</title>
<p>For the relaxation stage, the sparse task-specific rewards <italic>r</italic><sup><italic>sp</italic></sup> are introduced to guide the policy in achieving badminton striking. <italic>r</italic><sup><italic>sp</italic></sup> contains two components: <italic>r</italic><sup><italic>h</italic></sup> and <italic>r</italic><sup><italic>l</italic></sup>, where <italic>r</italic><sup><italic>h</italic></sup> is activated upon racket-shuttlecock contact, providing a positive reward, and <italic>r</italic><sup><italic>l</italic></sup> is triggered when the shuttlecock lands within the target area, yielding a positive reward. The total reward shown in <xref ref-type="disp-formula" rid="E6">Equation 6</xref> is employed to train the policy in the relaxation stage.</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M18"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;</mml:mtext><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mi>m</mml:mi></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:msup><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003C9;</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:msup><mml:mi>r</mml:mi><mml:mi>c</mml:mi></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mi>r</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mn>250</mml:mn><mml:msup><mml:mi>r</mml:mi><mml:mi>h</mml:mi></mml:msup><mml:mo>+</mml:mo><mml:mn>250</mml:mn><mml:msup><mml:mi>r</mml:mi><mml:mi>l</mml:mi></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;</mml:mtext><mml:msup><mml:mi>r</mml:mi><mml:mi>h</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>if&#x000A0;racket&#x000A0;collides&#x000A0;with&#x000A0;ball</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mtext>otherwise</mml:mtext></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:msup><mml:mi>r</mml:mi><mml:mi>l</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>if&#x000A0;ball&#x000A0;lands&#x000A0;in&#x000A0;target&#x000A0;area</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mtext>otherwise</mml:mtext></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
</sec>
<sec>
<title>2.4 Domain randomization</title>
<p>To enhance the robustness of the policy trained in simulation and overcome the sim-to-real gap, we implement domain randomization (<xref ref-type="bibr" rid="B17">Hwangbo et al., 2019</xref>) for the kinematic and dynamic parameters of both the robotic arm and the shuttlecock. We randomize the initial angle, PD gains of the robotic arm, initial position and velocity of the ball, air drag coefficient, and system delay at the beginning of each rollout. Meanwhile, because the motion capture system introduces an approximate delay of 10 ms, we introduce a random delay of 0&#x02013;20 ms to the shuttlecock&#x00027;s state observation during the training stage to enhance the controller&#x00027;s robustness to sensor latency. Additionally, noise perturbations are introduced to both network observations and robotic arm joint torques to enhance perceptual realism and controller robustness. The ranges for randomization of each parameter and noise are specified in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Domain randomization ranges.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Parameters</bold></th>
<th valign="top" align="center"><bold>Range</bold></th>
<th valign="top" align="left"><bold>Unit</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Kp factor</td>
<td valign="top" align="center">[0.9, 1.1]</td>
<td valign="top" align="left">/</td>
</tr>
<tr>
<td valign="top" align="left">Kd factor</td>
<td valign="top" align="center">[0.9, 1.1]</td>
<td valign="top" align="left">/</td>
</tr>
<tr>
<td valign="top" align="left">Initial Joint angle rand</td>
<td valign="top" align="center">[&#x02212;0.1, 0.1]</td>
<td valign="top" align="left"><italic>rad</italic></td>
</tr>
<tr>
<td valign="top" align="left">Initial ball position rand</td>
<td valign="top" align="center">[&#x02212;0.1, 0.1]</td>
<td valign="top" align="left"><italic>m</italic></td>
</tr>
<tr>
<td valign="top" align="left">Air drag coefficient</td>
<td valign="top" align="center">[0.65, 0.69]</td>
<td valign="top" align="left">/</td>
</tr>
<tr>
<td valign="top" align="left">System delay</td>
<td valign="top" align="center">[0, 20]</td>
<td valign="top" align="left"><italic>ms</italic></td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Initial ball velocity</td>
<td valign="top" align="center"><italic>v</italic><sub><italic>x</italic></sub> &#x0003D; [&#x02212;0.8, 0.8]</td>
<td valign="top" align="left" rowspan="3"><italic>m</italic>/<italic>s</italic></td>
</tr>
<tr>
<td valign="top" align="center"><italic>v</italic><sub><italic>y</italic></sub> &#x0003D; [&#x02212;10.0, &#x02212;7.5]</td>
</tr>
<tr>
<td valign="top" align="center"><italic>v</italic><sub><italic>z</italic></sub> &#x0003D; [6.0, 7.0]</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3">
<title>3 Experiment and results</title>
<sec>
<title>3.1 Experiment platform</title>
<sec>
<title>3.1.1 Robot-badminton system</title>
<p>The proposed framework is implemented on a high-dynamic 4-DOF robotic arm (KirinArm) with a 500 Hz control frequency, as shown in <xref ref-type="fig" rid="F3">Figure 3A</xref>. A programmable shuttlecock launcher is implemented, featuring speed control in the range of 5&#x02013;30 m/s and an adjustable angle of &#x000B1;30&#x000B0; (<xref ref-type="fig" rid="F3">Figure 3C</xref>).</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p><bold>(A)</bold> The 4-DOF robotic arm named KirinArm. <bold>(B)</bold> Motion capture system. <bold>(C)</bold> Shuttlecock launcher.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0003.tif">
<alt-text>(A) Robotic arm holding a slender, elongated object diagonally upward. (B) Two cameras positioned in front of a computer monitor displaying a 3D modeling interface. (C) Small telescope mounted on a tripod with an angled eyepiece.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>3.1.2 Motion capture system</title>
<p>The FZMotion optical motion capture system (<xref ref-type="bibr" rid="B10">FZMotion, 2025</xref>) is deployed for real-time shuttlecock tracking, using a 16-camera array operating at a 180 Hz sampling frequency with 2048 &#x000D7; 1536 resolution (<xref ref-type="fig" rid="F3">Figure 3B</xref>).</p>
</sec>
<sec>
<title>3.1.3 Simulation platform</title>
<p>The policy is trained on a workstation with an AMD Ryzen Threadripper 3970X &#x00040; 2.20 GHz, and an NVIDIA RTX 3090Ti GPU. The policy achieved convergence within approximately 10 hours through parallel training across 60 environments for 36,000 iterations. The physical simulation frequency and control frequency are 1000 Hz and 500 Hz, respectively. The policy is trained with the weights <bold>&#x003C9;</bold> &#x0003D; [0.4, 0.4, 0.1, 0.1]. The detailed hyperparameters are shown in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Hyperparameters for PPO and neural network.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Parameters</bold></th>
<th valign="top" align="center"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Number of environments</td>
<td valign="top" align="center">60</td>
</tr>
<tr>
<td valign="top" align="left">Learning epochs</td>
<td valign="top" align="center">8</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate</td>
<td valign="top" align="center">0.0001</td>
</tr>
<tr>
<td valign="top" align="left">Gamma</td>
<td valign="top" align="center">0.995</td>
</tr>
<tr>
<td valign="top" align="left">Lambda</td>
<td valign="top" align="center">0.99</td>
</tr>
<tr>
<td valign="top" align="left">Number of batches</td>
<td valign="top" align="center">4</td>
</tr>
<tr>
<td valign="top" align="left">Network hidden layers</td>
<td valign="top" align="center">[512, 512]</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec>
<title>3.2 Shuttlecock prediction results analysis</title>
<p>The accuracy of the prediction network determines the initial reference trajectory precision, while its inference time affects training efficiency. To train the prediction network, we generated 22,000 shuttlecock trajectories with randomized initial positions and velocities in simulation to ensure data diversity. For real-world validation, 206 sets of real trajectories were collected, with the first 10 frames of each serving as input to test the shuttlecock&#x00027;s hitting point and time error at the user-defined plane.</p>
<p>The proposed prediction network achieved average position errors of 0.040 m (X) and 0.078 m (Z), with a 0.024 s time error. In comparison, the physics-based model (<xref ref-type="bibr" rid="B31">Waghmare et al., 2016</xref>) exhibited prediction errors approximately 4-fold larger (0.21 m in X, 0.26 m in Z), while an EKF-RBF method (<xref ref-type="bibr" rid="B37">Zhi et al., 2022</xref>) yielded landing position prediction errors of 0.08 m (X) and 0.15 m (Y), exhibiting maximum errors 2 times that of ours. The network trained on simulated data with a fixed drag coefficient (<xref ref-type="bibr" rid="B32">Yang, 2022</xref>) resulted in an average spatial error of 0.13 m, 1.5 times that of ours. These results demonstrate the superior accuracy of the prediction network on the highly nonlinear dynamics. Furthermore, we conducted 100 trials to assess the computational cost of the reference trajectory generation module, which impacts the policy training efficiency. The average cost is 0.048 ms, significantly below the 2 ms control cycle.</p>
<p>It is worth noting that in the DTG-IRRL framework, the prediction network (red dotted frame in <xref ref-type="fig" rid="F2">Figure 2</xref>) serves to provide an a priori estimate of the shuttlecock&#x00027;s interception point. This estimate is used exclusively during the training phase to generate an initial reference trajectory for the robot arm. Crucially, the DTG module itself is a training-time component designed to guide the controller to learn the motion style of the reference trajectory and is not deployed on the hardware. The final policy deployed on the physical robot is solely the motion controller (as depicted in the blue frame in <xref ref-type="fig" rid="F2">Figure 2</xref>). Furthermore, during simulation, the prediction module operates independently for each shot, eliminating drift issues.</p>
</sec>
<sec>
<title>3.3 DTG-IRRL framework training results analysis</title>
<p>To validate the effectiveness of the DTG-IRRL framework for badminton striking, we conducted a comparative analysis across three training paradigms:</p>
<list list-type="bullet">
<list-item><p><bold>DTG-IRRL</bold>: integrates IRRL with the DTG module.</p></list-item>
<list-item><p><bold>IRRL-o</bold>: utilizes IRRL without the DTG module, relying on a fixed reference trajectory independent of the shuttlecock&#x00027;s state.</p></list-item>
<list-item><p><bold>DTG-RL</bold>: one-stage RL with DTG module, where the policy is trained directly via reward <xref ref-type="disp-formula" rid="E6">Equation 6</xref> without an imitation learning stage.</p></list-item>
</list>
<p>All implementations maintained identical network architectures and hyperparameters.</p>
<p>Quantitative comparisons (<xref ref-type="fig" rid="F4">Figure 4</xref>) demonstrate DTG-IRRL&#x00027;s superior performance, converging in 20,000 episodes with a peak reward of 1.62 during relaxation, outperforming both DTG-RL (converging in 25,000 episodes with a reward of 1.56) and IRRL-o (1.48). While IRRL-o matches DTG-IRRL&#x00027;s convergence rate, its significantly lower final reward highlights the prediction network&#x00027;s contribution.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>The reward curves of three frameworks: DTG-IRRL (red), DTG-RL (blue), and IRRL-o (green). The light blue shaded area indicates the imitation training stage, while the light red shaded area denotes the relaxation stage.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0004.tif">
<alt-text>Line graph showing reward versus episodes, highlighting two stages: Imitation Stage (0-15000 episodes) and Relaxation Stage (15000-35000 episodes). Three methods are compared: DTG-RL (blue), IRRL-o (green), and DTG-IRRL (red). DTG-IRRL peaks above 1.6 in the Relaxation Stage, showing the highest performance.</alt-text>
</graphic>
</fig>
<p>Then we evaluate the policy performance using three metrics: hitting rate (<italic>R</italic><sub><italic>h</italic></sub>), landing accuracy (<italic>R</italic><sub><italic>l</italic></sub>), and landing deviation (<italic>D</italic><sub><italic>e</italic></sub>) (<xref ref-type="disp-formula" rid="E7">Equation 7</xref>) between the average landing position and the target center, where lower values indicate better precision. The target landing area spans <italic>x</italic> &#x02208; [&#x02212;0.8, 0.8]<italic>m</italic> and <italic>y</italic> &#x02208; [3.5, 6.0]<italic>m</italic>, centered at [0.0, 4.75] m. Quantitative evaluation through 50 simulated trials shows DTG-IRRL&#x00027;s superior control performance, achieving 100% <italic>R</italic><sub><italic>h</italic></sub> with 80% <italic>R</italic><sub><italic>l</italic></sub> (shown in <xref ref-type="table" rid="T3">Table 3</xref>). In comparison, IRRL-o and DTG-RL show significantly lower <italic>R</italic><sub><italic>l</italic></sub> (34% and 36%, respectively). DTG-IRRL achieved superior landing deviation with <italic>D</italic><sub><italic>e</italic></sub> &#x0003D; 0.426, approximately 67% of DTG-RL&#x00027;s (<italic>D</italic><sub><italic>e</italic></sub> &#x0003D; 0.63) and half of IRRL-o&#x00027;s (<italic>D</italic><sub><italic>e</italic></sub> &#x0003D; 0.94), illustrated in <xref ref-type="fig" rid="F5">Figure 5</xref>. Notably, DTG-IRRL&#x00027;s landing points are tightly clustered within the target zone, whereas DTG-RL produces scattered distributions, and IRRL-o misses the target in 66% of cases. IRRL-o performs poorly in accommodating shuttlecock deviation due to the absence of initial reference trajectory guidance. 
DTG-RL&#x00027;s lack of imitation learning results in slower convergence and potential convergence to suboptimal policies because of the non-convex reward function &#x02013; such as the robotic arm adopting unnatural motion patterns like moving quickly to a vertical position, remaining stationary, and swinging just before the ball&#x00027;s arrival.</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x02225;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mo>&#x02225;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Comparative analysis reveals two primary advantages of DTG-IRRL: (1) Leveraging the generated trajectory, the IRRL strategy efficiently achieves faster convergence by exploiting the unimodal nature of the imitation reward function in the parameter space; (2) utilizing the initial arm reference trajectory generated by DTG, DTG-IRRL dynamically adjusts the initial reference trajectory based on the prediction results, thereby enhancing the hitting rate and landing accuracy.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Results of badminton striking tests.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Target area (m)</bold></th>
<th valign="top" align="left"><bold>Robotic arm</bold></th>
<th valign="top" align="left"><bold>Approach</bold></th>
<th valign="top" align="center"><bold><italic>R</italic><sub><italic>h</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>R</italic><sub><italic>l</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>D</italic><sub><italic>e</italic></sub> (m)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="3"><italic>x</italic> &#x02208; [&#x02212;0.8, 0.8] <break/><italic>y</italic> &#x02208; [3.5, 6.0]</td>
<td valign="top" align="left" rowspan="3"><bold>KirinArm</bold></td>
<td valign="top" align="left"><bold>DTG-IRRL</bold></td>
<td valign="top" align="center"><bold>100%</bold></td>
<td valign="top" align="center"><bold>80%</bold></td>
<td valign="top" align="center"><bold>0.426</bold></td>
</tr>
 <tr>
<td valign="top" align="left">DTG-RL</td>
<td valign="top" align="center">88%</td>
<td valign="top" align="center">36%</td>
<td valign="top" align="center">0.63</td>
</tr>
 <tr>
<td valign="top" align="left">IRRL-o</td>
<td valign="top" align="center">92%</td>
<td valign="top" align="center">34%</td>
<td valign="top" align="center">0.94</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2"><italic>x</italic> &#x02208; [&#x02212;0.8, 0.8] <break/><italic>y</italic> &#x02208; [3.5, 6.0]</td>
<td valign="top" align="left"><bold>KirinArm</bold></td>
<td valign="top" align="left"><bold>DTG-IRRL</bold></td>
<td valign="top" align="center"><bold>100%</bold></td>
<td valign="top" align="center"><bold>80%</bold></td>
<td valign="top" align="center"><bold>0.426</bold></td>
</tr>
 <tr>
<td valign="top" align="left">UR5</td>
<td valign="top" align="left">DTG-IRRL</td>
<td valign="top" align="center">94%</td>
<td valign="top" align="center">58%</td>
<td valign="top" align="center">1.17</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2"><italic>x</italic> &#x02208; [&#x02212;0.8, 0.8] <break/><italic>y</italic> &#x02208; [5.0, 8.0]</td>
<td valign="top" align="left"><bold>KirinArm</bold></td>
<td valign="top" align="left"><bold>DTG-IRRL</bold></td>
<td valign="top" align="center"><bold>96%</bold></td>
<td valign="top" align="center"><bold>72%</bold></td>
<td valign="top" align="center"><bold>0.59</bold></td>
</tr>
 <tr>
<td valign="top" align="left">UR5</td>
<td valign="top" align="left">DTG-IRRL</td>
<td valign="top" align="center">90%</td>
<td valign="top" align="center">0.0%</td>
<td valign="top" align="center">2.75</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best-performing results obtained using the DTG-IRRL learning framework and the KirinArm robotic arm.</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>The shuttlecock landing positions across three frameworks: DTG-IRRL (red circle), DTG-RL (blue circle), and IRRL (green circle). The light red, blue, and green shaded areas represent the 90% confidence intervals for landing positions, while the pentagram is the average position of the landing point, and the light gray box denotes the target landing area.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0005.tif">
<alt-text>Scatter plot showing data points and ellipses representing position distributions with various algorithms. Green, blue, and red points mark different methodologies with stars indicating mean positions. Key includes IRRL, DTG-RL, DTG-IRRL, and target positions. X-axis is X Position (meters); Y-axis is Y Position (meters).</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>3.4 Sparse rewards analysis on hyperplane</title>
<p>For high-dimensional nonlinear multi-objective problems, the policy is sensitive to the distribution of the reward function and the initial solution settings. In racket sports, the sparse reward and the high-dimensional network parameter space complicate policy convergence analysis. Therefore, inspired by <xref ref-type="bibr" rid="B18">Jin et al. (2022)</xref>, we define a special parameter space hyperplane to analyze the reward function distribution:</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>l</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:msup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>f</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:msup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>m</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:msup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mstyle></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003B3;</mml:mi><mml:mo>&#x02265;</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x00398;<sup><bold>f</bold></sup>, &#x00398;<sup><bold>m</bold></sup>, &#x00398;<sup><bold>sp</bold></sup> are the network parameters of the controllers <bold>U</bold>(<bold>x</bold>; &#x00398;<sup><bold>f</bold></sup>), <bold>U</bold>(<bold>x</bold>; &#x00398;<sup><bold>m</bold></sup>), <bold>U</bold>(<bold>x</bold>; &#x00398;<sup><bold>sp</bold></sup>) that are trained using the reward functions <italic>r</italic><sub><italic>t</italic></sub>, <italic>r</italic><sup><italic>m</italic></sup>, and <italic>r</italic><sup><italic>sp</italic></sup> in <xref ref-type="disp-formula" rid="E5">Equations 5</xref>, <xref ref-type="disp-formula" rid="E6">6</xref>, respectively, and <bold>x</bold> is the input vector. The surface plot of the cumulative reward on the hyperplane can be represented as a ternary plot with <italic>&#x003C9;</italic> &#x0003D; [0.4, 0.4, 0.1, 0.1] (<xref ref-type="fig" rid="F6">Figure 6A</xref>), where <bold>&#x003B7;</bold> &#x0003D; (&#x003B1;, &#x003B2;, &#x003B3;) represents the triangle coordinates of &#x00398;<sup><bold>l</bold></sup>. The reward for each controller is averaged over 50 simulation trials.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>The cumulative reward surfaces over the characteristic hyperplane based on &#x00398;<sup><italic>f</italic></sup>, &#x00398;<sup><italic>m</italic></sup>, &#x00398;<sup><italic>sp</italic></sup>. Colored lines within the plane represent contour lines. <bold>(A)</bold> Total reward <italic>r</italic><sub><italic>t</italic></sub>; <bold>(B)</bold> imitation reward <italic>r</italic><sup><italic>m</italic></sup>; <bold>(C)</bold> sparse reward <italic>r</italic><sup><italic>sp</italic></sup>; <bold>(D)</bold> torque reward <italic>r</italic><sup>&#x003C4;</sup>; <bold>(E)</bold> collision reward <italic>r</italic><sup><italic>c</italic></sup>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0006.tif">
<alt-text>Five 3D graphs labeled A to E depict various reward categories for robotic actions, ranging from imitation to collision. Each graph shows a cumulative reward spectrum from 300 to 550, transitioning from dark to light colors. Graph (A) shows the total reward, (B) highlights imitation, (C) shows strike, (D) shows torque, and (E) depicts collision. Below each graph, a three-node mechanism is illustrated, indicating different action parameters.</alt-text>
</graphic>
</fig>
<p>Results in <xref ref-type="fig" rid="F6">Figure 6C</xref> indicate that the sparse reward surface <inline-formula><mml:math id="M21"><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> exhibits multiple local optima on the hyperplane, with a global maximum around &#x00398;<sup><bold>f</bold></sup>, so that a policy initialized around &#x00398;<sup><bold>sp</bold></sup> will converge to the suboptimal solution &#x00398;<sup><bold>sp</bold></sup>. Furthermore, the gradients of the reward surface are low in regions distant from the three special controllers (black area in <xref ref-type="fig" rid="F6">Figure 6C</xref>), severely hindering convergence speed. In contrast, the torque and collision reward surfaces (<xref ref-type="fig" rid="F6">Figures 6D</xref>, <xref ref-type="fig" rid="F6">E</xref>) both have a single maximum. Moreover, the imitation reward surface <inline-formula><mml:math id="M22"><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> (<xref ref-type="fig" rid="F6">Figure 6B</xref>) features a single maximum near &#x00398;<sup><bold>m</bold></sup>, enabling fast convergence from any initial <bold>U</bold>(<bold>x</bold>; &#x00398;). Notably, both the sparse and total reward (<xref ref-type="fig" rid="F6">Figure 6A</xref>) surfaces exhibit ascending gradients from &#x00398;<sup><bold>m</bold></sup> to &#x00398;<sup><bold>f</bold></sup>, suggesting that initializations around &#x00398;<sup><bold>m</bold></sup> can effectively guide convergence to the optimum &#x00398;<sup><bold>f</bold></sup>. 
Leveraging these characteristics, the DTG-IRRL firstly guides the policy to quickly converge to <bold>U</bold>(<bold>x</bold>; &#x00398;<sup><bold>m</bold></sup>) (white arrow in <xref ref-type="fig" rid="F6">Figure 6B</xref>) using only <italic>r</italic><sup><bold>m</bold></sup>. Then, utilizing &#x00398;<sup><bold>m</bold></sup> as the initial parameters, the policy can converge to the optimal controller <bold>U</bold>(<bold>x</bold>; &#x00398;<sup><bold>f</bold></sup>) with the total reward <italic>r</italic><sub><italic>t</italic></sub> (white arrow in <xref ref-type="fig" rid="F6">Figure 6A</xref>). Empirical results demonstrate that DTG-IRRL can effectively mitigate the challenges of convergence to local optima and slow convergence due to sparse reward.</p>
<p>Moreover, to evaluate the impact of the framework&#x00027;s dynamic reference trajectory adjustment on policy performance, we also define a parameter space hyperplane based on <inline-formula><mml:math id="M23"><mml:mstyle mathvariant="bold"><mml:msup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup></mml:mstyle></mml:math></inline-formula>, where <inline-formula><mml:math id="M24"><mml:mstyle mathvariant="bold"><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup></mml:mstyle></mml:math></inline-formula> denote the parameters of controllers <inline-formula><mml:math id="M25"><mml:mstyle mathvariant="bold"><mml:mtext>U</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle><mml:mo>;</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>U</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle><mml:mo>;</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>m</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, trained via DTG-IRRL and IRRL-o methods with reward <italic>r</italic><sup><bold>m</bold></sup>, respectively. As illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>, both the sparse and total reward surfaces reveal that the gradient from <inline-formula><mml:math id="M26"><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>o</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>m</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup></mml:math></inline-formula> to optima &#x00398;<sup><bold>f</bold></sup> is notably low in most regions, with significant gradient only near &#x00398;<sup><bold>f</bold></sup>. This indicates that IRRL-o converges inefficiently unless initialized close to &#x00398;<sup><bold>f</bold></sup>. In contrast, DTG-IRRL demonstrates a pronounced gradient from <inline-formula><mml:math id="M27"><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>f</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>m</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup></mml:math></inline-formula> to &#x00398;<sup><bold>f</bold></sup>, suggesting that policies initialized around <inline-formula><mml:math id="M28"><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>f</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>m</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup></mml:math></inline-formula> can effectively and quickly converge to &#x00398;<sup><bold>f</bold></sup>. 
These results confirm that DTG&#x00027;s dynamic trajectory adjustment capability enables the policy to converge to the optimal solution &#x00398;<sup><bold>f</bold></sup> more efficiently and quickly.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>The cumulative reward surfaces over the characteristic hyperplane based on <inline-formula><mml:math id="M29"><mml:msup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mo>&#x00398;</mml:mo></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>. Colored lines within the plane represent contour lines. <bold>(A)</bold> Total reward <italic>r</italic><sub><italic>t</italic></sub>; <bold>(B)</bold> sparse reward <italic>r</italic><sup><bold>sp</bold></sup>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0007.tif">
<alt-text>Graphical comparison of cumulative rewards. Panel A depicts Total r with a layered, colorful 3D surface chart showing rewards increasing from dark to light hues. Panel B shows Strike r with a similar visual structure. Both charts include geometric diagrams with labels, f, and mf at the base. A color bar indicates reward levels from 440 to 560.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>3.5 Reward weights sensitivity and policy stability analysis</title>
<sec>
<title>3.5.1 Reward weights sensitivity analysis</title>
<p>To further verify the effectiveness of the DTG-IRRL in guiding faster policy convergence and its sensitivity to different weights, we conducted an ablation study to evaluate the impact of the different weights for the imitation and sparse reward components during the relaxation stage of training. We tested four different weight configurations, and the reward curves are presented in <xref ref-type="fig" rid="F8">Figure 8</xref>. The results demonstrate that the policy&#x00027;s convergence is not sensitive to different weights. While the final cumulative reward values differ due to the scaling factor of the sparse reward weights, all four policies exhibit similar convergence profiles, stabilizing after approximately 20,000 training iterations.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>The reward curves of the DTG-IRRL framework under four different reward weight configurations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0008.tif">
<alt-text>Line graph showing rewards over episodes, divided into Imitation Stage and Relaxation Stage. Four lines represent different parameter settings: blue, red, green, and purple. Each shows a reward increase around 20,000 episodes. Legend lists parameter values for each line.</alt-text>
</graphic>
</fig>
<p>To quantitatively evaluate the effect of different reward weights on landing accuracy, each policy was evaluated over 50 hitting trials, with the results summarized in <xref ref-type="table" rid="T4">Table 4</xref>. The results show that all four policies achieve consistently high performance: the hitting rate (<italic>R</italic><sub><italic>h</italic></sub>) exceeds 90%, landing accuracy (<italic>R</italic><sub><italic>l</italic></sub>) is approximately 80%, and the landing deviation (<italic>D</italic><sub><italic>e</italic></sub>) is around 0.4 m. This demonstrates that the final policy&#x00027;s performance is robust to variations in the reward function weights. We attribute this stability to our DTG-IRRL framework, which effectively guides the policy to reach the optimal solution, making it less sensitive to minor tuning of reward components.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Test results of different reward weights.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Reward weights</bold></th>
<th valign="top" align="center"><bold><italic>R</italic><sub><italic>h</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>R</italic><sub><italic>l</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>D</italic><sub><italic>e</italic></sub>(<italic>m</italic>)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">&#x003C9; &#x0003D; [0.2, 0.6, 0.1, 0.1]</td>
<td valign="top" align="center">92%</td>
<td valign="top" align="center">74%</td>
<td valign="top" align="center">0.37</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C9; &#x0003D; [0.4, 0.4, 0.1, 0.1]</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">80%</td>
<td valign="top" align="center">0.43</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C9; &#x0003D; [0.5, 0.3, 0.1, 0.1]</td>
<td valign="top" align="center">98%</td>
<td valign="top" align="center">74%</td>
<td valign="top" align="center">0.36</td>
</tr>
<tr>
<td valign="top" align="left">&#x003C9; &#x0003D; [0.6, 0.2, 0.1, 0.1]</td>
<td valign="top" align="center">94%</td>
<td valign="top" align="center">78%</td>
<td valign="top" align="center">0.40</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>3.5.2 Stability analysis</title>
<p>Stability determines the controller&#x00027;s ability to resist external perturbations. However, theoretical stability analysis is often difficult for systems controlled by complex, high-dimensional neural network policies. Therefore, inspired by prior work [1], we conducted an empirical analysis to assess the system&#x00027;s stability under perturbations. We introduced varying levels of disturbance (from 5% to 45%) to the initial position and velocity of the shuttlecock while ensuring the initial states remained within the training distribution (<xref ref-type="table" rid="T5">Table 5</xref>). We tested the controller&#x00027;s landing error (defined as the Euclidean distance between the landing position and the target center) under different disturbances. The box plot of the landing error in <xref ref-type="fig" rid="F9">Figure 9</xref> demonstrates that the landing errors of most shuttlecocks under different disturbances are small. When the disturbance is above 40%, the variance of the landing error increases slightly. The results, shown in <xref ref-type="table" rid="T5">Table 5</xref>, illustrate remarkable stability. Despite significant perturbations, the hitting rate (<italic>R</italic><sub><italic>h</italic></sub>) remains at 100% and landing accuracy (<italic>R</italic><sub><italic>l</italic></sub>) stays at or above 94%. Crucially, the landing deviation (<italic>D</italic><sub><italic>e</italic></sub>) shows only a slight increase of approximately 2.6%, which is almost negligible. Within the operational range, the closed-loop system maintains consistent performance despite increasing disturbances, demonstrating inherent stability.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Test results under different disturbance levels.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Disturbance</bold></th>
<th valign="top" align="center"><bold><italic>R</italic><sub><italic>h</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>R</italic><sub><italic>l</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>D</italic><sub><italic>e</italic></sub>(<italic>m</italic>)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">5%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">94%</td>
<td valign="top" align="center">0.189</td>
</tr>
<tr>
<td valign="top" align="left">10%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">96%</td>
<td valign="top" align="center">0.183</td>
</tr>
<tr>
<td valign="top" align="left">15%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">0.192</td>
</tr>
<tr>
<td valign="top" align="left">20%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">96%</td>
<td valign="top" align="center">0.187</td>
</tr>
<tr>
<td valign="top" align="left">30%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">96%</td>
<td valign="top" align="center">0.184</td>
</tr>
<tr>
<td valign="top" align="left">40%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">98%</td>
<td valign="top" align="center">0.201</td>
</tr>
<tr>
<td valign="top" align="left">45%</td>
<td valign="top" align="center">100%</td>
<td valign="top" align="center">94%</td>
<td valign="top" align="center">0.194</td>
</tr></tbody>
</table>
</table-wrap>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>The landing error under different levels of disturbance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0009.tif">
<alt-text>Box plot chart showing landing error in meters across various disturbance levels from 5% to 45%. Each colored box represents a different disturbance percentage, showing the median, quartiles, and range of errors.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec>
<title>3.6 Generalization of DTG-IRRL framework</title>
<p>To validate the generalizability of the framework, we deployed it on a UR5 robotic arm (<xref ref-type="bibr" rid="B30">UR5e, 2025</xref>) using identical hyperparameter configurations. Moreover, to analyze the influence of the robotic arm&#x00027;s dynamic performance on shuttlecock speed and flight distance, we trained two controllers&#x02014;one for the UR5 and another for the KirinArm&#x02014;using the DTG-IRRL framework with identical training parameters, varying only the target landing areas. A multi-rally experiment then compared their performance across different landing distances.</p>
<p>The results shown in <xref ref-type="table" rid="T3">Table 3</xref> demonstrate successful generalization across robotic platforms in 50 trials, with both KirinArm and UR5 achieving high <italic>R</italic><sub><italic>h</italic></sub> (100% and 94%, respectively). <italic>R</italic><sub><italic>l</italic></sub> was 80% (KirinArm) and 58% (UR5). Quantitative analysis shows a higher <italic>D</italic><sub><italic>e</italic></sub> value for UR5 (1.17 m) compared to KirinArm (0.426 m), indicating relatively lower accuracy. Crucially, these results were achieved without any adjustment to the reward function or its weights, demonstrating cross-platform generalization capability.</p>
<p>Experimental results in <xref ref-type="table" rid="T3">Table 3</xref> reveal significant performance differences between KirinArm and UR5 across varying target distances. For close-range targets, both KirinArm and UR5 achieve <italic>R</italic><sub><italic>h</italic></sub> exceeding 90% and <italic>R</italic><sub><italic>l</italic></sub> exceeding 50%. Notably, KirinArm demonstrated superior accuracy, exhibiting a 1.5-fold higher <italic>R</italic><sub><italic>l</italic></sub> and a 50% reduction in <italic>D</italic><sub><italic>e</italic></sub> compared to the UR5. At farther ranges, while UR5 maintained an <italic>R</italic><sub><italic>h</italic></sub> of 90%, it failed to land the shuttlecock within the target area (<italic>R</italic><sub><italic>l</italic></sub>= 0%). In contrast, KirinArm sustains high performance (<italic>R</italic><sub><italic>h</italic></sub>= 96%, <italic>R</italic><sub><italic>l</italic></sub>= 72%). <xref ref-type="fig" rid="F10">Figure 10</xref> shows landing distributions from 50 trials, highlighting KirinArm&#x00027;s consistent precision with an average position nearer to the target center and a substantially lower <italic>D</italic><sub><italic>e</italic></sub> (0.59 vs UR5&#x00027;s 2.75, 21.5% of UR5&#x00027;s).</p>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>The shuttlecock landing positions of KirinArm (red circle) and UR5 (blue circle). The light red and blue shaded areas represent the 90% confidence intervals for landing positions, while the pentagram is the average position of the landing point; the light gray box denotes the target landing area.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0010.tif">
<alt-text>Scatter plot showing positions of two robotic arms, UR5 and KirinArm, on an X-Y axis. UR5 positions are marked by blue circles, and KirinArm positions by red circles. Mean positions are indicated with a blue star for UR5 and a red star for KirinArm. A black star denotes the target mean position. Ellipses represent position clusters with a gray rectangle indicating the landing target area. The plot measures X position from &#x02212;1.0 to 1.0 meters and Y position from 3 to 8 meters.</alt-text>
</graphic>
</fig>
<p>To investigate the reasons for the performance differences, we conducted a comparative analysis of both robotic arms&#x00027; velocity and acceleration capabilities (<xref ref-type="fig" rid="F11">Figure 11</xref>). KirinArm demonstrates superior terminal speed and acceleration, approximately twice and three times higher than those of UR5, respectively, explaining UR5&#x00027;s limitations in dynamic tasks. Torque-speed curves in <xref ref-type="fig" rid="F11">Figures 11C</xref>, <xref ref-type="fig" rid="F11">D</xref> reveal that UR5&#x00027;s elbow joint operates at its motor capacity limit (gray dashed lines are the motor constraints, which are modeled as a piecewise linear function to approximate the motor&#x00027;s external characteristic curve; <xref ref-type="bibr" rid="B34">Yuan et al., 2025</xref>), while KirinArm operates well within its limits. The limitation of UR5 is attributed to its higher mass, necessitating greater torque during rapid movements, and its high gear reduction ratio, limiting maximum joint speeds. The results demonstrate that dynamic capability is critical for high-speed badminton tasks.</p>
<fig position="float" id="F11">
<label>Figure 11</label>
<caption><p><bold>(A)</bold> The racket speed of KirinArm and UR5. <bold>(B)</bold> The racket acceleration of KirinArm and UR5. <bold>(C, D)</bold> Elbow torque-speed curve of the KirinArm and UR5, where the gray dashed line shows the motor constraint.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0011.tif">
<alt-text>Four graphs display data comparisons between &#x0201C;Our Robot&#x0201D; and &#x0201C;UR5&#x0201D;. Graph A shows end velocity over time, with &#x0201C;Our Robot&#x0201D; peaking higher. Graph B presents end acceleration, where &#x0201C;Our Robot&#x0201D; again exhibits a higher peak. Graph C illustrates torque versus angular velocity for &#x0201C;Our Robot Elbow&#x0201D;, contained within dashed boundaries. Graph D shows the same for &#x0201C;UR5 Elbow&#x0201D;, also within dashed boundaries.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<title>3.7 Hardware experiment results</title>
<p>Implemented on the hardware system, the framework achieves zero-shot transfer. Across 60 trials with randomized initial states of the shuttlecock (generated by a pan-tilt shuttlecock launcher), the DTG-IRRL controller achieved a 90% hitting rate (54/60) and a 70% landing accuracy (42/60). Missed strikes (10%) occurred only when the shuttlecock&#x00027;s altitude exceeded the arm&#x00027;s workspace. The landing positions showed a deviation of 0.2 m (<xref ref-type="fig" rid="F12">Figure 12</xref>), confirming both the controller&#x00027;s spatial consistency and its practical applicability for badminton tasks.</p>
<fig position="float" id="F12">
<label>Figure 12</label>
<caption><p>The shuttlecock landing points of 60 trials. The light gray area denotes the target area, while varying shades of blue represent the probability density of landings within that region.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0012.tif">
<alt-text>Contour plot showing probability density of positions with a color gradient from light to dark blue. White markers represent data points, mostly clustered within the high-density area at the top. A dashed rectangle labeled &#x0201C;Landing Target&#x0201D; is positioned nearby. The X and Y axes are labeled Position in meters, with values ranging from negative 1 to 1. Probability density is indicated by the color bar on the right.</alt-text>
</graphic>
</fig>
<p>Human-robot interaction tests with three novice players (<xref ref-type="fig" rid="F13">Figure 13</xref>) demonstrated sustained rally capability (an average of six consecutive hits), with physical implementation achieving simulation-equivalent performance while confirming real-world robustness. The experiment video is available in the <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref> and on the project website (<ext-link ext-link-type="uri" xlink:href="https://stylite-y.github.io/DTG-IRRL-For-Badminton/">https://stylite-y.github.io/DTG-IRRL-For-Badminton/</ext-link>).</p>
<fig position="float" id="F13">
<label>Figure 13</label>
<caption><p><bold>(A)</bold> Robotic arm successfully intercepting shuttlecocks from varied launch angles during robot-launcher interaction, and <bold>(B)</bold> multi-rally human-robot interaction trials. The gradient arrows depict the shuttlecock&#x00027;s trajectory, while the yellow segments represent the robotic arm&#x00027;s posture.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-19-1649870-g0013.tif">
<alt-text>Two-panel image showing badminton shuttlecock trajectories and robots. Panel (a) illustrates a robot launching a shuttlecock three times, each with different colored arcs indicating varied trajectories. Panel (b) shows a person testing shuttlecock flight with a robotic arm and similar trajectory arcs.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="conclusions" id="s4">
<title>4 Conclusion</title>
<p>In this study, we propose a learning framework (DTG-IRRL) for robotic badminton to address the convergence difficulties in RL due to sparse rewards and the trajectory prediction challenges posed by non-linear dynamics. The framework achieves zero-shot transfer on a robot system, demonstrating a 90% hitting rate, a 70% landing accuracy, and enabling sustained multi-round human-robot rallies. Further analysis of the reward function on a special hyperplane demonstrates that DTG-IRRL can effectively mitigate the challenges of local optima and slow convergence due to sparse rewards. Comparative experiments with UR5 confirm the framework&#x00027;s cross-platform generalization capability and highlight the importance of high dynamic performance for high-speed tasks. While the proposed framework demonstrates promising results, its performance is constrained by the absence of a mobile platform and a limited repertoire of badminton techniques. Future studies will integrate a mobile platform and expand the stroke techniques (such as smash, drop shot, and net shot) to achieve human-level badminton performance.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="ethics-statement" id="s6">
<title>Ethics statement</title>
<p>Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>YY: Conceptualization, Writing &#x02013; review &#x00026; editing, Formal analysis, Writing &#x02013; original draft, Methodology, Visualization. YT: Writing &#x02013; review &#x00026; editing, Validation. SC: Writing &#x02013; review &#x00026; editing, Validation. YL: Validation, Writing &#x02013; review &#x00026; editing. YJ: Writing &#x02013; review &#x00026; editing, Conceptualization. HW: Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This study was supported by the Key R&#x00026;D Program of Zhejiang (2023C03001).</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fnbot.2025.1649870/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fnbot.2025.1649870/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Video_1.mp4" id="SM1" mimetype="video" mime-subtype="mp4"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Abeyruwan</surname> <given-names>S. W.</given-names></name> <name><surname>Graesser</surname> <given-names>L.</given-names></name> <name><surname>D&#x00027;Ambrosio</surname> <given-names>D. B.</given-names></name> <name><surname>Singh</surname> <given-names>A.</given-names></name> <name><surname>Shankar</surname> <given-names>A.</given-names></name> <name><surname>Bewley</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;i-Sim2Real: Reinforcement learning of robotic policies in tight human-robot interaction loops,&#x0201D;</article-title> in <source>Conference on Robot Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>212</fpage>&#x02013;<lpage>224</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Akrour</surname> <given-names>R.</given-names></name> <name><surname>Abdolmaleki</surname> <given-names>A.</given-names></name> <name><surname>Abdulsamad</surname> <given-names>H.</given-names></name> <name><surname>Peters</surname> <given-names>J.</given-names></name> <name><surname>Neumann</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>Model-free trajectory-based policy optimization with monotonic improvement</article-title>. <source>J. Mach. Learn. Res</source>. <volume>19</volume>, <fpage>1</fpage>&#x02013;<lpage>25</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>B&#x000FC;chler</surname> <given-names>D.</given-names></name> <name><surname>Guist</surname> <given-names>S.</given-names></name> <name><surname>Calandra</surname> <given-names>R.</given-names></name> <name><surname>Berenz</surname> <given-names>V.</given-names></name> <name><surname>Sch&#x000F6;lkopf</surname> <given-names>B.</given-names></name> <name><surname>Peters</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Learning to play table tennis from scratch using muscular robots</article-title>. <source>IEEE Trans. Robot</source>. <volume>38</volume>, <fpage>3850</fpage>&#x02013;<lpage>3860</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2022.3176207</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>L.</given-names></name> <name><surname>Paleja</surname> <given-names>R.</given-names></name> <name><surname>Gombolay</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Learning from suboptimal demonstration via self-supervised reward regression,&#x0201D;</article-title> in <source>Conference on Robot Learning</source> (<publisher-loc>New York</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1262</fpage>&#x02013;<lpage>1277</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cohen</surname> <given-names>C.</given-names></name> <name><surname>Clanet</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). <article-title>Physics of ball sports</article-title>. <source>Europhys News</source> <volume>47</volume>, <fpage>13</fpage>&#x02013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1051/epn/2016301</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cohen</surname> <given-names>C.</given-names></name> <name><surname>Darbois-Texier</surname> <given-names>B.</given-names></name> <name><surname>Dupeux</surname> <given-names>G.</given-names></name> <name><surname>Brunel</surname> <given-names>E.</given-names></name> <name><surname>Qu&#x000E9;r&#x000E9;</surname> <given-names>D.</given-names></name> <name><surname>Clanet</surname> <given-names>C.</given-names></name></person-group> (<year>2014</year>). <article-title>The aerodynamic wall</article-title>. <source>Proc. R. Soc. A: Math. Phys. Eng. Sci</source>. <volume>470</volume>:<fpage>20130497</fpage>. <pub-id pub-id-type="doi">10.1098/rspa.2013.0497</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>D&#x00027;Ambrosio</surname> <given-names>D. B.</given-names></name> <name><surname>Abelian</surname> <given-names>J.</given-names></name> <name><surname>Abeyruwan</surname> <given-names>S.</given-names></name> <name><surname>Ahn</surname> <given-names>M.</given-names></name> <name><surname>Bewley</surname> <given-names>A.</given-names></name> <name><surname>Boyd</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Robotic table tennis: a case study into a high speed learning system</article-title>. <source>arXiv</source> [preprint] arXiv:2309.03315. <pub-id pub-id-type="doi">10.15607/RSS.2023.XIX.006</pub-id></citation>
</ref>
<ref id="B8">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>D&#x00027;Ambrosio</surname> <given-names>D. B.</given-names></name> <name><surname>Abeyruwan</surname> <given-names>S. W.</given-names></name> <name><surname>Graesser</surname> <given-names>L.</given-names></name> <name><surname>Iscen</surname> <given-names>A.</given-names></name> <name><surname>Amor</surname> <given-names>H. B.</given-names></name> <name><surname>Bewley</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>&#x0201C;Achieving human-level competitive robot table tennis,&#x0201D;</article-title> in <source>Proceedings of the 7th Robot Learning Workshop: Towards Robots with Human-Level Abilities at the International Conference on Learning Representations (ICLR)</source> (<publisher-loc>Singapore</publisher-loc>).</citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>T.</given-names></name> <name><surname>Graesser</surname> <given-names>L.</given-names></name> <name><surname>Abeyruwan</surname> <given-names>S.</given-names></name> <name><surname>D&#x00027;Ambrosio</surname> <given-names>D. B.</given-names></name> <name><surname>Shankar</surname> <given-names>A.</given-names></name> <name><surname>Sermanet</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Learning high speed precision table tennis on a physical robot,&#x0201D;</article-title> in <source>2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</source> (<publisher-loc>Kyoto</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10780</fpage>&#x02013;<lpage>10787</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="web"><person-group person-group-type="author"><collab>FZMotion</collab></person-group> (<year>2025</year>). <source>FZMotion Capture System</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.lusterinc.com/FZMotion-Baidu/">https://www.lusterinc.com/FZMotion-Baidu/</ext-link> (Accessed March 18, 2025).</citation>
</ref>
<ref id="B11">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>W.</given-names></name> <name><surname>Graesser</surname> <given-names>L.</given-names></name> <name><surname>Choromanski</surname> <given-names>K.</given-names></name> <name><surname>Song</surname> <given-names>X.</given-names></name> <name><surname>Lazic</surname> <given-names>N.</given-names></name> <name><surname>Sanketi</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Robotic table tennis with model-free reinforcement learning,&#x0201D;</article-title> in <source>2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5556</fpage>&#x02013;<lpage>5563</lpage>.<pub-id pub-id-type="pmid">24756167</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Tebbe</surname> <given-names>J.</given-names></name> <name><surname>Zell</surname> <given-names>A.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;A model-free approach to stroke learning for robotic table tennis,&#x0201D;</article-title> in <source>2022 International Joint Conference on Neural Networks (IJCNN)</source> (<publisher-loc>Padua</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Zell</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Optimal stroke learning with policy gradient approach for robotic table tennis</article-title>. <source>Appl. Intellig</source>. <volume>53</volume>, <fpage>13309</fpage>&#x02013;<lpage>13322</lpage>. <pub-id pub-id-type="doi">10.1007/s10489-022-04131-w</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hattori</surname> <given-names>M.</given-names></name> <name><surname>Kojima</surname> <given-names>K.</given-names></name> <name><surname>Noda</surname> <given-names>S.</given-names></name> <name><surname>Sugai</surname> <given-names>F.</given-names></name> <name><surname>Kakiuchi</surname> <given-names>Y.</given-names></name> <name><surname>Okada</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Fast tennis swing motion by ball trajectory prediction and joint trajectory modification in standalone humanoid robot real-time system,&#x0201D;</article-title> in <source>2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3612</fpage>&#x02013;<lpage>3619</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hsiao</surname> <given-names>T.</given-names></name> <name><surname>Kao</surname> <given-names>H.-C.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Decision making of ball-batting robots based on deep reinforcement learning,&#x0201D;</article-title> in <source>2023 American Control Conference (ACC)</source> (<publisher-loc>San Diego, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>782</fpage>&#x02013;<lpage>787</lpage>.</citation>
</ref>
<ref id="B16">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>B&#x000FC;chler</surname> <given-names>D.</given-names></name> <name><surname>Ko&#x000E7;</surname> <given-names>O.</given-names></name> <name><surname>Sch&#x000F6;lkopf</surname> <given-names>B.</given-names></name> <name><surname>Peters</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Jointly learning trajectory generation and hitting point prediction in robot table tennis,&#x0201D;</article-title> in <source>2016 IEEE-RAS 16th International Conference on Humanoid Robots (Humanoids)</source> (<publisher-loc>Cancun</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>650</fpage>&#x02013;<lpage>655</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hwangbo</surname> <given-names>J.</given-names></name> <name><surname>Lee</surname> <given-names>J.</given-names></name> <name><surname>Dosovitskiy</surname> <given-names>A.</given-names></name> <name><surname>Bellicoso</surname> <given-names>D.</given-names></name> <name><surname>Tsounis</surname> <given-names>V.</given-names></name> <name><surname>Koltun</surname> <given-names>V.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Learning agile and dynamic motor skills for legged robots</article-title>. <source>Sci. Robot</source>. <volume>4</volume>:<fpage>eaau5872</fpage>. <pub-id pub-id-type="doi">10.1126/scirobotics.aau5872</pub-id><pub-id pub-id-type="pmid">33137755</pub-id></citation></ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Shao</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>W.</given-names></name></person-group> (<year>2022</year>). <article-title>High-speed quadrupedal locomotion by imitation-relaxation reinforcement learning</article-title>. <source>Nat. Mach. Intellig</source>. <volume>4</volume>, <fpage>1198</fpage>&#x02013;<lpage>1208</lpage>. <pub-id pub-id-type="doi">10.1038/s42256-022-00576-3</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mori</surname> <given-names>S.</given-names></name> <name><surname>Tanaka</surname> <given-names>K.</given-names></name> <name><surname>Nishikawa</surname> <given-names>S.</given-names></name> <name><surname>Niiyama</surname> <given-names>R.</given-names></name> <name><surname>Kuniyoshi</surname> <given-names>Y.</given-names></name></person-group> (<year>2018</year>). <article-title>High-speed and lightweight humanoid robot arm for a skillful badminton robot</article-title>. <source>IEEE Robot. Automat. Letters</source> <volume>3</volume>, <fpage>1727</fpage>&#x02013;<lpage>1734</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2018.2803207</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mori</surname> <given-names>S.</given-names></name> <name><surname>Tanaka</surname> <given-names>K.</given-names></name> <name><surname>Nishikawa</surname> <given-names>S.</given-names></name> <name><surname>Niiyama</surname> <given-names>R.</given-names></name> <name><surname>Kuniyoshi</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>High-speed humanoid robot arm for badminton using pneumatic-electric hybrid actuators</article-title>. <source>IEEE Robot. Automat. Letters</source> <volume>4</volume>, <fpage>3601</fpage>&#x02013;<lpage>3608</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2019.2928778</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Muelling</surname> <given-names>K.</given-names></name> <name><surname>Kober</surname> <given-names>J.</given-names></name> <name><surname>Peters</surname> <given-names>J.</given-names></name></person-group> (<year>2010</year>). <article-title>&#x0201C;Learning table tennis with a mixture of motor primitives,&#x0201D;</article-title> in <source>2010 10th IEEE-RAS International Conference on Humanoid Robots</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>411</fpage>&#x02013;<lpage>416</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>M&#x000FC;ller</surname> <given-names>M.</given-names></name> <name><surname>Lupashin</surname> <given-names>S.</given-names></name> <name><surname>D&#x00027;Andrea</surname> <given-names>R.</given-names></name></person-group> (<year>2011</year>). <article-title>&#x0201C;Quadrocopter ball juggling,&#x0201D;</article-title> in <source>2011 IEEE/RSJ international conference on Intelligent Robots and Systems</source> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5113</fpage>&#x02013;<lpage>5120</lpage>.</citation>
</ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>M&#x000FC;lling</surname> <given-names>K.</given-names></name> <name><surname>Kober</surname> <given-names>J.</given-names></name> <name><surname>Peters</surname> <given-names>J.</given-names></name></person-group> (<year>2010</year>). <article-title>&#x0201C;Simulating human table tennis with a biomimetic robot setup,&#x0201D;</article-title> in <source>From Animals to Animats 11: 11th International Conference on Simulation of Adaptive Behavior, SAB 2010</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>273</fpage>&#x02013;<lpage>282</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Nair</surname> <given-names>A.</given-names></name> <name><surname>McGrew</surname> <given-names>B.</given-names></name> <name><surname>Andrychowicz</surname> <given-names>M.</given-names></name> <name><surname>Zaremba</surname> <given-names>W.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Overcoming exploration in reinforcement learning with demonstrations,&#x0201D;</article-title> in <source>2018 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-loc>Brisbane, QLD</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6292</fpage>&#x02013;<lpage>6299</lpage>.</citation>
</ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rambely</surname> <given-names>A. S.</given-names></name> <name><surname>Osman</surname> <given-names>N. A. A.</given-names></name></person-group> (<year>2005</year>). <article-title>&#x0201C;The contribution of upper limb joints in the development of racket velocity in the badminton smash,&#x0201D;</article-title> in <source>23rd International Symposium on Biomechanics in Sports</source> (<publisher-loc>Beijing</publisher-loc>).</citation>
</ref>
<ref id="B26">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Records</surname> <given-names>G. W.</given-names></name></person-group> (<year>2014</year>). <source>Guinness World Records 2015</source>. <publisher-loc>London</publisher-loc>: <publisher-name>Guinness World Records</publisher-name>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schulman</surname> <given-names>J.</given-names></name> <name><surname>Wolski</surname> <given-names>F.</given-names></name> <name><surname>Dhariwal</surname> <given-names>P.</given-names></name> <name><surname>Radford</surname> <given-names>A.</given-names></name> <name><surname>Klimov</surname> <given-names>O.</given-names></name></person-group> (<year>2017</year>). <article-title>Proximal policy optimization algorithms</article-title>. <source>arXiv</source> [preprint] arXiv:1707.06347. <pub-id pub-id-type="doi">10.48550/arXiv.1707.06347</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tebbe</surname> <given-names>J.</given-names></name> <name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Sastre-Rienietz</surname> <given-names>M.</given-names></name> <name><surname>Zell</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;A table tennis robot system using an industrial kuka robot arm,&#x0201D;</article-title> in <source>Pattern Recognition: 40th German Conference, GCPR 2018</source> (<publisher-loc>Stuttgart</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>33</fpage>&#x02013;<lpage>45</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tebbe</surname> <given-names>J.</given-names></name> <name><surname>Krauch</surname> <given-names>L.</given-names></name> <name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Zell</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Sample-efficient reinforcement learning in robotic table tennis,&#x0201D;</article-title> in <source>2021 IEEE International Conference on Robotics and Automation (ICRA)</source> (<publisher-loc>Xi&#x00027;an</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4171</fpage>&#x02013;<lpage>4178</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="web"><person-group person-group-type="author"><collab>UR5e</collab></person-group> (<year>2025</year>). <source>Universal Robots</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.universal-robots.com/products/ur5-robot/">https://www.universal-robots.com/products/ur5-robot/</ext-link> (Accessed March 18, 2025).</citation>
</ref>
<ref id="B31">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Waghmare</surname> <given-names>G.</given-names></name> <name><surname>Borkar</surname> <given-names>S.</given-names></name> <name><surname>Saley</surname> <given-names>V.</given-names></name> <name><surname>Chinchore</surname> <given-names>H.</given-names></name> <name><surname>Wabale</surname> <given-names>S.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Badminton shuttlecock detection and prediction of trajectory using multiple 2 dimensional scanners,&#x0201D;</article-title> in <source>2016 IEEE First International Conference on Control, Measurement and Instrumentation (CMI)</source> (<publisher-loc>Kolkata</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>234</fpage>&#x02013;<lpage>238</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="thesis"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>C.-A.</given-names></name></person-group> (<year>2022</year>). <source>Shuttlecock Trajectories Modeling and Forecasting</source> (Master&#x00027;s thesis). National Yang Ming Chiao Tung University, Hsinchu, Taiwan.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Tu</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Zheng</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>K.</given-names></name> <name><surname>Lu</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Bat planner: aggressive flying ball player</article-title>. <source>IEEE Robot. Automat. Letters</source> <volume>8</volume>, <fpage>5307</fpage>&#x02013;<lpage>5314</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2023.3293355</pub-id></citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yuan</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Jiang</surname> <given-names>L.</given-names></name> <name><surname>Jin</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name></person-group> (<year>2025</year>). <article-title>Optimal design of high-dynamic robotic arm based on angular momentum maximum</article-title>. <source>IEEE Robot. Automat. Letters</source> <volume>10</volume>, <fpage>3542</fpage>&#x02013;<lpage>3549</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2025.3541910</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zaidi</surname> <given-names>Z.</given-names></name> <name><surname>Martin</surname> <given-names>D.</given-names></name> <name><surname>Belles</surname> <given-names>N.</given-names></name> <name><surname>Zakharov</surname> <given-names>V.</given-names></name> <name><surname>Krishna</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>K. M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Athletic mobile manipulator system for robotic wheelchair tennis</article-title>. <source>IEEE Robot. Automat. Letters</source> <volume>8</volume>, <fpage>2245</fpage>&#x02013;<lpage>2252</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2023.3249401</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Xiong</surname> <given-names>R.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Optimal state estimation of spinning ping-pong ball using continuous motion model</article-title>. <source>IEEE Trans. Instrum. Meas</source>. <volume>64</volume>, <fpage>2208</fpage>&#x02013;<lpage>2216</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2014.2386951</pub-id></citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhi</surname> <given-names>J.</given-names></name> <name><surname>Luo</surname> <given-names>D.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>A novel method of shuttlecock trajectory tracking and prediction for a badminton robot</article-title>. <source>Robotica</source> <volume>40</volume>, <fpage>1682</fpage>&#x02013;<lpage>1694</lpage>. <pub-id pub-id-type="doi">10.1017/S0263574721001053</pub-id></citation>
</ref>
</ref-list>
</back>
</article>