<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="brief-report" dtd-version="1.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1788395</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2026.1788395</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Brief Research Report</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Adaptive multi-mode locomotion for bipedal wheel-legged robots via sparse mixture-of-experts deep reinforcement learning</article-title>
<alt-title alt-title-type="left-running-head">He et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2026.1788395">10.3389/frobt.2026.1788395</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>He</surname>
<given-names>Pan</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3349658"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhao</surname>
<given-names>Zeang</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3350817"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Duan</surname>
<given-names>Shengyu</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Panding</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lei</surname>
<given-names>Hongshuai</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>Institute of Advanced Structure Technology, Beijing Institute of Technology</institution>, <city>Beijing</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Zeang Zhao, <email xlink:href="mailto:zza@pku.edu.cn">zza@pku.edu.cn</email>; Shengyu Duan, <email xlink:href="mailto:shengyu_duan@126.com">shengyu_duan@126.com</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-25">
<day>25</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1788395</elocation-id>
<history>
<date date-type="received">
<day>15</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>11</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 He, Zhao, Duan, Wang and Lei.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>He, Zhao, Duan, Wang and Lei</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-25">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>The bipedal wheel-legged robot combines the high energy efficiency of wheeled movement with the terrain adaptability of legged locomotion. However, achieving a smooth transition between these two heterogeneous motion modes within a unified control framework remains challenging. This study proposes a reinforcement learning control framework that integrates the Mixture of Experts (MoE) architecture. This approach employs a &#x201c;divide and conquer&#x201d; strategy by introducing a dynamic gating network and a Top-K sparse activation mechanism, which automatically allocates different motion modes to specific expert subnetworks, effectively decoupling conflicting gradients. Simulation results demonstrate that, compared to the single-network PPO method, the MoE-enhanced algorithm exhibits significant improvements in training stability and rewards. The learned policy successfully achieved smooth rolling on flat surfaces and transitioned to dynamic leg-lifting gaits when confronted with obstacles. In various test terrains, it showed a markedly higher success rate compared to the single-network PPO method.</p>
</abstract>
<kwd-group>
<kwd>bipedal wheel-legged robot</kwd>
<kwd>curriculum learning</kwd>
<kwd>gradient conflict</kwd>
<kwd>mixture of experts</kwd>
<kwd>reinforcement learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. National Natural Science Foundation of China (Grant Nos. 12372162 and 12302078).</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="1"/>
<equation-count count="11"/>
<ref-count count="29"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Robot Learning and Evolution</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>The trade-off between wheeled efficiency and legged versatility is a significant issue in mobile robots. While wheeled robots excel on flat terrain and legged robots adapt to unstructured environments, bipedal wheel-legged robots have emerged as a promising hybrid to combine these advantages (<xref ref-type="bibr" rid="B12">Klemm et al., 2019</xref>; <xref ref-type="bibr" rid="B25">Wang et al., 2021</xref>; <xref ref-type="bibr" rid="B8">Guo et al., 2022</xref>; <xref ref-type="bibr" rid="B9">He et al., 2025</xref>; <xref ref-type="bibr" rid="B26">Wang et al., 2025</xref>). These systems can switch between continuous rolling for cruising and discrete stepping or jumping for obstacles (<xref ref-type="bibr" rid="B13">Klemm et al., 2020</xref>; <xref ref-type="bibr" rid="B5">Cui et al., 2021</xref>; <xref ref-type="bibr" rid="B16">Lee et al., 2024</xref>). However, unlocking this potential requires a control policy capable of mastering these distinct modes simultaneously. Traditional control of wheel-legged robots often relies on model-based optimization. For example, analytical second-order derivatives of rigid-body contact dynamics have been derived to enhance the multi-shooting Differential Dynamic Programming (DDP) algorithm (<xref ref-type="bibr" rid="B23">Singh et al., n.d.</xref>), thereby enabling a highly efficient approach to handling complex contact interactions in humanoid and hybrid locomotion robots. While such trajectory optimization methods deliver precise control performance based on analytical gradients, reinforcement learning (RL) provides an alternative by learning robust policies through trial and error, which can be more adaptable to unstructured environments.</p>
<p>Developing a unified control policy is challenging due to the heterogeneous dynamics between high-precision rolling (<xref ref-type="bibr" rid="B2">Bouton et al., 2020</xref>; <xref ref-type="bibr" rid="B7">Garc&#xed;a and Duarte, 2024</xref>) and high-frequency maneuvers like jumping (<xref ref-type="bibr" rid="B11">Hwangbo et al., 2019</xref>; <xref ref-type="bibr" rid="B15">Lee et al., 2020</xref>; <xref ref-type="bibr" rid="B29">Zheng et al., 2025</xref>). Attempting to master these diverse skills within a monolithic reinforcement learning agent often triggers severe gradient conflicts (<xref ref-type="bibr" rid="B28">Yu et al., 2020</xref>), where updates for one mode interfere with the optimization of another. Consequently, the agent settles for a compromised policy, suffering from catastrophic forgetting or unstable convergence (<xref ref-type="bibr" rid="B30">Kirkpatrick et al., 2017</xref>; <xref ref-type="bibr" rid="B20">Schaul et al., 2019</xref>).</p>
<p>To address these limitations, this research proposes a Mixture of Experts (MoE) architecture integrated into the PPO framework, which effectively tackles the problem of gradient conflict (<xref ref-type="bibr" rid="B22">Shazeer et al., 2017</xref>; <xref ref-type="bibr" rid="B3">Celik et al., 2024</xref>; <xref ref-type="bibr" rid="B17">Li et al., 2024</xref>; <xref ref-type="bibr" rid="B19">Obando-Ceron et al., 2024</xref>; <xref ref-type="bibr" rid="B1">Akrour et al., 2022</xref>). The core concept of our approach is &#x201c;divide-and-conquer.&#x201d; Instead of forcing a single neural network to encode all locomotion primitives, we employ a dynamic gating mechanism that selects specific expert sub-networks based on the robot&#x2019;s state. This architecture effectively disentangles the conflicting gradients. One expert can specialize in steady-state rolling, while another focuses on dynamic leg-lifting, with the gating network learning the optimal switching logic. Aside from innovations in algorithm architecture, we also propose a phased curriculum learning strategy (<xref ref-type="bibr" rid="B14">Kumar et al., 2021</xref>; <xref ref-type="bibr" rid="B16">Lee et al., 2024</xref>). The training process is divided into two phases. In the first phase, the agent is trained on specific terrains, focusing on the acquisition of vertical mobility skills. In the second phase, the policy is generalized across randomized mixed terrains. Blind locomotion is realized by relying solely on proprioceptive history without privileged terrain information.</p>
<p>Based on the Diablo bipedal wheel-legged robot platform, we demonstrated the superiority of the proposed MoE-enhanced framework over the single-network PPO method. The results highlight three key contributions:<list list-type="bullet">
<list-item>
<p>Training Stability: A reinforcement learning framework integrated with the Mixture of Experts (MoE) architecture is introduced, which effectively resolves gradient conflicts in multi-modal locomotion tasks, significantly improving training stability and mitigating catastrophic forgetting compared to single-network PPO.</p>
</list-item>
<list-item>
<p>Higher Performance: A dynamic gating network coupled with a Top-K sparse activation mechanism is introduced. This design automatically decouples heterogeneous motion modes, enabling expert subnet specialization and achieving higher reward peaks for better stability-agility balance.</p>
</list-item>
<list-item>
<p>Enhanced Traversability: A two-phase curriculum learning strategy is designed to progress from deterministic terrains to noisy complex environments. This approach ensures robust generalization, granting the robot enhanced traversability over unstructured obstacles.</p>
</list-item>
</list>
</p>
<p>This work offers a novel solution for general motion learning in bipedal wheel-legged robots, providing a scalable pathway for deploying agile hybrid locomotion in complex environments.</p>
</sec>
<sec sec-type="methods" id="s2">
<label>2</label>
<title>Methods</title>
<sec id="s2-1">
<label>2.1</label>
<title>Overall control framework</title>
<p>The robotic platform utilized in this study is the bipedal wheel-legged robot, Diablo (Direct Drive Tech, China), with simulations conducted in the Isaac Gym environment (<xref ref-type="bibr" rid="B11">Hwangbo et al., 2019</xref>; <xref ref-type="bibr" rid="B15">Lee et al., 2020</xref>; <xref ref-type="bibr" rid="B14">Kumar et al., 2021</xref>; <xref ref-type="bibr" rid="B18">Nahrendra et al., 2023</xref>). The overall control framework comprises a high-level decision controller and a low-level PD controller (<xref ref-type="fig" rid="F1">Figure 1A</xref>). The high-level decision controller computes the target positions for each joint of the robot at a given time, which are then scaled and transmitted to the low-level PD controller. The PD controller calculates the joint torques required to track these target positions, thereby driving the physics simulation.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Robot control algorithm framework. <bold>(A)</bold> Overall control framework for the robot training process. <bold>(B)</bold> Composition and principle of the MoE algorithm controller.</p>
</caption>
<graphic xlink:href="frobt-13-1788395-g001.tif">
<alt-text content-type="machine-generated">Diagram illustrating a hierarchical mixture-of-experts reinforcement learning architecture for robot control. Panel A shows a controller with a gating and experts network outputting weighted actions, linked via a PD controller to a simulated robot in Isaac Gym, with data flows for action, reward, and state, and experience replay for updates. Panel B details the architecture, including expert networks for separate robot behaviors, critic and actor network structures, and data exchange between simulation, controller, and PD controller for joint target positions and value estimation.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Design of the controller based on the PPO algorithm</title>
<p>We initially trained the robot using a standard reinforcement learning approach. The objective was to achieve multi-gait capabilities within a single decision-making network. In this section, we employ the PPO algorithm (<xref ref-type="bibr" rid="B21">Schulman et al., 2017</xref>) to train the robot, and both the Actor and Critic consist of neural networks with hidden-layer dimensions of [512, 256, 128].</p>
<sec id="s2-2-1">
<label>2.2.1</label>
<title>Simulation environment setup</title>
<p>The training environment was constructed on the NVIDIA Isaac Gym simulation platform, utilizing an NVIDIA GeForce RTX 3090 Ti graphics card and configured with 4,096 parallel environments to efficiently train the Diablo robot, with a maximum episode duration of 20 s. The environment features specific terrains, which compel the robot to transition from rolling to stepping.</p>
</sec>
<sec id="s2-2-2">
<label>2.2.2</label>
<title>State space</title>
<p>In this study, the state space <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is constructed as a high-dimensional composite vector of dimension <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>153</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The goal is to spatialize the temporal information, enabling a standard multi-layer perceptron (MLP) network to effectively perceive dynamic trends in the system, such as changes in acceleration and contact states. This vector is formed by concatenating real-time proprioceptive observations with historical joint data. Crucially, to ensure a truly blind locomotion policy, the input state <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> relies strictly on onboard proprioceptive sensors (IMU, joint encoders, and command input) and a history sequence of the most recent 10 frames. We explicitly exclude any exteroceptive data or privileged terrain heightmaps from the policy input. This configuration is maintained consistently across both the training and testing phases to validate the robot&#x2019;s adaptability using only proprioception.</p>
<p>For the proprioceptive observations <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, this component is represented as a vector in <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>33</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, reflecting the robot&#x2019;s current physical state and the task objectives. As shown in <xref ref-type="disp-formula" rid="e1">Equation 1</xref>, the specific elements include: the linear velocity of the robot&#x2019;s base in the body coordinate system <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; the angular velocity <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; the projection of the gravity vector onto the body axes <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; the commands <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>6</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which encompass the target linear velocities <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>x</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and yaw rate <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, as well as the jump height target <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the adjustment angles for the left and right legs <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>; the current position error for the joints <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>6</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; the joint velocities <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#x2d9;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>6</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; and the action vector from the previous time step <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>6</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.<disp-formula id="e1">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x3c;</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#x2d9;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The historical observations <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is represented as a vector in <inline-formula id="inf20">
<mml:math id="m21">
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>120</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which stores the joint data from the most recent 10 frames, corresponding to a continuous segment of time in the physical simulation. This includes the historical position errors of the robot&#x2019;s six joints <inline-formula id="inf21">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>60</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and the historical velocities <inline-formula id="inf22">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#x2d9;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#x2d9;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>60</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. At any given time <inline-formula id="inf23">
<mml:math id="m24">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the input state vector is formulated as follows:<disp-formula id="e2">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x3c;</mml:mo>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-2-3">
<label>2.2.3</label>
<title>Action space</title>
<p>In this study, we employ a continuous action space <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>6</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for the precise control of the six degrees of freedom on both sides of the bipedal wheel-legged robot. The policy network does not directly output torques. Instead, it generates a normalized position deviation vector <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mn>6</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which is transformed into target joint positions <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:msubsup>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2a;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> through linear mapping, as shown in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>. Here, <inline-formula id="inf27">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the robot&#x2019;s initial posture, and <inline-formula id="inf28">
<mml:math id="m30">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the action scaling factor. These high-level position commands are then passed to a low-level PD controller to compute the final execution torques <inline-formula id="inf29">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, with the calculation method outlined in <xref ref-type="disp-formula" rid="e4">Equation 4</xref>. In this context, <inline-formula id="inf30">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf31">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#x2d9;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denote the current joint positions and velocities, while <inline-formula id="inf32">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf33">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the specified position stiffness and velocity damping gains.<disp-formula id="e3">
<mml:math id="m36">
<mml:mrow>
<mml:msubsup>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2a;</mml:mo>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2a;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:msub>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo>&#x2d9;</mml:mo>
</mml:mover>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>max</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x3c4;</mml:mi>
<mml:mi>max</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-2-4">
<label>2.2.4</label>
<title>Reward function</title>
<p>The reward function <inline-formula id="inf34">
<mml:math id="m38">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> consists of four components: task objective rewards, gait induction and constraint rewards, stability and physical constraint rewards, and motion smoothness and safety rewards. The specific parameters and calculation methods for these reward components are detailed in <xref ref-type="table" rid="T1">Table 1</xref>. In the gait induction and constraint rewards, it is worth noting that the &#x201c;leg lifting reward&#x201d; and &#x201c;no moonwalk reward&#x201d; are complementary in their functionality. Specifically, the positive &#x201c;leg lifting reward&#x201d; is designed to induce a vertical height difference between the hub centers of the robot&#x2019;s left and right legs, thereby prompting a leg-lifting and stepping gait while preventing the training process from converging to the local optimum of pure rolling locomotion. In contrast, the negative &#x201c;no moonwalk reward&#x201d; is introduced to eliminate horizontal offset between the left and right leg hub centers, effectively suppressing the undesired gait of staggered wheel positions along the forward direction.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Specific reward parameters and calculation methods.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Category</th>
<th align="center">Reward item name</th>
<th align="center">Scale</th>
<th align="center">Calculation method</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="center">Task objective rewards</td>
<td align="center">Tracking lin vel</td>
<td align="center">5.5</td>
<td align="center">
<inline-formula id="inf35">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Tracking ang vel</td>
<td align="center">5.5</td>
<td align="center">
<inline-formula id="inf36">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Base height</td>
<td align="center">&#x2212;0.25</td>
<td align="center">
<inline-formula id="inf37">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mtext>target</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td rowspan="3" align="center">Gait induction and constraint reward</td>
<td align="center">Leg lifting</td>
<td align="center">20</td>
<td align="center">
<inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mfenced open="|" close="|" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">No fly</td>
<td align="center">0.25</td>
<td align="center">
<inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mspace width="0.3em"/>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">No moonwalk</td>
<td align="center">&#x2212;2.5</td>
<td align="center">
<inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td rowspan="3" align="center">Stability and physical constraint reward</td>
<td align="center">Orientation</td>
<td align="center">&#x2212;5.5</td>
<td align="center">
<inline-formula id="inf41">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Lin vel z</td>
<td align="center">&#x2212;0.1</td>
<td align="center">
<inline-formula id="inf42">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>v</mml:mi>
<mml:mi>z</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Ang vel xy</td>
<td align="center">&#x2212;0.25</td>
<td align="center">
<inline-formula id="inf43">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td rowspan="4" align="center">Motion smoothness and safety reward</td>
<td align="center">Collision</td>
<td align="center">&#x2212;180</td>
<td align="center">
<inline-formula id="inf44">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext> </mml:mtext>
<mml:mi>b</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Dof pos limits</td>
<td align="center">&#x2212;0.5</td>
<td align="center">
<inline-formula id="inf45">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mo>&#x2211;</mml:mo>
<mml:mtext>clip</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mtext>limit</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Action rate</td>
<td align="center">&#x2212;0.008</td>
<td align="center">
<inline-formula id="inf46">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Torques</td>
<td align="center">&#x2212;1e&#x2212;5</td>
<td align="center">
<inline-formula id="inf47">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="&#x7c;">
<mml:mrow>
<mml:mi>&#x3c4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-2-5">
<label>2.2.5</label>
<title>Domain randomization and termination conditions</title>
<p>To enhance the robustness of the policy, this study implements a comprehensive domain randomization framework that includes environmental dynamics randomization, initial state perturbations, sensor noise injection, and external mechanical impacts. The ground friction coefficient <inline-formula id="inf48">
<mml:math id="m52">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of the physics engine is independently sampled from a uniform distribution <inline-formula id="inf49">
<mml:math id="m53">
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>0.5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1.25</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> during each reset, and the base&#x2019;s horizontal position is randomly offset within a range of &#xb1;1 m. The observation space is augmented with parameterized additive noise, as shown in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>, where <inline-formula id="inf50">
<mml:math id="m54">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the noise amplitude <inline-formula id="inf51">
<mml:math id="m55">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is set with differentiated intensities, the specific values are given in <xref ref-type="sec" rid="s11">Supplementary Table S4</xref> of the <xref ref-type="sec" rid="s11">Supplementary Material</xref>. Furthermore, a random horizontal velocity impact of 1.0 m/s is applied every 15 s to enforce balance recovery. The termination condition <inline-formula id="inf52">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is triggered by both time truncation and physical contact.<disp-formula id="e5">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
</sec>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Design of the MoE controller</title>
<p>Based on the original PPO framework, we introduced a MoE mechanism in the Actor. The algorithm is illustrated in <xref ref-type="fig" rid="F1">Figure 1B</xref>. The new algorithm maintains consistency with the original PPO algorithm in terms of state space, action space, and reward function. For the new algorithm that incorporates the MoE, the Actor is composed of a gating network with dimensions of [128, 64] and two expert networks sized [512, 256, 128]. At each time step, the final action output of the Actor corresponds to the expert network with the highest weight.</p>
<sec id="s2-3-1">
<label>2.3.1</label>
<title>Composition of the loss function</title>
<p>The loss function <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the basic PPO algorithm is shown in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>. Here, <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denotes the clipped surrogate loss that restricts the magnitude of policy updates, <inline-formula id="inf55">
<mml:math id="m60">
<mml:mrow>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the squared error loss for evaluating state values, and <inline-formula id="inf56">
<mml:math id="m61">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a linear combination of entropy to encourage policy exploration. The coefficients <inline-formula id="inf57">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf58">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are used to adjust the contributions of the latter two losses.<disp-formula id="e6">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msup>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>After introducing the MoE mechanism, we augmented the total loss function to <inline-formula id="inf59">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as shown in <xref ref-type="disp-formula" rid="e7">Equation 7</xref> to prevent mode collapse within the expert networks. The newly added term is the load balancing auxiliary loss <inline-formula id="inf60">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is regulated by the coefficient <inline-formula id="inf61">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The calculation method for this loss is detailed in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>, where <inline-formula id="inf62">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the average gating weight of the <italic>i</italic>-th expert in the current training batch, and <inline-formula id="inf63">
<mml:math id="m69">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the total number of experts. This term enforces uniform activation and utilization of all experts by minimizing both the negative entropy of the average weights and the mean absolute deviation.<disp-formula id="e7">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mtext>aux</mml:mtext>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>ln</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>0.5</mml:mn>
<mml:mo>&#xb7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-3-2">
<label>2.3.2</label>
<title>Top-K mechanism in the MoE framework</title>
<p>To achieve computational sparsity and encourage the policy network to transition clearly between different motion modes, we incorporate a Top-K gating mechanism in the policy framework (<xref ref-type="bibr" rid="B4">Chen et al., 2025</xref>; <xref ref-type="bibr" rid="B6">D&#x2019;Souza et al., 2025</xref>; <xref ref-type="bibr" rid="B10">Huang et al., 2025</xref>; <xref ref-type="bibr" rid="B27">Xu et al., 2025</xref>; <xref ref-type="bibr" rid="B24">Vincze et al., 2025</xref>). For state input <inline-formula id="inf64">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the gating network <inline-formula id="inf65">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>&#x3d5;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> generates a probability distribution <inline-formula id="inf66">
<mml:math id="m74">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>N</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for <inline-formula id="inf67">
<mml:math id="m75">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> experts via Softmax (<xref ref-type="disp-formula" rid="e9">Equation 9</xref>). A truncation operation then retains only the top k experts to form the active set <inline-formula id="inf68">
<mml:math id="m76">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e10">Equation 10</xref>). Selected weights are re-normalized to ensure effective gradient propagation. The final output <inline-formula id="inf69">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a linear combination of the outputs <inline-formula id="inf70">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> from these <inline-formula id="inf71">
<mml:math id="m79">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> active experts (<xref ref-type="disp-formula" rid="e11">Equation 11</xref>), enforcing sparse activation and allowing each expert to focus on specific sub-tasks. In this study, the gating network and expert networks take the same input <inline-formula id="inf72">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e2">Equation 2</xref>). Meanwhile, we fix the number of activated experts <inline-formula id="inf73">
<mml:math id="m81">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to 1 at each time step, thereby ensuring that only the most relevant expert is active at any given time to maximize the decoupling of motion primitives.<disp-formula id="e9">
<mml:math id="m82">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m83">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="e11">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bc;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a3;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
</sec>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>Phased training design</title>
<p>For a bipedal wheel-legged robot, the learning process begins with mastering fundamental equilibrium on planar surfaces, followed by tracking velocity commands via rolling locomotion. Subsequently, the robot negotiates slopes and overcomes vertical obstacles on complex terrains using leg-lifting gaits. Ultimately, it masters dynamically switching between rolling and leg-lifting modes. To this end, we designed a progressive training framework to induce these skills sequentially. Central to this approach is the two-phase training pipeline illustrated in <xref ref-type="fig" rid="F2">Figure 2A</xref>, where each phase features distinct terrain compositions and complexity levels tailored to specific skills. The first phase consists exclusively of flat ground, slopes, and vertical stairs. An inclination threshold is defined such that slopes exceeding this value are rendered as vertical stairs in the simulation. This process evolves the initial policy <inline-formula id="inf74">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into an obstacle-capable policy <inline-formula id="inf75">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The second phase involves transferring the policy <inline-formula id="inf76">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to a comprehensive terrain set for re-training. To increase terrain complexity, Phase 2 incorporates wave terrain and discrete terrain into the training set, with Perlin noise added to introduce irregular surface perturbations. This phase serves as a comprehensive training ground for hybrid locomotion switching and dynamic balancing. Building upon the results of Phase 1, this process culminates in the final generalized policy <inline-formula id="inf77">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Schematics of the training process. <bold>(A)</bold> Schematic of the phased training design. <bold>(B)</bold> Schematic of the curriculum learning design.</p>
</caption>
<graphic xlink:href="frobt-13-1788395-g002.tif">
<alt-text content-type="machine-generated">Two-part diagram illustrating a robotic terrain training curriculum. Panel A outlines Phase 1 with initial policy learning on sloped and stair terrains, then Phase 2 with all-terrain training incorporating Perlin noise, producing an adaptable all-terrain policy. Panel B shows a flowchart for curriculum-based training: starting with difficulty initialization, robot training on current terrain, performance checks to adjust difficulty up or down, and a forgetting mitigation step involving random level selection if maximum difficulty is reached.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-5">
<label>2.5</label>
<title>Curriculum learning design</title>
<p>Prior research demonstrates that curriculum learning&#x2014;where task difficulty is incrementally increased&#x2014;is a highly effective strategy for training complex locomotion policies. Accordingly, we propose an adaptive terrain curriculum designed to facilitate effective learning for bipedal wheel-legged robots in unstructured environments. At the onset of training, robots are uniformly distributed across the lowest difficulty tier of each terrain type, and the terrain difficulty level is dynamically updated based on the robot&#x2019;s real-time performance. If the robot successfully traverses more than half the terrain length within a fixed time, it is deemed to have mastered the current difficulty. Consequently, the terrain level is incremented upon the next reset. Conversely, if the robot fails to meet this criterion, the difficulty level is decremented, allowing the robot to consolidate basic skills on simpler terrain. To mitigate catastrophic forgetting, robots that reach the maximum difficulty level are randomly reassigned to intermediate terrains.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec id="s3-1">
<label>3.1</label>
<title>Training performance</title>
<p>Since the Actor in the MoE-enhanced algorithm contains two networks of size [512, 256, 128], its total parameter count is approximately twice that of the standard PPO baseline. To exclude the impact of increased network capacity, this study introduces an additional large PPO baseline with a similar parameter count to the MoE-enhanced algorithm. The large PPO baseline has a network size of [768, 384, 192], corresponding to scaling each layer of the Actor in the standard PPO baseline by a factor of 1.5. In both training phases, all three algorithms are trained with identical hyperparameters, including reward term weights, terrain settings, randomization parameters, and total training iterations. In the first phase, no Perlin noise is applied to the terrain, while in the second phase, Perlin noise is added to introduce irregular surface disturbances, as shown in <xref ref-type="fig" rid="F3">Figure 3A</xref>. Both phases contain 6,000 training iterations, and the average rewards are compared, with results presented in <xref ref-type="fig" rid="F3">Figures 3B,C</xref>, respectively. In the first phase, no obvious training collapse is observed for the three algorithms. Before around 1,600 iterations, their reward values and convergence speeds are nearly identical. After 1,600 iterations, the reward curves diverge significantly: the reward of the MoE-enhanced algorithm rises steadily, while that of the standard PPO baseline declines continuously. Although the reward of the large PPO baseline does not drop obviously, it shows no upward trend. By the end of the first phase, the reward of the MoE-enhanced algorithm is significantly higher than the other two baselines. Initially, the terrain mainly includes flat ground, slopes, and low stairs with small differences in dynamic properties. For low stairs, the robot can even traverse them via pure rolling, with dynamics similar to normal flat-ground locomotion and slope traversal. 
As the training progressed and stair heights increased, reaching a maximum height of 14 cm, which is much larger than the wheel radius, the robot can only overcome such obstacles through jumping or leg-lifting stepping, whose dynamics differ greatly from pure rolling. Training with a single network easily causes gradient update interference: after learning leg-lifting or jumping, the robot partially forgets pure rolling; similarly, improving pure rolling leads to partial forgetting of obstacle-crossing skills. Thus, the policy learned by a single network is a compromised solution. When facing higher stairs, its performance is insufficient, leading to stagnant or even severely degraded rewards.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Terrain environment and stair climbing tests. <bold>(A)</bold> The training terrain after the addition of Perlin noise. <bold>(B)</bold> Comparison of training reward curves among the standard PPO baseline, large PPO baseline, and MoE in the first training phase. <bold>(C)</bold> Comparison of training reward curves among the standard PPO baseline, large PPO baseline, and MoE in the second training phase. <bold>(D)</bold> Average weight changes of the two expert networks throughout the training process. <bold>(E)</bold> Results of the robot using the trained locomotion policy to ascend stairs, with action decompositions shown from both frontal and lateral views.</p>
</caption>
<graphic xlink:href="frobt-13-1788395-g003.tif">
<alt-text content-type="machine-generated">Panel A displays four simulated terrain types labeled Rough Terrain, Flat Terrain, Discrete Terrain, and Stair Terrain. Panel B and C are line graphs showing reward versus iteration for different reinforcement learning models with annotations highlighting training phases. Panel D is a line graph showing model expert weights over training iterations, divided into first phase single terrain and second phase all-terrain. Panel E presents two time-lapse sequences of a boxy robot traversing stairs, with both frontal and lateral views and time markers from zero to 0.4 seconds.</alt-text>
</graphic>
</fig>
<p>In the second phase, Perlin noise with an amplitude of 6 cm is introduced into the terrain, and terrain types are extended to full terrain. Under these conditions, both the standard PPO baseline and large PPO baseline exhibit severe training instability: reward curves fail to converge, and training collapse frequently occurs. The primary cause is that Perlin noise increases terrain surface unevenness, leading to large fluctuations in the robot&#x2019;s state observations at each time step. The policy continuously chases changing optimal behavior, resulting in reward volatility. If the frequency of terrain variations exceeds the adaptation speed of the policy, training collapse occurs. For single-network algorithms, limited learning capacity causes a trade-off dilemma, making stable training infeasible on rough terrain and preventing a robust final policy. In contrast, the MoE-enhanced algorithm maintains favorable reward curves throughout the entire training process. In the first phase, its policy gradually adapts to increasing stair heights, with consistent reward rises. In the second phase, since the two expert networks&#x2019; skills were specialized in the first phase, with each expert responsible for distinct gait tasks, stable locomotion can be achieved by alternating between different experts even on rough terrain surfaces, thereby ensuring training stability. To further verify the stability of the MoE-enhanced algorithm, we trained all three algorithms with different random seeds. The results are shown in <xref ref-type="sec" rid="s11">Supplementary Figure S1</xref> of the <xref ref-type="sec" rid="s11">Supplementary Material</xref>. The results demonstrate that the MoE-enhanced algorithm performs stably across different seeds, whereas the single-network standard PPO and large PPO baselines exhibit significant variations across seeds and struggle to learn a stable policy. 
Overall, these results preliminarily validate the inherent limitations of single-network structures when handling multi-mode locomotion tasks in complex environments. The performance improvement of the MoE-enhanced algorithm arises primarily from its intrinsic mechanism, rather than the increase in total network parameters.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Experts contributions during training</title>
<p>The variation in the average weights of the experts during the training process of the MoE-enhanced algorithm was extracted in <xref ref-type="fig" rid="F3">Figure 3D</xref>. It is evident that in the first phase, there was a substantial difference in the call frequencies between the two expert networks; however, this disparity significantly decreased in the second phase. The primary reason for this change can be attributed to the relatively simple terrain in the first phase, which consisted solely of flat surfaces, slopes, and vertical stairs without any added Perlin noise. In this scenario, the use of the leg-lifting gait was required only for navigating the vertical stairs. When encountering vertical stairs, the robot needed to roll a certain distance on the step before transitioning to the next one, which meant that calls to the expert responsible for the leg-lifting gait were infrequent, occurring only during the transition from one stair to the next, while the majority of the time involved pure rolling. Consequently, during the first phase of training, the call frequency for the leg-lifting expert (Expert 2 in the figure) was significantly lower than that of the expert responsible for pure rolling (Expert 1), leading to a notable difference in their average weights as the number of training epochs increased. In contrast, the second phase introduced discrete terrains that required frequent alternation between pure rolling and the leg-lifting gait, along with the addition of Perlin noise, which exacerbated the irregularity of the terrain surface. This heightened the demand for the leg-lifting gait. Under these terrain conditions, the call frequency for Expert 2 in the second phase noticeably increased to accommodate the undulating ground. As the number of training epochs in the second phase increased, the difference in average weights between the two experts significantly diminished.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Stair-climbing test</title>
<p>The locomotion policy derived from the MoE-enhanced algorithm was tested for ascending stairs, with the results presented in <xref ref-type="fig" rid="F3">Figure 3E</xref>. When faced with stairs of a vertical height of 12 cm, the robot successfully executed an ideal leg-lifting gait. The motion breakdown was analyzed from both the frontal and lateral views, as shown in <xref ref-type="fig" rid="F3">Figure 3E</xref>. With a speed command set at 0.6 m/s, the robot took approximately 0.4 s to clear the step. During the ascent, the robot first lifted its left wheel onto the next step before bringing up the right wheel, thereby effectively utilizing the leg-lifting gait to complete the stair-climbing task. In general, for bipedal robots encountering vertical obstacles, common motion policies include jumping or employing a leg-lifting gait. However, in this study, due to the incorporation of a significant leg-lifting reward, the resulting obstacle negotiation policy was exclusively the leg-lifting gait. Utilizing only the leg-lifting gait for obstacle negotiation effectively addresses several issues associated with jumping, such as problems related to body stability and the impacts and vibrations that may affect the robot&#x2019;s internal components.</p>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Obstacle navigation test</title>
<p>A specially designed testing environment is established to test the obstacle negotiation success rates of the two policies (<xref ref-type="fig" rid="F4">Figure 4A</xref>), which included three types of terrain: rough terrain, discrete terrain, and vertical stairs. Each terrain type was set within a square area, with the rough terrain generated using Perlin noise at an amplitude of 6 cm, and the maximum vertical heights for the discrete terrain and vertical stairs set between 6 and 12 cm. During the testing process, a successful trial required the robot to remain upright throughout, not exit the terrain boundaries, and reach the endpoint. Each terrain with distinct difficulty was measured repeatedly, and 500 robots were used in each single test for statistical analysis. The results are shown in <xref ref-type="fig" rid="F4">Figures 4B,C</xref>. From the results, the locomotion policy trained by the MoE-enhanced algorithm significantly outperforms the standard PPO baseline and the large PPO baseline. In terrains of all difficulty levels, its success rate exceeds 50% across all velocity commands. For the standard PPO baseline, insufficient obstacle-crossing performance can be clearly observed, especially under low velocity commands. Even in the simplest terrain with an obstacle height of 6 cm, its success rate remains low. In the terrain with a height of 12 cm, it performs poorly across all velocity commands. The large PPO baseline also exhibits a considerable performance gap compared with the MoE-enhanced algorithm, and this gap becomes increasingly pronounced as terrain difficulty rises. In particular, when the maximum vertical obstacle height reaches 12 cm, the success rate of the large PPO baseline is generally below 50%. Overall, these results further demonstrate that the superiority of the MoE-enhanced algorithm mainly stems from the inherent mechanism of MoE, and is not significantly affected by the specific amount of network parameters.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Obstacle navigation test. Success is defined as the robot traveling from the &#x201c;Start point&#x201d; to the &#x201c;End point&#x201d; without colliding with the ground and without straying outside the 10-m-wide test area. For each test statistic, 500 robots are tested independently, and the final success rate of these 500 robots is calculated. <bold>(A)</bold> Setup of the obstacle testing area. <bold>(B)</bold> Comparison of success rates for the three algorithms at a maximum vertical height of 6 cm. <bold>(C)</bold> Comparison of success rates for the three algorithms at a maximum vertical height of 8 cm. <bold>(D)</bold> Comparison of success rates for the three algorithms at a maximum vertical height of 10 cm. <bold>(E)</bold> Comparison of success rates for the three algorithms at a maximum vertical height of 12 cm.</p>
</caption>
<graphic xlink:href="frobt-13-1788395-g004.tif">
<alt-text content-type="machine-generated">Panel A shows a labeled simulation environment with a start point, ten meters of rough terrain, ten meters of discrete terrain, and ten meters of stair terrain ending at an endpoint. Panels B to E present grouped bar charts comparing success rates of MoE, Std-PPO, and Large PPO algorithms at different speeds for obstacle heights of six, eight, ten, and twelve centimeters, showing decreasing success rates as obstacle height and speed increase.</alt-text>
</graphic>
</fig>
<p>Based on this experiment, we further conducted ablation experiments to analyze the effects of MoE and phased training. We established an additional baseline trained directly for 12,000 iterations without phasing. Similarly, we evaluate the two training schemes using two metrics: the average training reward and the obstacle-crossing success rate on terrains of different heights. First, with respect to the average reward, training was performed with four different random seeds, and the reward results were statistically analyzed, as shown in <xref ref-type="sec" rid="s11">Supplementary Figure S2</xref> in the <xref ref-type="sec" rid="s11">Supplementary Material</xref>. It can be clearly observed that the MoE-enhanced algorithm achieves higher reward values under phased training than under non-phased training. Subsequently, the locomotion policies obtained by the two training methods were tested for obstacle-crossing success rate, with the results presented in <xref ref-type="sec" rid="s11">Supplementary Figure S3</xref> in the <xref ref-type="sec" rid="s11">Supplementary Material</xref>. The results show that the performance difference between the two policies is not significant on relatively simple terrains. When the maximum vertical height of obstacles reaches 12 cm, the success rate of the policy trained with phased training is significantly higher than that of the non-phased policy. This validates that phased training can improve the upper bound of obstacle-crossing performance. Combined with the comparison results against the PPO algorithm using a single network, it can be concluded that in the MoE-enhanced algorithm proposed in this study, the mechanism of MoE itself plays a more dominant role, while phased training serves as an auxiliary approach to further improve the algorithm&#x2019;s performance.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s4">
<label>4</label>
<title>Conclusion</title>
<p>This study proposes a reinforcement learning control framework that integrates the Mixture of Experts (MoE) for bipedal wheel-legged robots. By incorporating MoE into the Actor network of the PPO algorithm, we leverage a sparse gating mechanism to decouple parameters for pure rolling and leg-lifting tasks, effectively resolving the gradient conflict issues associated with single-network multi-task learning. Simulation test results demonstrate that this approach not only eliminates catastrophic forgetting but also significantly enhances training stability. To ensure the robustness and generalization capabilities of the policy, we implemented a two-phase curriculum learning strategy that transitions from specific terrains to complex terrains with Perlin noise. The MoE-enhanced strategy exhibits remarkable adaptability when navigating complex terrains, automatically adjusting expert weights according to the terrain&#x2019;s undulations, thus achieving a smooth transition from rolling on flat ground to dynamic obstacle negotiation. Notably, in high-difficulty obstacle tests, this policy achieved a success rate that greatly exceeded that of the single-network PPO algorithm when traversing vertical stairs of 12 cm. Future work will focus on several key areas: recognizing the significant impact of collision forces on balance during high-speed movements, further research will aim to enhance the system&#x2019;s active safety and self-recovery capabilities under extreme physical contact and external disturbances. Additionally, we will conduct real-world deployment tests in more diverse and unstructured outdoor environments to validate and optimize the long-term reliability of this method in practical applications.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>PH: Writing &#x2013; review and editing, Writing &#x2013; original draft. ZZ: Writing &#x2013; review and editing, Writing &#x2013; original draft. SD: Methodology, Conceptualization, Writing &#x2013; review and editing. PW: Writing &#x2013; review and editing, Investigation. HL: Project administration, Methodology, Writing &#x2013; review and editing.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>We would like to thank Direct Drive Tech for the technical assistance of robot hardware and control.</p>
</ack>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2026.1788395/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2026.1788395/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akrour</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tateo</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Continuous action reinforcement learning from a mixture of interpretable experts</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>6795</fpage>&#x2013;<lpage>6806</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2021.3103132</pub-id>
<pub-id pub-id-type="pmid">34375280</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bouton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Grand</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Benamar</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Design and control of a compliant wheel-on-leg rover which conforms to uneven terrain</article-title>. <source>IEEE/ASME Trans. Mechatronics</source> <volume>25</volume>, <fpage>2354</fpage>&#x2013;<lpage>2363</lpage>. <pub-id pub-id-type="doi">10.1109/TMECH.2020.2973752</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Celik</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Taranovic</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Acquiring diverse skills using curriculum reinforcement learning with mixture of experts</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2403.06966</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>VMTS: vision-assisted teacher-student reinforcement learning for multi-terrain locomotion in bipedal robots</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2503.07049</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Learning-based balance control of wheel-legged robots</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>6</volume>, <fpage>7667</fpage>&#x2013;<lpage>7674</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2021.3100269</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>D&#x2019;Souza</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Karthikeyan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pant</surname>
<given-names>Y. V.</given-names>
</name>
<name>
<surname>Fischmeister</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>SAC-MoE: reinforcement learning with mixture-of-experts for control of hybrid dynamical systems with uncertainty</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2511.12361</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Garc&#xed;a</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>F. G.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Mobile rolling robots designed to overcome obstacles: a review</article-title>. <source>Forces Mech.</source> <volume>16</volume>, <fpage>100283</fpage>. <pub-id pub-id-type="doi">10.1016/j.finmec.2024.100283</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Design and dynamic analysis of jumping wheel-legged robot in complex terrain environment</article-title>. <source>Front. Neurorobot.</source> <volume>16</volume>, <fpage>1066714</fpage>. <pub-id pub-id-type="doi">10.3389/fnbot.2022.1066714</pub-id>
<pub-id pub-id-type="pmid">36531915</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A review: exploring the designs of bio-bots</article-title>. <source>Soft Sci.</source> <volume>0&#x2013;23</volume>. <pub-id pub-id-type="doi">10.20517/ss.2024.50</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>MENTOR: mixture-of-experts network with task-oriented perturbation for visual reinforcement learning</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2410.14972</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hwangbo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dosovitskiy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bellicoso</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Tsounis</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Koltun</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Learning agile and dynamic motor skills for legged robots</article-title>. <source>Sci. Robot.</source> <volume>4</volume>, <fpage>eaau5872</fpage>. <pub-id pub-id-type="doi">10.1126/scirobotics.aau5872</pub-id>
<pub-id pub-id-type="pmid">33137755</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kirkpatrick</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pascanu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Rabinowitz</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Veness</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Desjardins</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Overcoming catastrophic forgetting in neural networks</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>114</volume>, <fpage>3521</fpage>&#x2013;<lpage>3526</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1611835114</pub-id>
<pub-id pub-id-type="pmid">28292907</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Klemm</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Morra</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Salzmann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tschopp</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Bodie</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Gulich</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Ascento: a two-wheeled jumping robot</article-title>,&#x201d; in <source>2019 international conference on robotics and automation (ICRA)</source> (<publisher-loc>Montreal, QC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>7515</fpage>&#x2013;<lpage>7521</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2019.8793792</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Klemm</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Morra</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gulich</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Mannhart</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rohr</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kamel</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>LQR-assisted whole-body control of a wheeled bipedal robot with kinematic loops</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>5</volume>, <fpage>3745</fpage>&#x2013;<lpage>3752</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2020.2979625</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pathak</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>RMA: rapid motor adaptation for legged robots</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2107.04034</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hwangbo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wellhausen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Koltun</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Learning quadrupedal locomotion over challenging terrain</article-title>. <source>Sci. Robot.</source> <volume>5</volume>, <fpage>eabc5986</fpage>. <pub-id pub-id-type="doi">10.1126/scirobotics.abc5986</pub-id>
<pub-id pub-id-type="pmid">33087482</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bjelonic</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Reske</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wellhausen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Miki</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Learning robust autonomous navigation and locomotion for wheeled-legged robots</article-title>. <source>Sci. Robot.</source> <volume>9</volume>, <fpage>eadi9641</fpage>. <pub-id pub-id-type="doi">10.1126/scirobotics.adi9641</pub-id>
<pub-id pub-id-type="pmid">38657088</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Cucuringu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>S&#xe1;nchez-Betancourt</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Willi</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Mixtures of experts for scaling up neural networks in order execution</article-title>,&#x201d; in <source>Proceedings of the 5th ACM international conference on AI in finance</source> (<publisher-loc>Brooklyn, NY</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>669</fpage>&#x2013;<lpage>676</lpage>. <pub-id pub-id-type="doi">10.1145/3677052.3698691</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nahrendra</surname>
<given-names>I. M. A.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Myung</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>DreamWaQ: learning robust quadrupedal locomotion with implicit terrain imagination <italic>via</italic> deep reinforcement learning</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2301.10602</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Obando-Ceron</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sokar</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Willi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Lyle</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Farebrother</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Foerster</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Mixtures of experts unlock parameter scaling for deep RL</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2402.08609</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schaul</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Borsa</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Modayil</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pascanu</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Ray interference: a source of plateaus in deep reinforcement learning</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.1904.11455</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wolski</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Dhariwal</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Radford</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Klimov</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Proximal policy optimization algorithms</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.1707.06347</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shazeer</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mirhoseini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Maziarz</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Outrageously large neural networks: the sparsely-gated mixture-of-experts layer</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.1701.06538</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Wensing</surname>
<given-names>P. M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Analytical second-order derivatives of rigid-body contact dynamics: application to multi-shooting DDP</article-title>. <pub-id pub-id-type="doi">10.1109/Humanoids57100.2023.10375214</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vincze</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ferrarotti</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Custode</surname>
<given-names>L. L.</given-names>
</name>
<name>
<surname>Lepri</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Iacca</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>SMOSE: sparse mixture of shallow experts for interpretable reinforcement learning in continuous control tasks</article-title>. <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>39</volume>, <fpage>20982</fpage>&#x2013;<lpage>20990</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v39i20.35394</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Balance control of a novel wheel-legged robot: design and experiments</article-title>,&#x201d; in <source>2021 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-loc>Xi&#x2019;an, China</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6782</fpage>&#x2013;<lpage>6788</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA48506.2021.9561579</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Wheeled-legged robots for multi-terrain locomotion in plateau environments</article-title>. <source>Biomim. Intell. Rob.</source> <volume>5</volume>, <fpage>100256</fpage>. <pub-id pub-id-type="doi">10.1016/j.birob.2025.100256</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>MoSE: skill-by-skill mixture-of-experts learning for embodied autonomous machines</article-title>. <pub-id pub-id-type="doi">10.48550/arXiv.2507.07818</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hausman</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Gradient surgery for multi-task learning</article-title>. <fpage>0</fpage>&#x2013;<lpage>14</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Asano</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tokuda</surname>
<given-names>I. T.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Tensegrity-based legged robot generates passive walking, skipping, and crawling gaits in accordance with environment</article-title>. <source>IEEE/ASME Trans. Mechatronics</source> <volume>30</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1109/TMECH.2024.3522904</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/147408/overview">Giovanni Iacca</ext-link>, University of Trento, Italy</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3114000/overview">Shubham Singh</ext-link>, The University of Texas at Austin, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3356720/overview">M&#xe1;ty&#xe1;s Vincze</ext-link>, Bruno Kessler Foundation (FBK), Italy</p>
</fn>
</fn-group>
</back>
</article>