<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="systematic-review" dtd-version="1.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1752914</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2026.1752914</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Systematic Review</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep learning-based robotic cloth manipulation applications: systematic review, challenges and opportunities for physical AI</article-title>
<alt-title alt-title-type="left-running-head">Gu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2026.1752914">10.3389/frobt.2026.1752914</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Gu</surname>
<given-names>Ningquan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3288669"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hayashibe</surname>
<given-names>Mitsuhiro</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/84197"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kutsuzawa</surname>
<given-names>Kyo</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1181476"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yu</surname>
<given-names>Hui</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/27428"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Neuro-Robotics Lab, Department of Robotics, Graduate School of Engineering, Tohoku University</institution>, <city>Sendai</city>, <country country="JP">Japan</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Graduate School of Science and Engineering, Saitama University</institution>, <city>Saitama</city>, <country country="JP">Japan</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>School of Psychology &#x26; Neuroscience, University of Glasgow</institution>, <city>Glasgow</city>, <country country="GB">United Kingdom</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Ningquan Gu, <email xlink:href="mailto:gu.ningquan.t1@dc.tohoku.ac.jp">gu.ningquan.t1@dc.tohoku.ac.jp</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-06">
<day>06</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1752914</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>07</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Gu, Hayashibe, Kutsuzawa and Yu.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Gu, Hayashibe, Kutsuzawa and Yu</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-06">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Cloth unfolding and folding are fundamental tasks in autonomous robotic cloth manipulation as Physical AI. Driven by recent advances in deep learning, this area has developed rapidly in recent years. This review aims to systematically identify and summarize current progress in deep learning-based cloth unfolding and folding. Following the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) guidelines, 41 relevant papers from 2019 to 2024 were selected for analysis. We examine various factors influencing cloth manipulation and find that, while current methods show impressive performance, several challenges remain unaddressed. These challenges include irregular cloth sizes and diverse initial garment states. Concerning datasets, there is a need for improved real-world data collection systems and more realistic cloth simulators, and the Sim2Real gap must be carefully considered. Additionally, the review highlights the importance of incorporating multi-modal sensors into current platforms and the emergence of novel primitive actions that enhance performance. The need for more consistent comparison metrics is emphasized, and strategies for addressing failure modes are discussed to further advance the field. From an algorithmic perspective, we reorganize existing learning methods into six learning and control paradigms: perception-guided heuristics, goal-conditioned manipulation policies, predictive and model-based state representation methods, reward-driven reinforcement learning over primitive actions, demonstration-driven skill transfer methods, and emerging large language model-based planning methods. We discuss how each paradigm contributes to unfolding and folding, their respective strengths and limitations, and the open problems that arise. Finally, we summarize the remaining challenges and provide future perspectives for physical AI.</p>
</abstract>
<kwd-group>
<kwd>cloth unfolding and folding</kwd>
<kwd>deep learning</kwd>
<kwd>LLM</kwd>
<kwd>physical AI</kwd>
<kwd>robotic manipulation</kwd>
<kwd>systematic review</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the JSPS Grant-in-Aid for Scientific Research under Grant 24K00841. The work of Ningquan Gu was supported by GP-Mech International Joint Graduate Program, Tohoku University.</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="6"/>
<equation-count count="0"/>
<ref-count count="82"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Robot Learning and Evolution</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Cloth manipulation is essential in daily life and various industries. Automating this process has significant implications for improving quality of life and enhancing productivity and efficiency in laundry services, retail, and manufacturing. However, cloth manipulation presents challenges for robotics due to the infinite-dimensional configuration space, self-occlusion, and the complex dynamics of cloth. Robotic cloth manipulation encompasses various operations, such as cloth unfolding and folding (<xref ref-type="bibr" rid="B57">Seita et al., 2019</xref>; <xref ref-type="bibr" rid="B67">Tsurumine et al., 2019</xref>; <xref ref-type="bibr" rid="B79">Zhou et al., 2024</xref>; <xref ref-type="bibr" rid="B76">Yang et al., 2024</xref>), assisted human dressing (<xref ref-type="bibr" rid="B77">Zhang and Demiris, 2020</xref>), ironing (<xref ref-type="bibr" rid="B39">Li et al., 2016</xref>), and sewing (<xref ref-type="bibr" rid="B34">Ku et al., 2023</xref>). Among these, folding and unfolding operations are the most fundamental tasks. They are crucial for applications such as laundry automation, retail inventory management, and personal assistant robots, playing a significant role in both everyday life and industrial processes. <xref ref-type="bibr" rid="B46">Maitin-Shepard et al. (2010)</xref> developed the first system using a PR2 robot to unfold and fold wrinkled towels. However, early methods for unfolding or folding were often slow and lacked the ability to generalize to arbitrary initial and target states.</p>
<p>In recent years, researchers have explored deep learning-based (DL-based) approaches for cloth manipulation, which have shown improved results compared to traditional methods. For instance, <xref ref-type="bibr" rid="B57">Seita et al. (2019)</xref> utilized the YOLO detection algorithm to identify keypoints on a blanket, facilitating the unfolding task and demonstrating the efficacy of DL-based methods. Later <xref ref-type="bibr" rid="B7">Canberk et al. (2023)</xref> employed deep reinforcement learning (RL) to perform garment unfolding, ironing, and folding tasks. The application of DL-based methods has not only introduced algorithmic advancements but also impacted other elements, such as datasets, manipulation strategies, and comparison metrics.</p>
<p>In this systematic review, we examine recent advances in DL-based robotic cloth unfolding and folding. Prior surveys have discussed related aspects such as cloth perception for assistive manipulation (<xref ref-type="bibr" rid="B30">Jim&#xe9;nez and Torras, 2020</xref>) and deformable-object modeling (<xref ref-type="bibr" rid="B28">Hou et al., 2019</xref>). <xref ref-type="bibr" rid="B49">Nocentini et al. (2022)</xref> reviewed learning-based cloth manipulation and dressing from a supervision-type perspective (e.g., supervised, reinforcement, imitation learning), but their coverage ends in 2019 and therefore does not reflect the substantial methodological progress made in recent years. Moreover, supervision-based taxonomies (<xref ref-type="bibr" rid="B49">Nocentini et al., 2022</xref>) provide a conventional perspective but do not adequately capture the underlying perception, representation, and control structures in the cloth-manipulation field.</p>
<p>To address this gap, we reorganize the literature into six learning-and-control paradigms that more directly reflect how existing methods perceive cloth, represent its state, and decide actions. Our review focuses on DL-based approaches for cloth unfolding and folding under this paradigm-oriented perspective.</p>
<p>To clarify the scope of our review, we briefly characterize the two core tasks considered in this survey: cloth unfolding and cloth folding.</p>
<p>The unfolding process consists of applying a sequence of actions to transform a cloth from an arbitrary crumpled configuration into a flattened state with maximal coverage. This process typically exhibits the following characteristics:<list list-type="bullet">
<list-item>
<p>Random Initial State: The starting configuration of the cloth can vary significantly, often being crumpled in an arbitrary manner.</p>
</list-item>
<list-item>
<p>Various Manipulation Strategies: The manipulation strategies are various, e.g., one or multiple robot arms, diverse action primitives, combination of actions.</p>
</list-item>
<list-item>
<p>Uniform End Criterion: Coverage is the primary criterion for the unfolded result. Further, other customized configurations, such as cloth orientation, are also considered in some cases.</p>
</list-item>
</list>
</p>
<p>In contrast, the folding process starts from an unfolded or nearly unfolded configuration and aims to reach a predefined structured goal shape. Its key characteristics include:<list list-type="bullet">
<list-item>
<p>Regular Initial and Goal State: The starting configuration of the cloth is flattened, with variable positions and sizes. The folding goal state is predefined.</p>
</list-item>
</list>
</p>
<p>We reviewed 41 eligible papers published between 2019 and 2024, analyzing their task contents, datasets, platforms, primitive actions, evaluation metrics, failure modes, and learning methodologies. As DL-based techniques advance, these related aspects continue to evolve in parallel. Despite notable progress, significant challenges remain, leaving ample opportunities for future research.</p>
<p>The systematic review makes the following contributions:<list list-type="order">
<list-item>
<p>A paradigm-oriented taxonomy: We reorganize recent DL-based cloth manipulation methods into six learning-and-control paradigms that more accurately reflect their perception, representation, and decision-making structures, providing a more meaningful alternative to supervision-based taxonomies used in prior surveys.</p>
</list-item>
<list-item>
<p>A comprehensive analysis of unfolding and folding tasks: We clearly define and distinguish cloth unfolding and folding processes, and analyze key aspects including task contents, datasets, manipulation platforms, primitive actions, metrics, and common failure modes.</p>
</list-item>
<list-item>
<p>Insights into challenges and future opportunities: We identify the limitations across paradigms, and outline promising directions for advancing DL-based cloth manipulation.</p>
</list-item>
</list>
</p>
<p>The remainder of this paper is structured as follows. <xref ref-type="sec" rid="s2">Section 2</xref> outlines the methodology used to identify relevant literature on DL-based cloth unfolding and folding, detailing the criteria for paper inclusion and exclusion. <xref ref-type="sec" rid="s3">Section 3</xref> presents the outcomes of the literature search from seven aspects. <xref ref-type="sec" rid="s4">Section 4</xref> discusses these findings and current challenges while suggesting potential solutions for future research. Finally, <xref ref-type="sec" rid="s5">Section 5</xref> provides a conclusion to the review.</p>
</sec>
<sec sec-type="methods" id="s2">
<label>2</label>
<title>Methods</title>
<p>The method employed to identify relevant empirical papers follows the guidelines of the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) (<xref ref-type="bibr" rid="B41">Liberati et al., 2009</xref>). This review focuses on DL-based cloth manipulation, specifically cloth unfolding and folding, within the realm of robotic manipulation. It encompasses a thorough examination of empirical papers published between 2019 and 2024, aiming to uncover the latest research developments and trends in this field. The search terms and their combinations were defined as follows:</p>
<disp-quote>
<p>(&#x201c;cloth&#x201d; OR &#x201c;fabric&#x201d; OR &#x201c;garment&#x201d; OR &#x201c;towel&#x201d; OR &#x201c;textile&#x201d; OR &#x201c;blanket&#x201d;) AND (&#x201c;robot&#x2a;&#x201d;) AND (&#x201c;shape&#x2a;&#x201d; OR &#x201c;unfold&#x2a;&#x201d; OR &#x201c;fold&#x2a;&#x201d; OR &#x201c;smooth&#x2a;&#x201d; OR &#x201c;flatten&#x2a;&#x201d;) AND (&#x201c;learning based&#x201d; OR &#x201c;learning-based&#x201d; OR &#x201c;deep learning&#x201d; OR &#x201c;deep-learning&#x201d; OR &#x201c;neural network&#x201d; OR &#x201c;reinforcement learning&#x201d; OR &#x201c;RL&#x201d; OR &#x201c;SL&#x201d; OR &#x201c;imitation learning&#x201d; OR &#x201c;IL&#x201d; OR &#x201c;supervised learning&#x201d;)</p>
</disp-quote>
<p>The rationale behind the selection of specific search terms and their combinations is outlined as follows:<list list-type="bullet">
<list-item>
<p>Domain-Specific Keywords: To capture all pertinent aspects of cloth and textile manipulation, terms related to various cloth types (e.g., &#x201c;cloth&#x201d;, &#x201c;fabric&#x201d;, &#x201c;garment&#x201d; etc.) and actions (e.g., &#x201c;shape&#x201d;, &#x201c;unfold&#x201d;, &#x201c;fold&#x201d; etc.) were included.</p>
</list-item>
<list-item>
<p>Robotic Related: The inclusion of the term &#x201c;robot&#x2a;&#x201d; ensures the search is specifically focused on robotic systems.</p>
</list-item>
<list-item>
<p>Comprehensive and Inclusive Search: Synonyms and variations of core terms related to DL methodologies (e.g., &#x201c;learning-based&#x201d;, &#x201c;deep learning&#x201d;, &#x201c;neural network&#x201d; etc.) were included to cover the wide spectrum of terminologies used across different studies.</p>
</list-item>
<list-item>
<p>Simultaneous Inclusion Requirement: The empirical robotic manipulation papers should include all three categories of terms simultaneously to ensure comprehensive coverage of the topic.</p>
</list-item>
</list>
</p>
<p>As shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, a bibliography was developed based on searches in IEEE Xplore, Scopus, Web of Science, and ACM Digital Library between 2019 and 2024. We collected 655 related records from these four databases; after excluding duplicates and screening titles, abstracts, and full texts, 36 records remained. To make the research sample for the review more comprehensive, we employed backward snowball sampling (<xref ref-type="bibr" rid="B29">Jalali and Wohlin, 2012</xref>) with the same exclusion criteria. At the end of the search, 41 papers were identified for our systematic review. More details are provided in the eligibility stage in <xref ref-type="fig" rid="F1">Figure 1</xref>. We also find that most eligible studies were published in conferences (63.4%).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Flowchart of the literature selection process.</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g001.tif">
<alt-text content-type="machine-generated">Flowchart detailing a systematic review process. It starts with formulating keywords into four groups: Cloth-related, Robot-related, Manipulation-related, and Learning-related. Databases include IEEE Xplore, Scopus, Web of Science, and ACM Digital Library, yielding 655 records. After removing duplicates, 564 records were screened. Titles and abstracts narrowed this to 53 records. Full-text screening left 36 records. Further exclusions were made, with 5 additional studies identified through snowball sampling, resulting in 41 studies in the final review. Timeframe is 2019-2024.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3">
<label>3</label>
<title>Synthesis of results</title>
<p>We illustrate the cloth unfolding and folding manipulation process in <xref ref-type="fig" rid="F2">Figure 2</xref>. In this section, we synthesize the eligible papers by examining various important aspects during the manipulation process. These aspects include task contents, datasets, manipulation platforms, primitive actions, performance metrics, failure modes, and learning methods.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overview of robotic cloth unfolding and folding manipulation process. The robotic manipulation platform is built with robot arms, a top-view camera for main observation, and an optional front-view camera. <bold>(a)</bold> During manipulation, the main top-view camera observes the current state of the cloth to provide the RGB or depth image. <bold>(b)</bold> After inputting the image into the DL algorithm, it perceives the scene and plans the manipulation action. <bold>(c)</bold> The robot executes the planned manipulation with primitive actions. <bold>(d)</bold> The cloth is manipulated into a new state. The process will continue iteratively until the cloth meets the termination criteria and evaluates according to performance metrics.</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a robotic manipulation process. Part a shows the observation of a cloth via RGB and depth cameras. Part b depicts a deep-learning based manipulation method. Part c shows robotic arms performing an action on the cloth. Part d displays the cloth after manipulation with performance metrics indicated.</alt-text>
</graphic>
</fig>
<sec id="s3-1">
<label>3.1</label>
<title>Task contents</title>
<p>Cloth unfolding and folding are closely related processes, with unfolding often serving as a preliminary step for folding. Although they are sometimes treated as distinct tasks, they are closely connected. <xref ref-type="table" rid="T1">Table 1</xref> summarizes the distribution of tasks and manipulated object types in the 41 reviewed papers. Overall, most studies focus on a single task and primarily use small towels or napkins, while only a minority address large cloths or more complex garments.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Summary of task contents in the 41 reviewed papers.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Category</th>
<th align="center">Count (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="2" align="left">Task type</td>
</tr>
<tr>
<td align="left">&#x2003;Folding/unfolding only</td>
<td align="center">29 (70.7%)</td>
</tr>
<tr>
<td align="left">&#x2003;Both unfolding and folding</td>
<td align="center">12 (29.3%)</td>
</tr>
<tr>
<td colspan="2" align="left">Manipulated object type</td>
</tr>
<tr>
<td align="left">&#x2003;Towels/napkins (within reach)</td>
<td align="center">22 (53.7%)</td>
</tr>
<tr>
<td align="left">&#x2003;Garments (T-shirts, skirts, trousers)</td>
<td align="center">12 (29.3%)</td>
</tr>
<tr>
<td align="left">&#x2003;Both towels and garments</td>
<td align="center">4 (9.8%)</td>
</tr>
<tr>
<td align="left">&#x2003;Large cloths (beyond reach)</td>
<td align="center">3 (7.3%)</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Datasets</title>
<p>Data plays a crucial role in the development of DL-based methods (<xref ref-type="bibr" rid="B61">Tampuu et al., 2020</xref>) for cloth unfolding and folding. Due to the complexity of robotic manipulation of deformable objects (<xref ref-type="bibr" rid="B81">Zhu et al., 2022</xref>), which includes various cloth configurations, interactions with physical environments, diverse robot capabilities and morphologies, and different hardware setups across labs, there is currently no universally recognized public dataset in this field. Consequently, each of the 41 eligible papers in this review has collected its own dataset. Dataset collection was primarily conducted through two strategies: simulation and real-world experiments.</p>
<sec id="s3-2-1">
<label>3.2.1</label>
<title>Types of collected data</title>
<p>Due to differences in learning paradigms, the types of data collected vary across studies. Visual information, such as RGB images, depth images, and grayscale images, is collected in all selected papers. In addition to visual observations, other data modalities are also used to provide richer supervision. These include keypoint annotations (<xref ref-type="bibr" rid="B57">Seita et al., 2019</xref>; <xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>) for keypoint perception, action-related data such as pick and place points (<xref ref-type="bibr" rid="B67">Tsurumine et al., 2019</xref>), pick-and-pull directions (<xref ref-type="bibr" rid="B58">Seita et al., 2020</xref>; <xref ref-type="bibr" rid="B26">Hoque et al., 2022a</xref>), and manipulation trajectories (<xref ref-type="bibr" rid="B12">Chen et al., 2022</xref>) for supervising control policy learning, as well as manipulation stage or phase annotations (<xref ref-type="bibr" rid="B48">Mo et al., 2022</xref>; <xref ref-type="bibr" rid="B68">Wang et al., 2022</xref>) to support temporal decomposition of manipulation processes. Another category of collected data is cloth state representations, such as cloth particle poses (<xref ref-type="bibr" rid="B69">Weng et al., 2022</xref>) and cloth mesh representations (<xref ref-type="bibr" rid="B62">Tanaka et al., 2021</xref>; <xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>), which allow explicit representations of cloth dynamics and deformation. A single study may collect multiple types of data depending on its learning formulation.</p>
</sec>
<sec id="s3-2-2">
<label>3.2.2</label>
<title>Dataset collection from simulation</title>
<p>Collecting datasets or training in simulation is a widely used strategy for learning-based robotic cloth manipulation (65.9%, 27 out of 41). The most commonly used simulators in this field include gym-based environments, such as SoftGym (<xref ref-type="bibr" rid="B42">Lin et al., 2021</xref>), which accounts for more than half of the simulation studies, and FEM-based simulators integrated with Gym (<xref ref-type="bibr" rid="B58">Seita et al., 2020</xref>), as well as Blender, MuJoCo, and others. SoftGym (<xref ref-type="bibr" rid="B42">Lin et al., 2021</xref>), built on the PyFleX (<xref ref-type="bibr" rid="B40">Li et al., 2019</xref>) bindings to NVIDIA FleX, can load various garment meshes including T-shirts, trousers, and dresses. However, its current version does not support loading robot models due to NVIDIA&#x2019;s permission constraints, limiting its use for training real robots that rely on Cartesian control. FEM-based fabric simulators interfaced with OpenAI Gym provide another option, though the authors acknowledge that these simulators exhibit lower physical fidelity compared to other engines. <xref ref-type="bibr" rid="B25">Hietala et al. (2022)</xref> instead used MuJoCo, which supports loading robot URDF models. All simulation datasets mentioned above were collected entirely within simulated environments. <xref ref-type="bibr" rid="B75">Xue et al. (2023)</xref> employed RFUniverse (<xref ref-type="bibr" rid="B16">Fu H. et al., 2023</xref>), which enables the acquisition of cloth-environment interaction data through a Virtual Reality (VR) setup, thereby enhancing interactive capabilities from the real world to the simulated environment.</p>
<p>While simulation facilitates the development of DL-based methods, the gap between simulation and the real world remains a challenge in robotic cloth manipulation. This gap mainly comes from inaccurate modeling of cloth properties, limited visual diversity, dynamics mismatch, and unmodeled interactions, etc. Therefore, addressing the Sim2Real gap is an important consideration (<xref ref-type="bibr" rid="B13">Collins et al., 2019</xref>; <xref ref-type="bibr" rid="B47">Matas et al., 2018</xref>). The strategies include Domain Randomization (DR), Data Augmentation (DA), depth or point-cloud observations, fine-tuning with real-world data, texture replacement, and training under real settings, and others. <xref ref-type="table" rid="T2">Table 2</xref> summarizes the primary Sim2Real gaps encountered in cloth manipulation and the corresponding strategies used to mitigate them.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Summary of Sim2Real gaps in robotic cloth manipulation and the corresponding mitigation strategies.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Primary Sim2Real gap</th>
<th align="left">Sim2Real strategy used</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Inaccurate or oversimplified modeling of cloth physical properties and appearance variability across real garments. (<xref ref-type="bibr" rid="B25">Hietala et al., 2022</xref>; <xref ref-type="bibr" rid="B5">Blanco-Mulero et al., 2023</xref>; <xref ref-type="bibr" rid="B58">Seita et al., 2020</xref>)</td>
<td align="left">DR: Randomizing cloth stiffness, mass, size, color, shading, lighting, background, and camera pose to improve robustness across fabric types</td>
</tr>
<tr>
<td align="left">Limited visual diversity and viewpoint mismatch between simulated and real cloth observations. (<xref ref-type="bibr" rid="B62">Tanaka et al., 2021</xref>)</td>
<td align="left">DA: Applying transformations such as cropping, rotation, flipping, and noise injection to enhance robustness and generalization</td>
</tr>
<tr>
<td align="left">Photometric inconsistencies caused by lighting, texture, wrinkles, and shading in RGB images. (<xref ref-type="bibr" rid="B69">Weng et al., 2022</xref>; <xref ref-type="bibr" rid="B48">Mo et al., 2022</xref>; <xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>)</td>
<td align="left">Depth/point clouds: Using depth maps or point-cloud observations to reduce the photometric gap between simulation and reality</td>
</tr>
<tr>
<td align="left">Residual dynamics mismatch and unmodeled interactions (e.g., friction, contact, grasping errors) after simulation pre-training. (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>)</td>
<td align="left">Fine-tuning: Training policies in simulation and then fine-tuning them with real-world data</td>
</tr>
<tr>
<td align="left">Visual domain gap arising from complex real-world textures and background clutter. (<xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>)</td>
<td align="left">Texture replacement: Replacing cloth and background textures in real images with simulation-like uniform colors</td>
</tr>
<tr>
<td align="left">Mismatch between simulated controllers, sensors, or object properties and real robotic hardware. (<xref ref-type="bibr" rid="B25">Hietala et al., 2022</xref>; <xref ref-type="bibr" rid="B68">Wang et al., 2022</xref>)</td>
<td align="left">Training with real settings: Incorporating real hardware characteristics or real-object textures directly into the simulation environment</td>
</tr>
<tr>
<td align="left">Others: Task-specific data or model dependency, action and reward modeling gap. (<xref ref-type="bibr" rid="B20">Ganapathi et al., 2021</xref>; <xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>; <xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>)</td>
<td align="left">Geometric structure from visual representations, simplified action models, or specialized reward designs to facilitate Sim2Real transfer</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-2-3">
<label>3.2.3</label>
<title>Dataset collection from real world</title>
<p>Collecting datasets or training in the real world presents challenges but offers significant benefits. It avoids the Sim2Real problem and generally results in better generalization compared to simulation-based training. However, the considerable workload of human annotation and the wear and tear on robots are notable factors. To reduce the labeling workload, <xref ref-type="bibr" rid="B57">Seita et al. (2019)</xref> proposed a color-based keypoint annotation strategy that enables automatic extraction of cloth corners from color-marked visual observations for depth images. Later, <xref ref-type="bibr" rid="B17">Fu T. et al. (2023)</xref> adopted a similar color-labeling strategy for dataset collection and segmentation training, using different paint colors for cloth corners and edges. However, the training dataset collected using this approach includes only depth images, which consist of a single depth channel and contain no color cues. <xref ref-type="bibr" rid="B58">Seita et al. (2020)</xref> reported that the color contrast between the fabric and the workspace in RGB images can facilitate better performance. Furthermore, depth sensing requires dedicated hardware. Therefore, <xref ref-type="bibr" rid="B64">Thananjeyan et al. (2022)</xref> addressed this by introducing a UV-based labeling technique for deformable objects in RGB images, referred to as Labels from UltraViolet (LUV). Transparent UV fluorescent paint is invisible under standard light but detectable under UV light. Similarly, <xref ref-type="bibr" rid="B21">Gu et al. (2024)</xref> adopted this method during their fine-tuning process.</p>
<p>The color-labeling methods mentioned above can efficiently provide keypoints for cloth manipulation. However, they may be inadequate in certain scenarios, particularly when RL or imitation learning is employed. Consequently, other data collection methods have been explored. For example, <xref ref-type="bibr" rid="B27">Hoque et al. (2022b)</xref> developed open-source software that enables humans to remotely control a robot for interacting with cloth and collecting demonstrations. <xref ref-type="bibr" rid="B1">Avigal et al. (2022)</xref> first annotated images with primitive actions and corresponding gripper positions and then trained a neural network to iteratively collect self-supervised data. <xref ref-type="bibr" rid="B37">Lee et al. (2021)</xref> required only 1 hour of random interactions with the cloth to develop their offline RL approach, which effectively handles complex sequential cloth folding (see <xref ref-type="fig" rid="F3">Figure 3</xref>). In the new work of <xref ref-type="bibr" rid="B38">Lee et al. (2024)</xref>, they further explored dataset collection by tracking the movement in human manipulation videos, making the dataset collection process more efficient and simpler.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Collecting an hour of experience dataset on the real robot (PMLR image credit <xref ref-type="bibr" rid="B37">Lee et al., 2021</xref>, licensed under CC BY).</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g003.tif">
<alt-text content-type="machine-generated">A robotic arm interacting with fabric on a table, labeled &#x22;1 hour of random interaction,&#x22; is shown on the left. On the right, there are multiple images of folded cloth, labeled &#x22;Offline training,&#x22; connected by an arrow.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Manipulation platforms</title>
<p>Robotic cloth manipulation platforms typically consist of two key components: the manipulation system and the vision system. The robot executes physical interactions with the cloth, whereas the vision module provides the perceptual observations required for state estimation and policy learning. This section summarizes the platforms adopted across the reviewed studies.</p>
<sec id="s3-3-1">
<label>3.3.1</label>
<title>Robot types</title>
<p>Most works employ single-arm manipulation. Specifically, 65.9% (27 out of 41) of the reviewed papers use a single robotic arm, while 31.7% (13 out of 41) adopt a dual-arm system. Only one study (<xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>) utilizes a three-arm setup. Regarding robot brands, the Universal Robots series (particularly UR5) is the most frequently used in real-world experiments, followed by the Franka Emika Panda arm. For dual-arm settings, existing solutions either (i) combine two independent single-arm robots into a coordinated dual-arm system (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>; <xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>) or (ii) rely on dedicated dual-arm robot platforms such as ABB YuMi (<xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>), Kawada HIRO (<xref ref-type="bibr" rid="B62">Tanaka et al., 2021</xref>), or PR2 (<xref ref-type="bibr" rid="B71">Wu Y. et al., 2020</xref>).</p>
</sec>
<sec id="s3-3-2">
<label>3.3.2</label>
<title>Vision sensors</title>
<p>The vision observation types include RGB, depth, RGB-D, grayscale, and point cloud images. We systematically analyze these inputs, which are used for training or inference, as shown in <xref ref-type="table" rid="T3">Table 3</xref>. RGB information is the most commonly used observation. <xref ref-type="bibr" rid="B26">Hoque et al. (2022a)</xref> compared the results of RGB, RGB-D, and depth, and suggested that RGB-D provides the best performance for their visual foresight task. <xref ref-type="bibr" rid="B43">Lin et al. (2022)</xref> demonstrated that RGB-related information is sensitive to camera views and visual features, which also poses challenges in the Sim2Real transfer. Later, <xref ref-type="bibr" rid="B69">Weng et al. (2022)</xref> utilized depth images as their policy input because they found that using depth images or point clouds as the training dataset could minimize the gap between simulation and reality.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Observation modalities used.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Observation type</th>
<th align="center">Count (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">RGB</td>
<td align="center">22 (53.7%)</td>
</tr>
<tr>
<td align="left">RGB-D</td>
<td align="center">6 (14.6%)</td>
</tr>
<tr>
<td align="left">Depth/point cloud</td>
<td align="center">11 (26.8%)</td>
</tr>
<tr>
<td align="left">Gray</td>
<td align="center">1 (2.4%)</td>
</tr>
<tr>
<td align="left">Depth and gray</td>
<td align="center">1 (2.4%)</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Primitive actions</title>
<p>The primitive actions used in cloth unfolding and folding manipulation include quasi-static pick-and-place (P&#x26;P), dynamic fling action, drag and mop, and air blowing. The manipulations described in the papers either use one or a combination of these four primitive actions.</p>
<sec id="s3-4-1">
<label>3.4.1</label>
<title>Pick and place action</title>
<p>The P&#x26;P configuration involves selecting a pick coordinate and a place coordinate. Initially, a robot grasps the cloth at the pick coordinate and lifts it to a certain height. The robot then moves above the place coordinate and finally places the cloth down and releases its grip. However, the P&#x26;P primitive action itself has evolved over time. For example, <xref ref-type="bibr" rid="B26">Hoque et al. (2022a)</xref> provide pixel coordinates for the pick and place points, which the robot then uses to execute the action. <xref ref-type="bibr" rid="B32">Kase et al. (2022)</xref> decomposed and labeled the P&#x26;P action into finer phases: approach, grasp, pull, fold, release, and standby, to facilitate their cloth manipulation task. <xref ref-type="bibr" rid="B1">Avigal et al. (2022)</xref> incorporated the grasp angle into their manipulation process, estimating a pixel-wise value map for each gripper z-axis rotation to enhance the reliability of their P&#x26;P action. To achieve better performance, <xref ref-type="bibr" rid="B5">Blanco-Mulero et al. (2023)</xref> proposed a policy that optimizes parameters such as motion velocity and height within the P&#x26;P primitive action. <xref ref-type="bibr" rid="B25">Hietala et al. (2022)</xref> demonstrated that closed-loop feedback with parameterized P&#x26;P primitives significantly enhances adaptability in cloth manipulation, showing promise for more general and adaptive skills. Although the P&#x26;P action has been successfully utilized in cloth manipulation, it is relatively slow and constrained by the robot&#x2019;s workspace.</p>
</sec>
<sec id="s3-4-2">
<label>3.4.2</label>
<title>Dynamic fling action</title>
<p>To overcome the drawbacks of P&#x26;P actions, <xref ref-type="bibr" rid="B23">Ha and Song (2022)</xref> proposed a dynamic fling primitive that leverages object momentum for efficient unfolding. Their approach learns grasp locations while relying on predefined motion parameters, which may limit robustness across different cloth geometries or sizes. To address this issue, <xref ref-type="bibr" rid="B21">Gu et al. (2024)</xref> correlated the lift height with cloth size and adjusted the fling speed based on the height, which improved performance, as illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>. <xref ref-type="bibr" rid="B12">Chen et al. (2022)</xref> further focused on learning fling action trajectories for garment unfolding using one robot arm, rather than a fixed fling trajectory.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Process of flinging the garment to expand the coverage (Reprinted with permission from Gu et al., 2024, copyright 2024 IEEE). <bold>(a)</bold> Grasp <bold>(b)</bold> Lift <bold>(c)</bold> Stretch <bold>(d)</bold> Fling.</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g004.tif">
<alt-text content-type="machine-generated">Four images demonstrate robotic arms manipulating a blue cloth. a) Robotic arms positioned near the cloth on a table. b) Arms lifting the cloth with height indicated. c) Cloth held mid-air, evenly stretched. d) Arm moves cloth horizontally with indicated speed.</alt-text>
</graphic>
</fig>
<p>Although the fling action can significantly expand the coverage, it is a coarse-grained manipulation method and is insufficient for fine-grained manipulation like P&#x26;P. Therefore, <xref ref-type="bibr" rid="B7">Canberk et al. (2023)</xref> strategically choose between the fling and P&#x26;P actions to efficiently and precisely unfold the garment. Their policy first utilizes the fling action to bring the cloth to a considerably unfolded state, then employs the P&#x26;P action to further expand it.</p>
</sec>
<sec id="s3-4-3">
<label>3.4.3</label>
<title>Drag and mop action</title>
<p>During cloth manipulation, predicted manipulation points may be outside the robot&#x2019;s reachable workspace or correspond to difficult-to-handle cloth configurations when using previous primitive actions. Therefore, <xref ref-type="bibr" rid="B1">Avigal et al. (2022)</xref> and <xref ref-type="bibr" rid="B75">Xue et al. (2023)</xref> introduce a drag action in the unfolding stage. This action involves two robots simultaneously dragging the cloth away from its center. By exploiting the friction between the cloth and the supporting surface, the drag action helps smooth out corners and wrinkles, such as sleeves trapped underneath the garment, and can also be used to reposition the cloth. For the folding stage, <xref ref-type="bibr" rid="B75">Xue et al. (2023)</xref> further introduce a related action primitive mop. Similar to drag, mop is used to adjust the cloth position when the predicted grasp or placement points are unreachable during folding. Another form of dragging is described by <xref ref-type="bibr" rid="B24">He et al. (2023)</xref>, in which two robotic arms are used asymmetrically: one arm grasps the cloth and remains stationary, while the other drags the cloth away by a predefined distance. This strategy is particularly effective for handling long sleeves that are covered by or folded inside a T-shirt.</p>
</sec>
<sec id="s3-4-4">
<label>3.4.4</label>
<title>Air blow action</title>
<p>The primitive actions described above manipulate the cloth either through sparse contact or by relying on high-speed robot motion. From the perspectives of dense force application and safety, <xref ref-type="bibr" rid="B74">Xu et al. (2022)</xref> proposed an air blow primitive action. In this approach, two robot arms grasp two points on the cloth, while a third arm operates a blower to apply air, expanding and unfolding the cloth. This action allows for the application of dense air forces on areas not in direct contact with the robot, thereby extending the system&#x2019;s reach and enabling safe, high-speed interactions.</p>
</sec>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Performance metrics</title>
<p>For the unfolding task, almost all of the eligible papers use cloth coverage as their primary performance metric. The second most common metric is the number of action steps, which evaluates the efficiency of the unfolding policy. Time-related metrics, such as execution time and the time taken to determine actions, are also considered. Additionally, metrics like reward after actions (<xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>), cloth orientation (<xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>), and MIoU (<xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>) are used to evaluate unfolding performance.</p>
<p>In the folding task, the most commonly used metric is the folding success rate. However, the criteria for evaluating success vary. Some studies (<xref ref-type="bibr" rid="B62">Tanaka et al., 2021</xref>; <xref ref-type="bibr" rid="B69">Weng et al., 2022</xref>) use MIoU as their success rate metric. <xref ref-type="bibr" rid="B37">Lee et al. (2021)</xref> argue that the self-occluding, deformable nature of the cloth and the difficulty of observing a 3D state in a 2D image make it challenging to apply a quantitative MIoU metric. They introduced &#x201c;visually consistent with the target image&#x201d; as their success rate metric. Conversely, some papers (<xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>; <xref ref-type="bibr" rid="B26">Hoque et al., 2022a</xref>) evaluate performance using the cloth particle distance between the goal and the result in the simulation, although this criterion cannot be used in the real world. Additionally, metrics such as inference and execution time (<xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>) are considered to compare efficiency. Wrinkle penalties (<xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>) and normalized metrics (<xref ref-type="bibr" rid="B9">Chen and Rojas, 2024</xref>) are also used.</p>
</sec>
<sec id="s3-6">
<label>3.6</label>
<title>Failure modes</title>
<p>In cloth unfolding and folding tasks, frequent failure modes, in addition to common motion planning errors (<xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>), include issues such as failing to grasp (<xref ref-type="bibr" rid="B5">Blanco-Mulero et al., 2023</xref>), incorrect numbers of grasped cloth layers (<xref ref-type="bibr" rid="B17">Fu T. et al., 2023</xref>; <xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>; <xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>; <xref ref-type="bibr" rid="B20">Ganapathi et al., 2021</xref>), losing grip (<xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>), and failed releases (<xref ref-type="bibr" rid="B59">Shehawy et al., 2023</xref>). Among these, multi-layer grasping is particularly influential as a failure mode. Other issues include inaccurate predictions (<xref ref-type="bibr" rid="B68">Wang et al., 2022</xref>) and the gap between simulation and reality.</p>
</sec>
<sec id="s3-7">
<label>3.7</label>
<title>Learning and control paradigms for cloth manipulation</title>
<p>Prior survey work (<xref ref-type="bibr" rid="B49">Nocentini et al., 2022</xref>) categorizes learning-based cloth manipulation methods according to supervision type: supervised learning (SL), unsupervised learning (USL), reinforcement learning (RL), and imitation learning (IL). However, this perspective is too general as a machine-learning taxonomy and too coarse to reveal the algorithmic structures underlying cloth manipulation tasks.</p>
<p>To better characterize how existing approaches perceive, represent, and act in cloth manipulation, we reorganize the eligible papers into six learning-and-control paradigms (<xref ref-type="table" rid="T4">Table 4</xref>) that more directly reflect their underlying design principles and highlight differences in perception requirements, control structures, and generalization strategies across cloth unfolding and folding tasks, offering a more fine-grained view. <xref ref-type="fig" rid="F5">Figure 5</xref> provides a conceptual overview of the learning and control paradigms within the overall robotic cloth unfolding and folding workflow discussed in this review. <list list-type="bullet">
<list-item>
<p>Perception-Guided Heuristic Methods: Vision networks (e.g., keypoint detectors or segmentation models) predict regions or contours (e.g., corners, masks, keypoints), which then feed into hand-crafted unfolding or folding routines.</p>
</list-item>
<list-item>
<p>Goal-Conditioned Manipulation Policies: Policies that take the current observation and a desired goal configuration as input and output manipulation actions to reach the goal.</p>
</list-item>
<list-item>
<p>Predictive and Model-Based State Representation Methods: Approaches that learn explicit cloth dynamics, latent state representations, or visuospatial models and use them for planning or control.</p>
</list-item>
<list-item>
<p>Reward-Driven Reinforcement Learning over Primitive Actions: RL methods that optimize value or policy functions over discrete or continuous manipulation primitives using task-specific reward signals.</p>
</list-item>
<list-item>
<p>Demonstration-Driven Skill Transfer Methods: Methods that learn manipulation policies primarily from expert demonstrations (simulation rollouts, robot teleoperation data, or human videos) and adapt them to the current situation.</p>
</list-item>
<list-item>
<p>Large Language Model-Based Planning Methods: Approaches that leverage large language models (LLMs) or vision-language models (VLMs) to extract high-level semantic information from textual descriptions or visual observations, propose manipulation primitives, infer sub-goals, and generate high-level action plans.</p>
</list-item>
</list>
</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Statistics of the six learning-and-control paradigms across cloth unfolding and folding tasks.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Paradigm</th>
<th align="left">Unfold (Count &#x2b; refs.)</th>
<th align="left">Fold (Count &#x2b; refs.)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Perception-H</td>
<td align="left">2: (<xref ref-type="bibr" rid="B57">Seita et al., 2019</xref>; <xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>)</td>
<td align="left">3: (<xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>; <xref ref-type="bibr" rid="B24">He et al., 2023</xref>)</td>
</tr>
<tr>
<td align="left">Goal-cond</td>
<td align="left">0: &#x2013;</td>
<td align="left">3: (<xref ref-type="bibr" rid="B62">Tanaka et al., 2021</xref>; <xref ref-type="bibr" rid="B69">Weng et al., 2022</xref>; <xref ref-type="bibr" rid="B48">Mo et al., 2022</xref>)</td>
</tr>
<tr>
<td align="left">Predict.-model</td>
<td align="left">7: (<xref ref-type="bibr" rid="B20">Ganapathi et al., 2021</xref>; <xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>; <xref ref-type="bibr" rid="B26">Hoque et al., 2022a</xref>; <xref ref-type="bibr" rid="B43">Lin et al., 2022</xref>; <xref ref-type="bibr" rid="B14">Deng et al., 2023</xref>; <xref ref-type="bibr" rid="B31">Kadi and Terzi&#x107;, 2024</xref>; <xref ref-type="bibr" rid="B73">Wu et al., 2024</xref>)</td>
<td align="left">8: (<xref ref-type="bibr" rid="B20">Ganapathi et al., 2021</xref>; <xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>; <xref ref-type="bibr" rid="B26">Hoque et al., 2022a</xref>; <xref ref-type="bibr" rid="B8">Cao et al., 2023</xref>; <xref ref-type="bibr" rid="B14">Deng et al., 2023</xref>; <xref ref-type="bibr" rid="B79">Zhou et al., 2024</xref>; <xref ref-type="bibr" rid="B44">Longhini et al., 2024</xref>; <xref ref-type="bibr" rid="B73">Wu et al., 2024</xref>)</td>
</tr>
<tr>
<td align="left">Reward-driven</td>
<td align="left">11: (<xref ref-type="bibr" rid="B67">Tsurumine et al., 2019</xref>; <xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>; <xref ref-type="bibr" rid="B12">Chen et al., 2022</xref>; <xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>; <xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>; <xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>; <xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>; <xref ref-type="bibr" rid="B5">Blanco-Mulero et al., 2023</xref>; <xref ref-type="bibr" rid="B59">Shehawy et al., 2023</xref>; <xref ref-type="bibr" rid="B24">He et al., 2023</xref>)</td>
<td align="left">6: (<xref ref-type="bibr" rid="B71">Wu et al., 2020b</xref>; <xref ref-type="bibr" rid="B37">Lee et al., 2021</xref>; <xref ref-type="bibr" rid="B56">Salhotra et al., 2022</xref>; <xref ref-type="bibr" rid="B25">Hietala et al., 2022</xref>; <xref ref-type="bibr" rid="B59">Shehawy et al., 2023</xref>; <xref ref-type="bibr" rid="B9">Chen and Rojas, 2024</xref>)</td>
</tr>
<tr>
<td align="left">Demo.-driven</td>
<td align="left">7: (<xref ref-type="bibr" rid="B58">Seita et al., 2020</xref>; <xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>; <xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>; <xref ref-type="bibr" rid="B17">Fu et al., 2023b</xref>; <xref ref-type="bibr" rid="B38">Lee et al., 2024</xref>; <xref ref-type="bibr" rid="B19">Galassi et al., 2024</xref>; <xref ref-type="bibr" rid="B76">Yang et al., 2024</xref>)</td>
<td align="left">6: (<xref ref-type="bibr" rid="B68">Wang et al., 2022</xref>; <xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>; <xref ref-type="bibr" rid="B32">Kase et al., 2022</xref>; <xref ref-type="bibr" rid="B66">Tsurumine and Matsubara, 2022</xref>; <xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>; <xref ref-type="bibr" rid="B38">Lee et al., 2024</xref>)</td>
</tr>
<tr>
<td align="left">LLM-based</td>
<td align="left">2: (<xref ref-type="bibr" rid="B18">Fu et al., 2024</xref>; <xref ref-type="bibr" rid="B52">Raval et al., 2024</xref>)</td>
<td align="left">1: (<xref ref-type="bibr" rid="B52">Raval et al., 2024</xref>)</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Conceptual overview diagram of the six learning and control paradigms in the robotic cloth unfolding and folding pipeline.</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a robotic system for cloth unfolding and folding. Key components include Task Content, Robot and Sensing Platform, and Learning and Control Paradigms. Paradigms involve Perception-Guided Heuristics, Goal-Conditioned Policies, Predictive and Model-Based Methods, Reward-Driven Reinforcement Learning, Demonstration-Driven Skill Transfer, and LLM-Based Planning Methods. These involve perception, action prediction, planning, value learning, and skill transfer. Outputs lead to Primitive Actions Execution, which are then evaluated by Performance Metrics and Failure Modes.</alt-text>
</graphic>
</fig>
<sec id="s3-7-1">
<label>3.7.1</label>
<title>Perception-guided heuristic methods</title>
<p>Early work on cloth manipulation often relies on explicit perception outputs, such as corners, edges, or segmentation masks, which are then mapped to hand-designed manipulation routines to realize simple folding or unfolding. <xref ref-type="bibr" rid="B57">Seita et al. (2019)</xref> used YOLO (<xref ref-type="bibr" rid="B53">Redmon et al., 2016</xref>) to detect blanket corners from depth images. The detected keypoints are passed to a keypoint-based heuristic controller: the robot grasps these points and pulls the blanket to increase coverage. Real-world experiments with two mobile manipulators and three blankets demonstrated strong generalization of this perception-guided heuristic pipeline. Several works adopt a similar structure for folding tasks, as illustrated in <xref ref-type="fig" rid="F6">Figure 6</xref>. <xref ref-type="bibr" rid="B7">Canberk et al. (2023)</xref>; <xref ref-type="bibr" rid="B24">He et al. (2023)</xref>; <xref ref-type="bibr" rid="B21">Gu et al. (2024)</xref> use segmentation networks such as U-Net (<xref ref-type="bibr" rid="B54">Ronneberger et al., 2015</xref>), DeeplabV3 (<xref ref-type="bibr" rid="B10">Chen et al., 2017</xref>), and DeeplabV3&#x2b; (<xref ref-type="bibr" rid="B11">Chen et al., 2018</xref>) to localize keypoints or regions on flattened garments to fold the cloth.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Keypoint-based heuristic folding method (arXiv image credit: <xref ref-type="bibr" rid="B6">Canberk et al. (2022)</xref>, licensed under CC BY).</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g006.tif">
<alt-text content-type="machine-generated">Three-step sequence for folding a garment. Step 1: Cloth with colored markers and red arrows indicating folding directions. Step 2: Some markers remain, showing further folding with arrows. Step 3: Garment is neatly folded with a green checkmark.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-7-2">
<label>3.7.2</label>
<title>Goal-conditioned manipulation policies</title>
<p>While perception-guided heuristics are effective for predefined routines, they lack flexibility when target configurations vary. Goal-conditioned manipulation policies address this limitation by conditioning actions on both the current cloth state and a desired goal state, such that the predicted actions transform the cloth toward the goal. <xref ref-type="bibr" rid="B69">Weng et al. (2022)</xref> propose FabricFlowNet (FFN), a dual-arm goal-conditioned policy for cloth folding that leverages optical-flow prediction, (<xref ref-type="fig" rid="F7">Figure 7</xref>). Instead of predicting actions directly from the current and goal images, FFN decomposes the policy into two components: a FlowNet that estimates particle flow between the current observation and the goal, and a PickNet that predicts P&#x26;P points from the estimated flow image. <xref ref-type="bibr" rid="B48">Mo et al. (2022)</xref> introduce Foldsformer, which incorporates space-time attention (<xref ref-type="bibr" rid="B3">Bertasius et al., 2021</xref>) into a folding planner. Given the current cloth image and a sequence of demonstration images, the model outputs a sequence of multi-step action points. This design balances speed and accuracy while capturing manipulation points and their ordering, even when cloth pose and size differ from those in the demonstration.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>FabricFlowNet, a type of Goal-Conditioned Manipulation Policy (PMLR image credit: <xref ref-type="bibr" rid="B69">Weng et al. (2022)</xref>, licensed under CC BY).</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g007.tif">
<alt-text content-type="machine-generated">Diagram illustrating a process flow from observation and goal images through a &#x22;Flow&#x22; function, resulting in a flow image. The flow image is processed by &#x22;PickNet&#x22; to determine pick points and place points, indicated with yellow arrows.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-7-3">
<label>3.7.3</label>
<title>Predictive and model-based state representation methods</title>
<p>Goal-conditioned policies typically react to the current observation and goal, but do not explicitly model cloth dynamics or future evolution. Predictive and model-based methods aim to address this limitation by learning latent state representations or forward models of cloth behavior for planning and control. <xref ref-type="bibr" rid="B26">Hoque et al. (2022a)</xref> develop the VisuoSpatial Foresight (VSF) policy, trained on self-supervised simulated cloth manipulation data. VSF is built on Stochastic Variational Video Prediction (SV2P) (<xref ref-type="bibr" rid="B2">Babaeizadeh et al., 2018</xref>), an action-conditioned latent-variable video prediction model. At test time, the model receives the current and goal cloth states and predicts intermediate frames together with P&#x26;P coordinates, providing a visuospatial predictive model that can be used for planning. <xref ref-type="bibr" rid="B45">Ma et al. (2022)</xref> argue that human-defined labeled keypoints do not generalize well to unseen cloth configurations. They therefore use Transporter Networks (<xref ref-type="bibr" rid="B35">Kulkarni et al., 2019</xref>) to extract features and detect keypoints in an unsupervised manner from depth images. The detected keypoints are composed into a graph, and graph neural networks (GNNs) and recurrent networks are then used to model cloth dynamics in this learned space. <xref ref-type="bibr" rid="B20">Ganapathi et al. (2021)</xref> learn dense visual correspondences between different cloth configurations by training a Siamese network on pairs of cloth images, thereby capturing the underlying geometric structure. The learned correspondence field is used to transfer manipulation actions from a reference demonstration to new cloth states and has shown promising Sim2Real performance. <xref ref-type="bibr" rid="B43">Lin et al. (2022)</xref> propose a model-based RL approach in which a particle-based cloth dynamics model is learned from partial point clouds. 
A GNN models visible connectivity by operating on voxelized point clouds <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and inferred edges <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the learned dynamics are then used to train a P&#x26;P manipulation policy.</p>
</sec>
<sec id="s3-7-4">
<label>3.7.4</label>
<title>Reward-driven reinforcement learning over primitive actions</title>
<p>Rather than explicitly modeling cloth dynamics, reward-driven RL acquires manipulation strategies by optimizing task-specific reward functions through trial-and-error interaction. This paradigm learns the parameters of primitive actions (e.g., pick, drag, or fling) autonomously, but typically requires careful reward design and large amounts of interaction data. <xref ref-type="bibr" rid="B71">Wu Y. et al. (2020)</xref> introduce an RL framework for P&#x26;P cloth unfolding in which the placing policy is learned conditioned on random pick points. The final pick location is then chosen as the point with maximal value under the learned placing policy (the maximum-value-under-placing, MVP, strategy), leading to faster learning than jointly learning pick and place. <xref ref-type="bibr" rid="B23">Ha and Song (2022)</xref> employ Spatial Action Maps (<xref ref-type="bibr" rid="B70">Wu J. et al., 2020</xref>) for dynamic cloth manipulation. Their method evaluates a batch of candidate fling actions by transforming the observation and predicting a batch of value maps (<xref ref-type="fig" rid="F8">Figure 8</xref>). The pixel with maximal value that also satisfies reachability constraints is selected, and its location and transformation are decoded into fling parameters. This value-map paradigm has been widely adopted in subsequent work on dynamic cloth manipulation, including <xref ref-type="bibr" rid="B24">He et al. (2023)</xref>, <xref ref-type="bibr" rid="B7">Canberk et al. (2023)</xref>, <xref ref-type="bibr" rid="B21">Gu et al. (2024)</xref>, and <xref ref-type="bibr" rid="B74">Xu et al. (2022)</xref>. <xref ref-type="bibr" rid="B74">Xu et al. (2022)</xref> improve the fling grasping strategy of <xref ref-type="bibr" rid="B23">Ha and Song (2022)</xref> by introducing edge-coincident grasp parameterizations to boost performance. <xref ref-type="bibr" rid="B7">Canberk et al. 
(2023)</xref> propose a factorized value-prediction model with two Spatial Action Maps networks (each with one encoder and two decoders) to generalize value maps over two primitive actions. <xref ref-type="bibr" rid="B21">Gu et al. (2024)</xref> apply this factorized policy to cloth unfolding and augment the value maps with an additional detection module. <xref ref-type="bibr" rid="B37">Lee et al. (2021)</xref> adopt an offline, batch-RL setting: a real robot first collects data via random actions, and a DQN is then trained on this fixed dataset, with DA used to improve robustness in low-data regimes. To scale reward-driven RL, <xref ref-type="bibr" rid="B23">Ha and Song (2022)</xref> further propose a self-supervised interaction framework in simulation: the robot interactively unfolds cloth, and the simulator computes coverage after each action. Episodes are reset once coverage reaches a threshold or an action limit is exceeded, eliminating the need for expert demonstrations or ground-truth state labels. <xref ref-type="bibr" rid="B1">Avigal et al. (2022)</xref> extend this idea to the real world by using a small set of human-labeled primitives and gripper poses for initial training, followed by large-scale self-supervised data collection.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Use of Spatial Action Maps (<xref ref-type="bibr" rid="B70">Wu J. et al., 2020</xref>) in dynamic cloth actions (PMLR image credit: <xref ref-type="bibr" rid="B23">Ha and Song (2022)</xref>, licensed under CC BY). <bold>(a)</bold> Workspace <bold>(b)</bold> Rotated and scaled images <bold>(c)</bold> Predicted value maps <bold>(d)</bold> Highest value <bold>(e)</bold> Reachability <bold>(f)</bold> Fling action.</p>
</caption>
<graphic xlink:href="frobt-13-1752914-g008.tif">
<alt-text content-type="machine-generated">Diagram illustrating a robotic system for manipulating a yellow fabric using different rotations and scales. Panel a shows two robotic arms assessing the fabric. Panel b displays twelve fabric images at various rotations and scales fed into a value network, depicted as a schematic with blocks. Panel c visualizes outputs as heatmaps, highlighting fabric features. Panel d zooms into a specific heatmap region. Panel e shows the reachability zone over the fabric. Panel f illustrates the final grip strategy with labeled points: L, C, R, and a 0.40-meter distance.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-7-5">
<label>3.7.5</label>
<title>Demonstration-driven skill transfer methods</title>
<p>Reward-driven RL can be sample-inefficient and sensitive to reward design, especially in real-world cloth manipulation. Demonstration-driven methods mitigate these challenges by leveraging expert demonstrations to provide more structured supervision. <xref ref-type="bibr" rid="B58">Seita et al. (2020)</xref> introduce behavior cloning (BC) for cloth unfolding. An Oracle supervisor generates unfolding demonstrations in simulation, and the policy is trained to imitate the supervisor from observed states. To improve robustness outside the demonstration distribution, they employ Dataset Aggregation (DAgger) (<xref ref-type="bibr" rid="B55">Ross et al., 2011</xref>), relabeling states visited under the learned policy, while DR over cloth appearance and camera poses supports Sim2Real transfer. <xref ref-type="bibr" rid="B17">Fu T. et al. (2023)</xref> propose a BC-based human-to-robot skill transfer framework for cloth unfolding. They decompose demonstrations into policy demonstrations (human-chosen P&#x26;P points) and action demonstrations (human manipulation trajectories), and use a mixture density network with parameter weighting to handle the multi-modal nature of unfolding behavior. The learned policy successfully unfolds cloth of various colors and sizes in the real world, with performance comparable to human operators. <xref ref-type="bibr" rid="B38">Lee et al. (2024)</xref> learn cloth manipulation actions directly from a small set of human videos (15 annotated demonstrations) to handle both unfolding and folding. A unified P&#x26;P policy is trained from these videos and deployed on a real robot, generalizing across fabrics with different shapes, colors, and textures. Other work, such as Goal-Aware GAIL (<xref ref-type="bibr" rid="B66">Tsurumine and Matsubara, 2022</xref>), explores adversarial imitation learning without hand-designed reward functions, but adversarial IL remains less common in current cloth manipulation studies.</p>
</sec>
<sec id="s3-7-6">
<label>3.7.6</label>
<title>LLM-based planning methods</title>
<p>While previous paradigms focus on learning low-level perception or control policies, they typically lack high-level semantic reasoning and task abstraction. LLM-based planning methods address this gap by leveraging large language or multimodal models to perform high-level decision making over manipulation primitives. <xref ref-type="bibr" rid="B18">Fu et al. (2024)</xref> introduce a large language model into cloth unfolding. They prompt ChatGPT with task requirements, a predefined taxonomy of cloth states, and corresponding operational primitives. The LLM then recommends which primitive to execute next. A segmentation network subsequently identifies manipulation points for the chosen primitive, combining LLM-based decision making with visual perception. <xref ref-type="bibr" rid="B52">Raval et al. (2024)</xref> first detect cloth corners using a perception module and convert them into structured representations, which, together with human instruction prompts, are provided to ChatGPT for high-level reasoning to determine P&#x26;P points for the robot. In addition, an unselected study, <xref ref-type="bibr" rid="B15">Deng et al. (2025)</xref>, leverages vision-language models to predict manipulation plans from visual observations and semantic keypoints, whereas earlier approaches rely solely on text-based inputs.</p>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>As detailed in the results section, we analyze the key factors related to learning-based approaches for cloth manipulation and summarize their applications across unfolding and folding tasks. In this discussion, we further examine the current state, challenges, and perspectives associated with these factors, as outlined in <xref ref-type="table" rid="T5">Table 5</xref>, and highlight the strengths, limitations, and future opportunities of contemporary learning paradigms in this domain.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Perspectives and opportunities for the factors that influence cloth unfolding and folding.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Keywords</th>
<th align="left">Current state/challenges</th>
<th align="left">Perspectives and opportunities</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Task contents</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Unfolding and folding are treated as separate tasks</p>
</list-item>
<list-item>
<p>&#x2022; Limited cloth types (mainly towels/garments)</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Integrate unfolding and folding into a unified end2end pipeline</p>
</list-item>
<list-item>
<p>&#x2022; Improve generalization across diverse cloth types</p>
</list-item>
<list-item>
<p>&#x2022; Address complex cases (e.g., sleeves inside garments, inside-out T-shirts)</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td rowspan="3" align="left">Dataset</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Simulation data dominate, while simulators lack realism and usability</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Develop more realistic, user-friendly cloth simulators (textile-air interactions, better control, diverse models)</p>
</list-item>
<list-item>
<p>&#x2022; Use VR or human-in-the-loop simulation to reduce the Sim2Real gap</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Various Sim2Real methods exist but with inconsistent transferability</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Combine multiple Sim2Real strategies for improved robustness</p>
</list-item>
<list-item>
<p>&#x2022; Balance simulation and real-world transfer (RGB-D excels in simulation; depth transfers better)</p>
</list-item>
<list-item>
<p>&#x2022; When fine-tuning on real-world data, consider supervision signal availability and diversity</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Real-world data collection is labor intensive and sensor limited</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Improve the convenience and richness of real-world data collection</p>
</list-item>
<list-item>
<p>&#x2022; Develop real-time human-intervention systems</p>
</list-item>
<list-item>
<p>&#x2022; Incorporate diverse sensing modalities</p>
</list-item>
<list-item>
<p>&#x2022; Leverage human demonstrations and online manipulation videos</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td rowspan="3" align="left">Manipulation platform</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Platforms include single, dual, and triple arm setups</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Choose arm configurations by strategy and usability; dual arms support complex tasks</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Grippers are mainly parallel types</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Multi-fingered hands could enable richer and more dexterous manipulation</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Sensors are mostly vision-only, calibration is often required</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Choose RGB, depth, or RGB-D based on task properties</p>
</list-item>
<list-item>
<p>&#x2022; Incorporate multi-modal sensing (joint positions, force, tactile)</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Primitive action</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Primitive actions continue to evolve</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Develop new primitive actions via improved tools, strategies, and learning-based policies</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Performance metrics</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Unfolding metrics are relatively standardized, while folding lacks unified evaluation criteria</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Employ multiple complementary evaluation metrics</p>
</list-item>
<list-item>
<p>&#x2022; Establish a consensus metric for folding tasks</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Failure modes</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Many failure modes (prediction errors, grasp failures, multi-layer grasping)</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Reduce prediction errors and Sim2Real gap</p>
</list-item>
<list-item>
<p>&#x2022; Improve platform or add sensors to prevent grasp failures</p>
</list-item>
<list-item>
<p>&#x2022; Consider richer grasp parameters (position, orientation, velocity)</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s4-1">
<label>4.1</label>
<title>Inferences drawn from the manipulation tasks</title>
<p>The unfolding and folding tasks are often studied separately. As discussed in <xref ref-type="sec" rid="s3-1">Section 3.1</xref>, most papers focus exclusively on either folding or unfolding, with an equal distribution between the two topics. For those interested in exploring both tasks simultaneously, it is typical to propose either two separate policies or a single policy with distinct training for each task. However, separate unfolding and folding models require duplicating learning processes, which can be inefficient in terms of both computation and data usage. Therefore, integrating separate unfolding and folding policies into a unified end2end learning policy is a promising research direction. Only a few research papers (<xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>; <xref ref-type="bibr" rid="B38">Lee et al., 2024</xref>) address this approach. A unified end2end learning policy is designed to manipulate cloth from a random initial state to a folded result, without focusing on the intermediate flattened state. This integrated policy requires training only once to handle the entire pipeline, eliminating the need for separate policies for unfolding and folding. This approach can reduce computational resources and deployment overhead, making the system more efficient. Additionally, the integrated policy may offer improved generalization capabilities.</p>
<p>When manipulating objects, using a common-sized towel is a regular choice, as it simplifies the problem. However, this policy may fail in some special circumstances where larger or everyday garments are involved. To address this, several studies have incorporated larger clothing or everyday garments into their research, effectively tackling related issues. These shifts in focus have led to further advancements. For example, the use of dynamic actions (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>) has facilitated the manipulation of larger cloths, while novel simulations (<xref ref-type="bibr" rid="B42">Lin et al., 2021</xref>) with diverse cloth models (<xref ref-type="bibr" rid="B4">Bertiche et al., 2020</xref>) have enabled handling a variety of garments. Despite the advancements made, more complex scenarios in cloth unfolding and folding remain. For instance, managing the sleeves or collar of a T-shirt, especially a long-sleeved one, can be challenging when the sleeve is tucked inside the garment. Additionally, when the entire T-shirt is inside-out, it further complicates the unfolding and folding tasks. These issues are common in daily life and need to be addressed in future research.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Issues and prospects for the dataset work</title>
<p>For DL-based methods, the dataset plays a crucial role. <xref ref-type="sec" rid="s3-2">Section 3.2</xref> highlights the primary methods of dataset collection from simulations and real-world data. While there are several advantages to using simulation, the simulators currently in use can only partially meet user needs. Therefore, a more realistic, convenient, and comprehensive clothing simulator is still needed. The ideal simulator should not only be able to load a wider variety of cloth models, simulate more realistic cloth textures, and provide a convenient API for users, but it should also offer aerodynamic interactions between air and cloth, which are essential for dynamic manipulation actions. Furthermore, the simulator could load the URDF of different robots to provide more detailed information about the interaction between the cloth and the robot.</p>
<p>Concerning the Sim2Real, the eligible papers have employed various solutions to mitigate this problem. The review finds that multiple Sim2Real technologies can be employed within a single research paper to enhance performance. However, in some cases, certain Sim2Real technologies may not be applicable due to limitations in the real-world setup. For example, <xref ref-type="bibr" rid="B7">Canberk et al. (2023)</xref> were unable to use the fine-tuning method due to the lack of available supervision signals in the real world. Despite this challenge, exploring this area remains a promising direction for future research.</p>
<p>In terms of real-world dataset collection, current cloth keypoint collection methods include color painting labeling and UV labeling. Color painting is suitable for cases with various types of keypoints and requires only depth information for training. In contrast, UV labeling is better suited for scenarios that use RGB or RGB-D training datasets. However, it involves a limited number of keypoint types because there are few types of transparent UV fluorescent paint. Therefore, a novel keypoint labeling method is still required. Regarding data collection on robot and cloth interactions, current methods (<xref ref-type="bibr" rid="B27">Hoque et al., 2022b</xref>; <xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>; <xref ref-type="bibr" rid="B37">Lee et al., 2021</xref>) lack real-time feedback during dataset collection. There is a need for more convenient data collection methods that incorporate additional real-world information. One potential solution is the use of real-time feedback control platforms, such as ALOHA (<xref ref-type="bibr" rid="B78">Zhao et al., 2023</xref>), TactileAloha (<xref ref-type="bibr" rid="B22">Gu et al., 2025</xref>) and Gello (<xref ref-type="bibr" rid="B72">Wu et al., 2023</xref>), where a master robot is controlled by a human operator, while a slave robot performs the same actions in real-time to manipulate the cloth, thus collecting a more precise real dataset. Another approach involves using motion-capture systems, allowing a human operator to control the robot and manipulate the cloth, providing an intuitive and interactive method for dataset collection. Additionally, employing diverse sensors, such as force and tactile sensors (<xref ref-type="bibr" rid="B36">Kutsuzawa and Hayashibe, 2025</xref>; <xref ref-type="bibr" rid="B22">Gu et al., 2025</xref>), can further enhance the gathering of real-world information. 
Lastly, detecting the movement of human manipulation in videos is also a promising method because it is highly efficient and can utilize the considerable amount of video content on human manipulation available on the Internet. However, we must also address the gap between human manipulation and robot manipulation.</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Implications of cloth manipulation platforms</title>
<p>Robotic platforms used for cloth manipulation vary widely in their mechanical capabilities and directly influence the design of learning algorithms. Single-arm systems (6&#x2013;7 DoF) support basic P&#x26;P or one-arm fling motions (<xref ref-type="bibr" rid="B12">Chen et al., 2022</xref>), but generally exhibit limited versatility compared to dual-arm robots, which offer 12&#x2013;14 DoF and enable coordinated bimanual strategies. Although triple-arm systems have been explored (<xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>), they remain rare and are typically motivated by specialized manipulation or safety requirements. Overall, dual-arm configurations remain the most practical and capable choice for complex cloth tasks, despite their increased control and training complexity.</p>
<p>Across the reviewed papers, UR and Franka robots account for the majority of real-world deployments due to their user-friendly APIs, reliable hardware, and workspace geometries well aligned with cloth manipulation. Parallel grippers remain the predominant end-effector type; while dexterous (multi-fingered) hands promise richer manipulation behaviors, their high-dimensional control greatly increases algorithmic complexity and has limited their adoption.</p>
<p>For perception, RGB-D sensors remain the most widely used in cloth manipulation. However, visual observations sometimes require preprocessing, e.g., segmentation or background removal (<xref ref-type="bibr" rid="B48">Mo et al., 2022</xref>; <xref ref-type="bibr" rid="B62">Tanaka et al., 2021</xref>), and different modalities (RGB, depth, RGB-D) can lead to noticeably different performance. Accurate calibration among robot, camera, and workspace frames is essential for Cartesian control (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>; <xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>; <xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>; <xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>).</p>
<p>Beyond vision, multimodal sensing offers an underexplored opportunity for increasing robustness. Force and tactile feedback can mitigate common issues such as multi-layer grasping (<xref ref-type="bibr" rid="B65">Tirumala et al., 2022</xref>) or corner localization (<xref ref-type="bibr" rid="B50">Proesmans et al., 2023</xref>). Future systems will likely benefit from integrating such modalities, enabling more reliable grasping, improved perception under occlusion, and safer execution during dynamic actions.</p>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>Insights into the evolution of primitive actions</title>
<p>Primitive actions form the fundamental building blocks of cloth manipulation strategies, and their evolution reflects increasing requirements for precision, efficiency, and robustness. Traditional P&#x26;P primitives have been extended with parameterized or closed-loop variants (<xref ref-type="bibr" rid="B26">Hoque et al., 2022a</xref>; <xref ref-type="bibr" rid="B5">Blanco-Mulero et al., 2023</xref>; <xref ref-type="bibr" rid="B25">Hietala et al., 2022</xref>) to improve accuracy and adaptability. Dynamic fling actions (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>) dramatically accelerate unfolding and enable manipulation of larger garments, while follow-up work (<xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>; <xref ref-type="bibr" rid="B12">Chen et al., 2022</xref>) further refines fling height, speed, and trajectory to improve reliability.</p>
<p>Additional primitives such as dragging or mopping (<xref ref-type="bibr" rid="B75">Xue et al., 2023</xref>; <xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>) expand the manipulation space by leveraging surface friction for local smoothing or global pose adjustment. Non-contact primitives such as air-blowing (<xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>) demonstrate how external forces can unfold large surfaces safely and efficiently.</p>
<p>These developments illustrate that primitive actions are becoming increasingly specialized, combining coarse global adjustments with fine-grained corrections. As richer sensing modalities emerge, new primitives are likely to follow. For example, tactile-guided sliding (<xref ref-type="bibr" rid="B60">Sunil et al., 2023</xref>) enables reliable corner acquisition by using contact information to guide motion, a key step for both unfolding and folding. Going forward, learning frameworks will need to support hierarchical, multi-primitive, or hybrid controllers to take full advantage of this growing action diversity.</p>
</sec>
<sec id="s4-5">
<label>4.5</label>
<title>Performance metrics and failure modes to be concerned</title>
<p>Regarding performance metrics, the cloth unfolding task commonly employs two fundamental metrics: unfolded coverage and the number of manipulation actions. However, each eligible paper may introduce additional metrics tailored to their specific objectives, highlighting the performance of their proposed policies.</p>
<p>In contrast, there is no universally adopted basic metric for folding tasks, as each paper establishes its own criteria. While the folding success rate is frequently used, its definition varies across studies (<xref ref-type="bibr" rid="B66">Tsurumine and Matsubara, 2022</xref>; <xref ref-type="bibr" rid="B14">Deng et al., 2023</xref>). This lack of standardized metrics complicates the quantitative comparison of different approaches, particularly given the cloth&#x2019;s deformability and the diverse range of experimental environments. To address these issues, researchers in this field should aim to report their baseline results and validation using a variety of metrics. This approach will facilitate comparisons with SOTA methods and enable a more comprehensive evaluation of their proposed approaches. On the other hand, a consensus metric for the folding task also needs to be created in the future.</p>
<p>Moreover, some papers provide information on the failure modes encountered. These can be divided into two categories. The first category relates to the manipulation algorithm, including issues such as inaccurate predictions and the gap between simulation and reality. To address these issues, researchers should focus on improving algorithmic strategies. The second category involves factors not directly related to the algorithm, such as failed grasping or handling multiple layers. Solutions for these issues include improving platform settings, including more action parameters, or incorporating additional sensors. For instance, applying a nonslip silicone pad can increase grip friction, and grasp parameters should consider not only the coordinates but also the orientation of the grasp (<xref ref-type="bibr" rid="B51">Qian et al., 2020</xref>). Orientation information can reduce cloth deformation during grasping and help prevent failed grasps. Additionally, utilizing tactile sensors (<xref ref-type="bibr" rid="B60">Sunil et al., 2023</xref>) to detect layers can help avoid multiple-layer grasping.</p>
</sec>
<sec id="s4-6">
<label>4.6</label>
<title>Applications and opportunities of learning and control paradigms</title>
<p>As summarized in <xref ref-type="sec" rid="s3-7">Section 3.7</xref>, the eligible papers can be reorganized into six learning and control paradigms that better reflect how existing methods perceive, represent, and act on cloth. Below, we discuss their advantages, disadvantages, and opportunities for cloth unfolding and folding, as outlined in <xref ref-type="table" rid="T6">Table 6</xref>.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Advantages, disadvantages, and opportunities for the six learning and control paradigms.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Paradigm</th>
<th align="left">Advantages</th>
<th align="left">Disadvantages and opportunities</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Perception-H</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Achieve accurate cloth perception using labeled data and strong vision backbones</p>
</list-item>
<list-item>
<p>&#x2022; Effective for structured folding tasks with explicit visual cues (e.g., corners, edges)</p>
</list-item>
<list-item>
<p>&#x2022; Modular and interpretable pipelines separating perception and control</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Require large-scale annotations; deformability and occlusion complicate labeling</p>
</list-item>
<list-item>
<p>&#x2022; Hand-crafted heuristics are brittle under topology changes, wrinkles and self-entanglement</p>
</list-item>
<list-item>
<p>&#x2022; Future work may replace fixed heuristics with learned controllers and extend to complex garments</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Goal-cond</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Formulate manipulation as goal-reaching, naturally supporting multi-step folding</p>
</list-item>
<list-item>
<p>&#x2022; Enable data-efficient learning from goal images or collected trajectories</p>
</list-item>
<list-item>
<p>&#x2022; Capture spatiotemporal structure via attention or flow-based architectures</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Depend on well-defined goal states; ambiguous goals degrade performance</p>
</list-item>
<list-item>
<p>&#x2022; Often assume relatively neat initial configurations, limiting robustness</p>
</list-item>
<list-item>
<p>&#x2022; Promising directions include language- or semantic-goal conditioning and integration with planning or LLM-generated sub-goals</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Predict.-model</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Learn explicit or latent cloth dynamics for look-ahead prediction and planning</p>
</list-item>
<list-item>
<p>&#x2022; Improve generalization via structured state representations (e.g., keypoints, correspondences)</p>
</list-item>
<list-item>
<p>&#x2022; Provide a principled interface between perception, control, and RL</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Training dynamics models is computationally expensive and sensitive to model bias</p>
</list-item>
<list-item>
<p>&#x2022; Accuracy depends on simulator fidelity and scenario coverage</p>
</list-item>
<list-item>
<p>&#x2022; Opportunities include large-scale self-supervised learning and multimodal state fusion</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Reward-driven</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Well suited for exploratory unfolding with highly variable initial states</p>
</list-item>
<list-item>
<p>&#x2022; Discover non-trivial strategies (e.g., dynamic fling) via trial and error</p>
</list-item>
<list-item>
<p>&#x2022; Avoid explicit labeling by relying on reward signals</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Require extensive interaction, which is costly in both simulation and real-world settings</p>
</list-item>
<list-item>
<p>&#x2022; Policies may suffer from Sim2Real gaps due to inaccurate simulators</p>
</list-item>
<list-item>
<p>&#x2022; Future work calls for better simulators, improved reward design, model-based/model-free RL, and offline or data-efficient RL</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">Demo.-driven</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Learn policies from expert demonstrations without online interaction</p>
</list-item>
<list-item>
<p>&#x2022; Efficient for structured folding and routine-like tasks</p>
</list-item>
<list-item>
<p>&#x2022; Support diverse demonstration sources, including teleoperation and human videos</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Performance is limited by demonstration coverage and quality</p>
</list-item>
<list-item>
<p>&#x2022; Generalization across cloth types and materials remains challenging</p>
</list-item>
<list-item>
<p>&#x2022; Promising directions include scalable data collection, online correction, and multimodal (video, language) demonstrations</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left">LLM-based</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Leverage semantic reasoning to select primitives and infer sub-goals</p>
</list-item>
<list-item>
<p>&#x2022; Multimodal LLMs enable visuomotor reasoning from visual inputs</p>
</list-item>
<list-item>
<p>&#x2022; Provide a unified interface for task specification, perception, and planning</p>
</list-item>
</list>
</td>
<td align="left">
<list list-type="simple">
<list-item>
<p>&#x2022; Current methods mainly operate at the high level and lack low-level control</p>
</list-item>
<list-item>
<p>&#x2022; Inference latency limits real-time deployment on physical robots</p>
</list-item>
<list-item>
<p>&#x2022; Future work includes distilling high-level plans into lightweight controllers for real-time execution, integrating multimodal perception and action-generation models with low latency</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s4-6-1">
<label>4.6.1</label>
<title>Perception-guided heuristic methods</title>
<p>Perception-guided heuristic methods are used for both unfolding and folding, particularly when some high-level geometric cues (e.g., corners, edges, contours) can be reliably extracted. By training detectors and segmentation networks on labeled cloth images (<xref ref-type="bibr" rid="B57">Seita et al., 2019</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>), these approaches achieve high-accuracy perception and can directly localize manipulation-relevant regions on flattened or not severely wrinkled cloth, making them particularly suitable for simple unfolding and folding tasks. However, their reliance on hand-crafted post-processing and motion heuristics limits scalability. For example, downstream motion generation often ignores cloth deformability and contact dynamics, and heuristics tuned for one garment type or configuration frequently fail on heavily wrinkled, self-entangled, or topologically complex cloth, requiring manual redesign. In addition, the need for pixel-level labels and keypoints makes data collection expensive and restricts the diversity of cloth categories and configurations that can be covered. Future work could move from fixed geometric heuristics toward learned downstream controllers that take detector or segmentation outputs as input and are trained jointly with, or conditioned on, the follow-up actions. At the perception level, richer geometric and multimodal features (e.g., depth, 3D shape cues, or topological descriptors) and weaker forms of supervision could be used to reduce labeling costs. Extending these perception modules to operate robustly on non-flat, self-entangled garments and across a broader range of cloth categories would help perception-guided methods remain effective beyond narrowly structured folding scenarios.</p>
</sec>
<sec id="s4-6-2">
<label>4.6.2</label>
<title>Goal-conditioned manipulation policies</title>
<p>Goal-conditioned manipulation policies frame cloth manipulation as a goal-reaching problem, mapping the current observation and a desired goal configuration to action sequences (<xref ref-type="bibr" rid="B69">Weng et al., 2022</xref>; <xref ref-type="bibr" rid="B48">Mo et al., 2022</xref>). This paradigm naturally aligns with folding tasks, whose target states are structured and can be expressed through goal images, keyframes, or demonstration trajectories. The primary challenge lies in goal specification and goal coverage. Existing approaches typically assume that a suitable goal image or trajectory is available and that the initial cloth state is not too far from this goal manifold. When the cloth is highly wrinkled, entangled, or heavily occluded, the system may struggle to infer a feasible goal-conditioned plan, and alternative paradigms (e.g., dynamics-based or RL-based methods) become more reliable. Future opportunities include allowing semantic goals (e.g., language descriptions, LLM-generated sub-goals) instead of explicit goal images, integrating predictive models to support longer-horizon reasoning, and learning goal manifolds that generalize across diverse garment categories and configurations. Such extensions would broaden the applicability of goal-conditioned policies beyond structured folding settings.</p>
</sec>
<sec id="s4-6-3">
<label>4.6.3</label>
<title>Predictive and model-based state representation methods</title>
<p>Predictive and model-based approaches focus on learning cloth dynamics or latent state representations that support downstream planning and control (<xref ref-type="bibr" rid="B26">Hoque et al., 2022a</xref>; <xref ref-type="bibr" rid="B45">Ma et al., 2022</xref>; <xref ref-type="bibr" rid="B20">Ganapathi et al., 2021</xref>; <xref ref-type="bibr" rid="B43">Lin et al., 2022</xref>). A key advantage of this paradigm is reusability. Once an accurate dynamics or representation model is learned, it can support multiple tasks, unfolding, flattening, or different folding patterns, simply by changing the planner or objective, without retraining the entire policy. This separation of representation and control also improves data efficiency, since costly robot interaction is used to fit the model once, and later tasks can rely on planning or offline optimization over the learned dynamics. However, model-based methods face several challenges. Long-horizon cloth dynamics are difficult to learn: small prediction biases accumulate quickly and can mislead planning. Video prediction and GNN-based models require large and diverse datasets, yet often struggle to represent task-critical properties such as layer ordering, self-occlusion, or contact conditions. Self-supervised objectives focused only on reconstruction or local geometry may also fail to encode physically meaningful structures. Promising directions include learning representations that better capture cloth topology and layer structure; integrating learned dynamics with model-based RL or planning under uncertainty; and incorporating multimodal cues, such as depth, force, or tactile feedback, to resolve ambiguities in partially observed states.</p>
</sec>
<sec id="s4-6-4">
<label>4.6.4</label>
<title>Reward-driven reinforcement learning over primitive actions</title>
<p>Reward-driven RL over primitive actions is currently the dominant paradigm for cloth unfolding (<xref ref-type="bibr" rid="B71">Wu Y. et al., 2020</xref>; <xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>; <xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>; <xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>; <xref ref-type="bibr" rid="B74">Xu et al., 2022</xref>; <xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>). These methods optimize value or policy functions over discrete or continuous primitives using task-specific rewards. This paradigm is particularly well suited for cloth unfolding, because initial states are highly variable, heavily occluded, and partially observable. Such uncertainty naturally requires exploration and long-horizon decision-making, and RL agents can discover non-obvious sequences of pulls, flings, or drags that increase coverage even without an explicit cloth model. In practice, prior work has explored different ways to shape this learning process, for example by learning placement points from random picks (<xref ref-type="bibr" rid="B71">Wu Y. et al., 2020</xref>), designing multi-primitive policies that strategically select among several manipulation actions (<xref ref-type="bibr" rid="B7">Canberk et al., 2023</xref>), and tailoring reward functions to emphasize coverage (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>) or directional constraints in unfolding (<xref ref-type="bibr" rid="B21">Gu et al., 2024</xref>). Despite their strengths, RL methods face several challenges. Training in simulation is computationally expensive and strongly dependent on cloth and sensor fidelity, while large-scale real-world interaction is difficult to collect. 
Self-supervised interaction frameworks (<xref ref-type="bibr" rid="B23">Ha and Song, 2022</xref>; <xref ref-type="bibr" rid="B1">Avigal et al., 2022</xref>) reduce labeling effort, but significant Sim2Real gaps remain, and purely real-world training tends to be limited in scale and scenario diversity (<xref ref-type="bibr" rid="B37">Lee et al., 2021</xref>). Promising research directions include developing more realistic yet efficient cloth simulators, improving Sim2Real transfer methods, exploring more sample-efficient learning strategies, designing rewards that better capture cloth-specific manipulation objectives, and leveraging offline or data-driven RL methods capable of reusing large collections of prior trajectories.</p>
</sec>
<sec id="s4-6-5">
<label>4.6.5</label>
<title>Demonstration-driven skill transfer methods</title>
<p>Demonstration-driven methods learn manipulation skills directly from expert demonstrations using behavioral cloning or related imitation-learning techniques (<xref ref-type="bibr" rid="B58">Seita et al., 2020</xref>; <xref ref-type="bibr" rid="B17">Fu T. et al., 2023</xref>; <xref ref-type="bibr" rid="B38">Lee et al., 2024</xref>; <xref ref-type="bibr" rid="B66">Tsurumine and Matsubara, 2022</xref>). In practice, such methods have been applied to both cloth unfolding and folding. For unfolding, data-driven policies can clone expert sequences of pulls, flings, or shakes that gradually increase coverage. For folding and other structured subtasks, where expert strategies (e.g., aligning corners, placing creases, or executing a fixed folding sequence) are relatively consistent, demonstration-driven methods are particularly effective. Recent works further show that demonstrations collected in simulation, from real robots, or even from human videos can be transferred to robotic policies, sometimes with only a small number of annotated trajectories (<xref ref-type="bibr" rid="B58">Seita et al., 2020</xref>; <xref ref-type="bibr" rid="B37">Lee et al., 2021</xref>; <xref ref-type="bibr" rid="B38">2024</xref>). However, these approaches are fundamentally limited by demonstration coverage and distribution shift. Policies often fail when encountering states that are not represented in the demonstrations, or when manipulating garments with different sizes, materials, and shapes. Thus, both the quality and the diversity of demonstrations are critical for robust performance across unfolding and folding scenarios. 
Future opportunities include improving data-collection platforms (<xref ref-type="bibr" rid="B78">Zhao et al., 2023</xref>; <xref ref-type="bibr" rid="B72">Wu et al., 2023</xref>) to make it easier to gather large, diverse, and high-quality demonstrations; combining offline imitation with selective online correction (e.g., DAgger-style relabeling) to mitigate distribution shift; and systematically evaluating how different types of demonstrations and input modalities (simulation rollouts, real-world robot teleoperation, and human-hand videos) influence generalization to new garments, initial configurations, and task variations.</p>
</sec>
<sec id="s4-6-6">
<label>4.6.6</label>
<title>LLM-, VLA-, and action-generation-based models</title>
<p>LLM-based planning for cloth manipulation is still in its early stage. <xref ref-type="bibr" rid="B18">Fu et al. (2024)</xref>; <xref ref-type="bibr" rid="B52">Raval et al. (2024)</xref>; <xref ref-type="bibr" rid="B15">Deng et al. (2025)</xref> illustrate a transition from text-only LLM prompting to multimodal inputs that include cloth observations for guiding high-level cloth manipulation planning. Recent multimodal LLMs such as Gemini (<xref ref-type="bibr" rid="B63">Team et al., 2025</xref>) further suggest the feasibility of mapping visual observations directly to manipulation trajectories. Related progress in robotics, e.g., large-scale vision-language-action (VLA) models such as RT-2 (<xref ref-type="bibr" rid="B82">Zitkovich et al., 2023</xref>) and OpenVLA (<xref ref-type="bibr" rid="B33">Kim et al., 2024</xref>), as well as action-generation architectures such as Action Chunking Transformer (ACT) (<xref ref-type="bibr" rid="B78">Zhao et al., 2023</xref>) and <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3c0;</mml:mi>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B80">Intelligence et al., 2025</xref>), which directly map images or language instructions to robot trajectories. Despite this promise, several challenges remain for cloth manipulation. First, collecting sufficiently diverse garment data at the scale required by LLMs and VLAs is difficult, and existing web-scale datasets contain little fine-grained deformable-object interaction. Second, inference latency and model size limit real-time deployment. Even models trained on specialized garment datasets are typically evaluated in quasi-static settings, where cloth deformation is slow and replanning frequency is low. Current action generators remain slower than conventional policy inference and often overlook cloth-specific dynamics such as self-occlusion, multilayer contact, and fast, large deformations, properties essential for dynamic interactions such as flinging, catching, or in-air regrasping. Promising directions therefore include finetuning or prompting LLMs and VLA models on cloth-manipulation datasets, distilling their plans into lightweight policies, and using them as high-level semantic planners that propose sub-goals or primitive sequences, while domain-specific controllers from the other paradigms (e.g., goal-conditioned policies, reward-driven RL, or demonstration-driven policies) execute those plans at a lower level and higher frequency. For action-generation architectures, a complementary avenue is to amortize expensive inference into compact latent plans or skill embeddings and let smaller, task-specific controllers decode short-horizon action chunks at control rate. This can both improve online speed and make such models more compatible with dynamic cloth primitives (e.g., fling or shake actions), enabling future systems to more fully exploit the flexible, highly deformable nature of garments rather than being limited to quasi-static operation.</p>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This systematic review examined 41 deep learning-based robotic cloth unfolding and folding studies published between 2019 and 2024. From a systems perspective, we analyzed how task design, datasets, manipulation platforms, primitive actions, performance metrics, and failure modes jointly shape current solutions. From an algorithmic perspective, we reorganized the literature into six learning and control paradigms that more clearly reflect how cloth state is perceived, represented, and acted upon.</p>
<p>Across these works, several overarching insights emerge. First, most methods still treat unfolding and folding as isolated tasks with task-specific pipelines, despite their natural interdependence. Developing unified end-to-end policies that operate from highly crumpled states to folded configurations remains an underexplored but impactful direction. Second, data remains a core bottleneck: addressing it calls for more realistic simulators and real-world pipelines that use modern teleoperation, motion capture, or multimodal sensing such as force and tactile feedback (<xref ref-type="bibr" rid="B22">Gu et al., 2025</xref>; <xref ref-type="bibr" rid="B36">Kutsuzawa and Hayashibe, 2025</xref>). Third, the review also highlights that the emergence of novel primitive actions contributes to the development of the field. Furthermore, performance metrics vary widely, and establishing a standardized metric is needed for future work. Depending on the failure mode, appropriate recovery solutions should be implemented.</p>
<p>Methodologically, each paradigm contributes differently to the field. Perception-guided heuristics offer high-accuracy perception and simple motion generation for simple unfolding and folding tasks, but rely on hand-crafted rules and generalize poorly when cloth exhibits complex or severe wrinkles. Future work includes replacing heuristics with learned controllers and extending perception to more diverse cloth. Goal-conditioned policies effectively drive folding when suitable goal images or trajectories are provided but struggle when initial states lie far from the goal manifold. Making goals more semantic and integrating predictive reasoning may improve robustness. Predictive and model-based representation methods provide reusable structure by learning cloth dynamics or latent states for downstream planning, yet remain limited by data demands and difficulty modeling long-horizon deformation and layer interactions. Advances in multimodal, topology-aware dynamics models are a key direction. Reward-driven reinforcement learning excels in highly variable, occluded unfolding scenarios requiring exploration and multi-step credit assignment, but suffers from high sample complexity and Sim2Real gaps. Progress will depend on better simulators, improved reward design, and more effective Sim2Real strategies. Demonstration-driven skill transfer efficiently acquires folding and structured subtasks but relies heavily on demonstration coverage and diversity. Scalable data-collection pipelines and imitation schemes with selective online correction will be essential for broader generalization. Emerging large language model-based and action-generation models contribute high-level semantic planning, goal decomposition, and trajectory synthesis. Future efforts will focus on cloth-specific finetuning, improving inference time, and using them as high-level planners atop domain-specific low-level policies. 
Overall, while significant progress has been made in cloth unfolding and folding, ongoing research and innovation remain crucial for addressing the remaining challenges for future Physical AI.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>NG: Methodology, Formal Analysis, Data curation, Visualization, Writing &#x2013; original draft, Investigation, Conceptualization, Writing &#x2013; review and editing. MH: Resources, Writing &#x2013; review and editing, Project administration, Supervision. KK: Writing &#x2013; review and editing. HY: Writing &#x2013; review and editing, Supervision.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1019152/overview">Shunsuke Shigaki</ext-link>, National Institute of Informatics, Japan</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3086380/overview">Zhongpan Zhu</ext-link>, University of Shanghai for Science and Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3301632/overview">Prem Gamolped</ext-link>, Kyushu Institute of Technology, Japan</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Avigal</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Berscheid</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Asfour</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kr&#xf6;ger</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Goldberg</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Speedfolding: learning efficient bimanual folding of garments</article-title>,&#x201d; in <source>2022 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Babaeizadeh</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Erhan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Campbell</surname>
<given-names>R. H.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Stochastic variational video prediction</article-title>,&#x201d; in <conf-name>6th International Conference on Learning Representations</conf-name>, <conf-date>April 30 &#x2013; May 3, 2018</conf-date> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>OpenReview.net</publisher-name>).</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bertasius</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Torresani</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Is space-time attention all you need for video understanding?</article-title> <source>ICML</source> <volume>2</volume> (<issue>4</issue>).</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Bertiche</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Madadi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Escalera</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Cloth3d: clothed 3d humans</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>344</fpage>&#x2013;<lpage>359</lpage>.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Blanco-Mulero</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Alcan</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Abu-Dakka</surname>
<given-names>F. J.</given-names>
</name>
<name>
<surname>Kyrki</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Qdp: learning to sequentially optimise quasi-static and dynamic manipulation primitives for robotic cloth manipulation</article-title>,&#x201d; in <source>2023 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>984</fpage>&#x2013;<lpage>991</lpage>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Canberk</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ha</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Burchfiel</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cousineau</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Cloth funnels: canonicalized-alignment for multi purpose garment manipulation</article-title>. <source>arXiv Preprint arXiv:2210.09347</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2210.09347</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Canberk</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ha</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Burchfiel</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cousineau</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Cloth funnels: canonicalized-alignment for multi-purpose garment manipulation</article-title>,&#x201d; in <source>2023 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>5872</fpage>&#x2013;<lpage>5879</lpage>.</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Learning dense visual object descriptors to fold two-dimensional deformable fabrics</article-title>,&#x201d; in <source>2023 IEEE 13th international conference on CYBER technology in automation, control, and intelligent systems (CYBER)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1176</fpage>&#x2013;<lpage>1181</lpage>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Rojas</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Trakdis: a transformer-based knowledge distillation approach for visual reinforcement learning with application to cloth manipulation</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>9</volume>, <fpage>2455</fpage>&#x2013;<lpage>2462</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2024.3358750</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L.-C.</given-names>
</name>
<name>
<surname>Papandreou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Rethinking atrous convolution for semantic image segmentation</article-title>. <source>arXiv Preprint arXiv:1706.05587</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1706.05587</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L.-C.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Papandreou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Encoder-decoder with atrous separable convolution for semantic image segmentation</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV)</source>, <fpage>801</fpage>&#x2013;<lpage>818</lpage>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L. Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Novoseller</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ichnowski</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Laskey</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Efficiently learning single-arm fling motions to smooth garments</article-title>,&#x201d; in <source>The international symposium of robotics research</source> (<publisher-name>Springer</publisher-name>), <fpage>36</fpage>&#x2013;<lpage>51</lpage>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Collins</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Leitner</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Quantifying the reality gap in robotic manipulation tasks</article-title>,&#x201d; in <source>2019 international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>6706</fpage>&#x2013;<lpage>6712</lpage>.</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Learning visual-based deformable object rearrangement with local graph neural networks</article-title>. <source>Complex and Intelligent Syst.</source> <volume>9</volume>, <fpage>5923</fpage>&#x2013;<lpage>5936</lpage>. <pub-id pub-id-type="doi">10.1007/s40747-023-01048-w</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Clasp: general-purpose clothes manipulation with semantic keypoints</article-title>. <source>arXiv Preprint arXiv:2507</source>, <fpage>19983</fpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2507.19983</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2023a</year>). &#x201c;<article-title>Demonstrating RFUniverse: a multiphysics simulation platform for embodied AI</article-title>,&#x201d; in <source>Proceedings of robotics: science and systems (Daegu, Republic of Korea)</source>. <pub-id pub-id-type="doi">10.15607/RSS.2023.XIX.087</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>Human-robot deformation manipulation skill transfer: sequential fabric unfolding method for robots</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>8</volume>, <fpage>8454</fpage>&#x2013;<lpage>8461</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2023.3329768</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Flingflow: llm-driven dynamic strategies for efficient cloth flattening</article-title>. <source>IEEE Robotics Automation Lett</source>. <pub-id pub-id-type="doi">10.1109/LRA.2024.3440770</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Galassi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Perez</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Palli</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Renders</surname>
<given-names>J.-M.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Attention-based cloth manipulation from model-free topological representation</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>18207</fpage>&#x2013;<lpage>18213</lpage>.</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ganapathi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sundaresan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Thananjeyan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Balakrishna</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Grannen</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Learning dense visual correspondences in simulation to smooth and fold real fabrics</article-title>,&#x201d; in <source>2021 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>11515</fpage>&#x2013;<lpage>11522</lpage>.</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Learning to unfold garment effectively into oriented direction</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>9</volume>, <fpage>1051</fpage>&#x2013;<lpage>1058</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2023.3341763</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Kosuge</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hayashibe</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Tactilealoha: learning bimanual manipulation with tactile sensing</article-title>. <source>IEEE Robotics Automation Lett</source>. <pub-id pub-id-type="doi">10.1109/LRA.2025.3585396</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ha</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Flingbot: the unreasonable effectiveness of dynamic manipulation for cloth unfolding</article-title>,&#x201d; in <source>Proceedings of the 5th conference on robot learning</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Faust</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
</person-group>, <fpage>24</fpage>&#x2013;<lpage>33</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>M. Q.-H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Fabricfolding: learning efficient fabric folding without expert demonstrations</article-title>. <source>Robotica</source>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1017/S0263574724000250</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Hietala</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Blanco-Mulero</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Alcan</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kyrki</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Learning visual feedback control for dynamic cloth folding</article-title>,&#x201d; in <source>2022 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1455</fpage>&#x2013;<lpage>1462</lpage>.</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hoque</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Balakrishna</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ganapathi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tanwani</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Jamali</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2022a</year>). <article-title>Visuospatial foresight for physical sequential fabric manipulation</article-title>. <source>Aut. Robots</source> <volume>46</volume>, <fpage>175</fpage>&#x2013;<lpage>199</lpage>. <pub-id pub-id-type="doi">10.1007/s10514-021-10001-0</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hoque</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shivakumar</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Aeron</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Deza</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Ganapathi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2022b</year>). &#x201c;<article-title>Learning to fold real garments with one arm: a case study in cloud-based robotics research</article-title>,&#x201d; in <conf-name>2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>251</fpage>&#x2013;<lpage>257</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hou</surname>
<given-names>Y. C.</given-names>
</name>
<name>
<surname>Sahari</surname>
<given-names>K. S. M.</given-names>
</name>
<name>
<surname>How</surname>
<given-names>D. N. T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A review on modeling of flexible deformable object for dexterous robotic manipulation</article-title>. <source>Int. J. Adv. Robotic Syst.</source> <volume>16</volume>, <fpage>1729881419848894</fpage>. <pub-id pub-id-type="doi">10.1177/1729881419848894</pub-id>
</mixed-citation>
</ref>
<ref id="B80">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<collab>Physical Intelligence</collab>
<name>
<surname>Black</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Brown</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Darpinian</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dhabalia</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Driess</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>&#x03C0;0.5: a vision-language-action model with open-world generalization</article-title>. <source>CoRR</source> <volume>abs/2504.16054</volume>. <pub-id pub-id-type="doi">10.48550/ARXIV.2504.16054</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Jalali</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wohlin</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Systematic literature studies: database searches vs. backward snowballing</article-title>,&#x201d; in <source>Proceedings of the ACM-IEEE international symposium on empirical software engineering and measurement</source>, <fpage>29</fpage>&#x2013;<lpage>38</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jim&#xe9;nez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Torras</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Perception of cloth in assistive robotic manipulation tasks</article-title>. <source>Nat. Comput.</source> <volume>19</volume>, <fpage>409</fpage>&#x2013;<lpage>431</lpage>. <pub-id pub-id-type="doi">10.1007/s11047-020-09784-5</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kadi</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Terzi&#x107;</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Planet-clothpick: effective fabric flattening based on latent dynamic planning</article-title>,&#x201d; in <source>2024 IEEE/SICE international symposium on system integration (SII)</source> (<publisher-name>IEEE</publisher-name>), <fpage>972</fpage>&#x2013;<lpage>979</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kase</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Utsumi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Domae</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ogata</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Use of action label in deep predictive learning for robot manipulation</article-title>,&#x201d; in <source>2022 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>13459</fpage>&#x2013;<lpage>13465</lpage>.</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Pertsch</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Karamcheti</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Balakrishna</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nair</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Openvla: an open-source vision-language-action model</article-title>. <source>arXiv Preprint arXiv:2406.09246</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2406.09246</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ku</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>Y.-L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Automated sewing system enabled by machine vision for smart garment manufacturing</article-title>. <source>IEEE Robotics Automation Lett</source>. <pub-id pub-id-type="doi">10.1109/LRA.2023.3300284</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kulkarni</surname>
<given-names>T. D.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ionescu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Borgeaud</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Reynolds</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Unsupervised learning of object keypoints for perception and control</article-title>. <source>Adv. Neural Information Processing Systems</source> <volume>32</volume>.</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kutsuzawa</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hayashibe</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Simultaneous estimation of contact position and tool shape with high-dimensional parameters using force measurements and particle filtering</article-title>. <source>Int. J. Robotics Res.</source> <volume>0</volume>, <fpage>0</fpage>. <pub-id pub-id-type="doi">10.1177/02783649251379515</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ward</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Dasagi</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Cosgun</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Leitner</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Corke</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Learning arbitrary-goal fabric folding with one hour of real robot experience</article-title>,&#x201d; in <source>
<italic>Conference on robot learning</italic> (PMLR)</source>, <fpage>2317</fpage>&#x2013;<lpage>2327</lpage>.</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Abou-Chakra</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Corke</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Learning fabric manipulation in the real world with human videos</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>3124</fpage>&#x2013;<lpage>3130</lpage>.</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yue</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Grinspun</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Allen</surname>
<given-names>P. K.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Multi-sensor surface analysis for robotic ironing</article-title>,&#x201d; in <source>2016 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>5670</fpage>&#x2013;<lpage>5676</lpage>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tedrake</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J. B.</given-names>
</name>
<name>
<surname>Torralba</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Learning particle dynamics for manipulating rigid bodies, deformable objects, and fluids</article-title>,&#x201d; in <conf-name>7th International Conference on Learning Representations</conf-name>, <conf-date>May 6&#x2013;9, 2019</conf-date> (<publisher-loc>New Orleans, LA</publisher-loc>: <publisher-name>OpenReview.net</publisher-name>).</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liberati</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Altman</surname>
<given-names>D. G.</given-names>
</name>
<name>
<surname>Tetzlaff</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mulrow</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>G&#xf8;tzsche</surname>
<given-names>P. C.</given-names>
</name>
<name>
<surname>Ioannidis</surname>
<given-names>J. P.</given-names>
</name>
<etal/>
</person-group> (<year>2009</year>). <article-title>The prisma statement for reporting systematic reviews and meta-analyses of studies that evaluate health care interventions: explanation and elaboration</article-title>. <source>Ann. Internal Medicine</source> <volume>151</volume>, <fpage>W-65</fpage>&#x2013;<lpage>W-94</lpage>. <pub-id pub-id-type="doi">10.7326/0003-4819-151-4-200908180-00136</pub-id>
<pub-id pub-id-type="pmid">19622512</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Olkin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Held</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Softgym: benchmarking deep reinforcement learning for deformable object manipulation</article-title>,&#x201d; in <source>
<italic>Conference on robot learning</italic> (PMLR)</source>, <fpage>432</fpage>&#x2013;<lpage>448</lpage>.</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Held</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Learning visible connectivity dynamics for cloth smoothing</article-title>,&#x201d; in <conf-name>Conference on Robot Learning, Proceedings of Machine Learning Research</conf-name>, <conf-loc>London, United Kingdom</conf-loc>, <conf-date>November 8&#x2013;11, 2021</conf-date>. Editors <person-group person-group-type="editor">
<name>
<surname>Faust</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
</person-group> (<publisher-name>PMLR</publisher-name>, <volume>164</volume>), <fpage>256</fpage>&#x2013;<lpage>266</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v164/lin22a.html">https://proceedings.mlr.press/v164/lin22a.html</ext-link>
</comment>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Longhini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Welle</surname>
<given-names>M. C.</given-names>
</name>
<name>
<surname>Erickson</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Kragic</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Adafold: adapting folding trajectories of cloths <italic>via</italic> feedback-loop manipulation</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>9</volume>, <fpage>9183</fpage>&#x2013;<lpage>9190</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2024.3436329</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>W. S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Learning latent graph dynamics for visual manipulation of deformable objects</article-title>,&#x201d; in <source>
<italic>2022 international conference on robotics and automation (ICRA)</italic> (IEEE)</source>, <fpage>8266</fpage>&#x2013;<lpage>8273</lpage>.</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Maitin-Shepard</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cusumano-Towner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Cloth grasp point detection based on multiple-view geometric cues with application to robotic towel folding</article-title>,&#x201d; in <source>
<italic>2010 IEEE international conference on robotics and automation</italic> (IEEE)</source>, <fpage>2308</fpage>&#x2013;<lpage>2315</lpage>.</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Matas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>James</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Davison</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Sim-to-real reinforcement learning for deformable object manipulation</article-title>,&#x201d; in <source>
<italic>Conference on robot learning</italic> (PMLR)</source>, <fpage>734</fpage>&#x2013;<lpage>743</lpage>.</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mo</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Foldsformer: learning sequential multi-step cloth manipulation with space-time attention</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>8</volume>, <fpage>760</fpage>&#x2013;<lpage>767</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2022.3229573</pub-id>
</mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nocentini</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bashir</surname>
<given-names>Z. M.</given-names>
</name>
<name>
<surname>Cavallo</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Learning-based control approaches for service robots on cloth manipulation and dressing assistance: a comprehensive review</article-title>. <source>J. NeuroEngineering Rehabilitation</source> <volume>19</volume>, <fpage>117</fpage>. <pub-id pub-id-type="doi">10.1186/s12984-022-01078-4</pub-id>
<pub-id pub-id-type="pmid">36329473</pub-id>
</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Proesmans</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Verleysen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wyffels</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Unfoldir: tactile robotic unfolding of cloth</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>8</volume>, <fpage>4426</fpage>&#x2013;<lpage>4432</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2023.3284382</pub-id>
</mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Qian</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Okorn</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Held</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Cloth region segmentation for robust grasp selection</article-title>,&#x201d; in <source>2020 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>9553</fpage>&#x2013;<lpage>9560</lpage>.</mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Raval</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Nikolaidis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Gpt-fabric: smoothing and folding fabric by leveraging pre-trained foundation models</article-title>,&#x201d; in <source>The international symposium of robotics research (ISRR)</source>.</mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: unified, real-time object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>779</fpage>&#x2013;<lpage>788</lpage>.</mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <source>Medical image computing and computer-assisted intervention&#x2013;MICCAI 2015: 18th international conference, Munich, Germany, October 5-9, 2015, proceedings, part III 18</source> (<publisher-name>Springer</publisher-name>), <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gordon</surname>
<given-names>G. J.</given-names>
</name>
<name>
<surname>Bagnell</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>A reduction of imitation learning and structured prediction to no-regret online learning</article-title>,&#x201d; in <conf-name>Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics</conf-name>. Editors <person-group person-group-type="editor">
<name>
<surname>Gordon</surname>
<given-names>G. J.</given-names>
</name>
<name>
<surname>Dunson</surname>
<given-names>D. B.</given-names>
</name>
<name>
<surname>Dud&#x00ED;k</surname>
<given-names>M.</given-names>
</name>
</person-group> (<publisher-name>JMLR Workshop and Conference Proceedings</publisher-name>) <volume>15</volume>, <fpage>627</fpage>&#x2013;<lpage>635</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://proceedings.mlr.press/v15/ross11a/ross11a.pdf">http://proceedings.mlr.press/v15/ross11a/ross11a.pdf</ext-link>
</comment>.</mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Salhotra</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>I.-C. A.</given-names>
</name>
<name>
<surname>Dominguez-Kuhne</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sukhatme</surname>
<given-names>G. S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Learning deformable object manipulation from expert demonstrations</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>7</volume>, <fpage>8775</fpage>&#x2013;<lpage>8782</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2022.3187843</pub-id>
</mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jamali</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Laskey</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tanwani</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Berenstein</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Baskaran</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Deep transfer learning of pick points on fabric for robot bed-making</article-title>,&#x201d; in <source>The international symposium of robotics research</source> (<publisher-name>Springer</publisher-name>), <fpage>275</fpage>&#x2013;<lpage>290</lpage>.</mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ganapathi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hoque</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hwang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Tanwani</surname>
<given-names>A. K.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Deep imitation learning of sequential fabric smoothing from an algorithmic supervisor</article-title>,&#x201d; in <source>2020 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>9651</fpage>&#x2013;<lpage>9658</lpage>.</mixed-citation>
</ref>
<ref id="B59">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shehawy</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Pareyson</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Caruso</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>De Bernardi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zanchettin</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Rocco</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Flattening and folding towels with a single-arm robot based on reinforcement learning</article-title>. <source>Robotics Aut. Syst.</source> <volume>169</volume>, <fpage>104506</fpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2023.104506</pub-id>
</mixed-citation>
</ref>
<ref id="B60">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Sunil</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>She</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Adelson</surname>
<given-names>E. H.</given-names>
</name>
<name>
<surname>Garcia</surname>
<given-names>A. R.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Visuotactile affordances for cloth manipulation with local control</article-title>,&#x201d; in <conf-name>Conference on Robot Learning, Proceedings of Machine Learning Research</conf-name>, <conf-loc>Auckland, New Zealand</conf-loc>, <conf-date>December 14&#x2013;18, 2022</conf-date> (<publisher-name>PMLR</publisher-name>), <volume>205</volume>, <fpage>1596</fpage>&#x2013;<lpage>1606</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v205/sunil23a.html">https://proceedings.mlr.press/v205/sunil23a.html</ext-link>
</comment>.</mixed-citation>
</ref>
<ref id="B61">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tampuu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Matiisen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Semikin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fishman</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Muhammad</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A survey of end-to-end driving: architectures and training methods</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>33</volume>, <fpage>1364</fpage>&#x2013;<lpage>1384</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2020.3043505</pub-id>
<pub-id pub-id-type="pmid">33373304</pub-id>
</mixed-citation>
</ref>
<ref id="B62">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tanaka</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Arnold</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yamazaki</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Disruption-resistant deformable object manipulation on basis of online shape estimation and prediction-driven trajectory correction</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>6</volume>, <fpage>3809</fpage>&#x2013;<lpage>3816</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2021.3060679</pub-id>
</mixed-citation>
</ref>
<ref id="B63">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<collab>Gemini Robotics Team</collab>
<name>
<surname>Abeyruwan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ainslie</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Alayrac</surname>
<given-names>J.-B.</given-names>
</name>
<name>
<surname>Arenas</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Armstrong</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Gemini robotics: bringing ai into the physical world</article-title>. <source>arXiv Preprint arXiv:2503.20020</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2503.20020</pub-id>
</mixed-citation>
</ref>
<ref id="B64">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Thananjeyan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kerr</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gonzalez</surname>
<given-names>J. E.</given-names>
</name>
<name>
<surname>Goldberg</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>All you need is LUV: unsupervised collection of labeled images using uv-fluorescent markings</article-title>,&#x201d; in <source>
<italic>IEEE/RSJ international conference on intelligent robots and systems, IROS 2022, Kyoto, Japan, October 23-27, 2022</italic> (IEEE)</source>, <fpage>3241</fpage>&#x2013;<lpage>3248</lpage>. <pub-id pub-id-type="doi">10.1109/IROS47612.2022.9981768</pub-id>
</mixed-citation>
</ref>
<ref id="B65">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tirumala</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Seita</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kroemer</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Temel</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Held</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Learning to singulate layers of cloth using tactile feedback</article-title>,&#x201d; in <conf-name>2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>7773</fpage>&#x2013;<lpage>7780</lpage>.</mixed-citation>
</ref>
<ref id="B66">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tsurumine</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Matsubara</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Goal-aware generative adversarial imitation learning from imperfect demonstration for robotic cloth manipulation</article-title>. <source>Robotics Aut. Syst.</source> <volume>158</volume>, <fpage>104264</fpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2022.104264</pub-id>
</mixed-citation>
</ref>
<ref id="B67">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tsurumine</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Uchibe</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Matsubara</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep reinforcement learning with smooth policy update: application to robotic cloth manipulation</article-title>. <source>Robotics Aut. Syst.</source> <volume>112</volume>, <fpage>72</fpage>&#x2013;<lpage>83</lpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2018.11.004</pub-id>
</mixed-citation>
</ref>
<ref id="B68">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.-H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Learning-based fabric folding and box wrapping</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>7</volume>, <fpage>5703</fpage>&#x2013;<lpage>5710</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2022.3158434</pub-id>
</mixed-citation>
</ref>
<ref id="B69">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Weng</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bajracharya</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Held</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Fabricflownet: bimanual cloth manipulation with a flow-based policy</article-title>,&#x201d; in <source>Conference on robot learning (PMLR)</source>, <fpage>192</fpage>&#x2013;<lpage>202</lpage>.</mixed-citation>
</ref>
<ref id="B70">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rusinkiewicz</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Spatial action maps for mobile manipulation</article-title>,&#x201d; in <source>Robotics: science and systems XVI, virtual event/corvalis, Oregon, USA</source>. <pub-id pub-id-type="doi">10.15607/RSS.2020.XVI.035</pub-id>
</mixed-citation>
</ref>
<ref id="B71">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kurutach</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Pinto</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Learning to manipulate deformable objects without demonstrations</article-title>,&#x201d; in <source>Robotics: science and systems XVI, virtual event/corvalis, Oregon, USA</source>. <pub-id pub-id-type="doi">10.15607/RSS.2020.XVI.065</pub-id>
</mixed-citation>
</ref>
<ref id="B72">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Shentu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>GELLO: a general, low-cost, and intuitive teleoperation framework for robot manipulators</article-title>,&#x201d; in <source>Towards generalist robots: learning paradigms for scalable skill acquisition @ CoRL2023</source>.</mixed-citation>
</ref>
<ref id="B73">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Unigarmentmanip: a unified framework for category-level garment manipulation <italic>via</italic> dense visual correspondence</article-title>,&#x201d; in <source>IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>, <volume>2</volume>. <publisher-name>IEEE</publisher-name>, <fpage>16340</fpage>&#x2013;<lpage>16350</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr52733.2024.01546</pub-id>
</mixed-citation>
</ref>
<ref id="B74">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chi</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Burchfiel</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cousineau</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Dextairity: deformable manipulation can be a breeze</article-title>,&#x201d; in <source>Robotics: science and systems XVIII, New York city, NY, USA, June 27 - july 1, 2022</source>.</mixed-citation>
</ref>
<ref id="B75">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Xue</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Unifolding: towards sample-efficient, scalable, and generalizable robotic garment folding</article-title>,&#x201d; in <source>Conference on robot learning, CoRL 2023, 6-9 November 2023, Atlanta, GA, USA</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Tan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Toussaint</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Darvish</surname>
<given-names>K.</given-names>
</name>
</person-group>, <fpage>3321</fpage>&#x2013;<lpage>3341</lpage>.</mixed-citation>
</ref>
<ref id="B76">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Clothppo: a proximal policy optimization enhancing framework for robotic cloth manipulation with observation-aligned action spaces</article-title>,&#x201d; in <source>Proceedings of the thirty-third international joint conference on artificial intelligence, IJCAI 2024</source> (<publisher-loc>Jeju, South Korea</publisher-loc>: <publisher-name>ijcai.org</publisher-name>), <fpage>6895</fpage>&#x2013;<lpage>6903</lpage>.</mixed-citation>
</ref>
<ref id="B77">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Demiris</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Learning grasping points for garment manipulation in robot-assisted dressing</article-title>,&#x201d; in <source>2020 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>9114</fpage>&#x2013;<lpage>9120</lpage>.</mixed-citation>
</ref>
<ref id="B78">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>T. Z.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Learning fine-grained bimanual manipulation with low-cost hardware</article-title>,&#x201d; in <source>Robotics: science and systems XIX, Daegu, Republic of Korea, July 10-14, 2023</source>. <pub-id pub-id-type="doi">10.15607/RSS.2023.XIX.016</pub-id>
</mixed-citation>
</ref>
<ref id="B79">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Huo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Navarro-Alarcon</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Imitating tool-based garment folding from a single visual observation using hand-object graph dynamics</article-title>. <source>IEEE Trans. Industrial Inf.</source> <volume>20</volume>, <fpage>6245</fpage>&#x2013;<lpage>6256</lpage>. <pub-id pub-id-type="doi">10.1109/TII.2023.3342895</pub-id>
</mixed-citation>
</ref>
<ref id="B81">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cherubini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dune</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Navarro-Alarcon</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Alambeigi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Berenson</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Challenges and outlook in robotic manipulation of deformable objects</article-title>. <source>IEEE Robotics and Automation Mag.</source> <volume>29</volume>, <fpage>67</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1109/mra.2022.3147415</pub-id>
</mixed-citation>
</ref>
<ref id="B82">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zitkovich</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Rt-2: vision-language-action models transfer web knowledge to robotic control</article-title>,&#x201d; in <source>Conference on robot learning (PMLR)</source>, <fpage>2165</fpage>&#x2013;<lpage>2183</lpage>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>