<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2024.1401075</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Ontology based autonomous robot task processing framework</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Ge</surname> <given-names>Yueguang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2668550/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhang</surname> <given-names>Shaolin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Cai</surname> <given-names>Yinghao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2676837/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lu</surname> <given-names>Tao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Haitao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hui</surname> <given-names>Xiaolong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Shuo</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>The State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>The School of Artificial Intelligence, University of Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>The Center for Excellence in Brain Science and Intelligence Technology, Chinese Academy of Sciences</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Paloma de la Puente, Polytechnic University of Madrid, Spain</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Hang Zhong, Hunan University, China</p>
<p>Guohui Tian, Shandong University, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Yinghao Cai <email>yinghao.cai&#x00040;ia.ac.cn</email></corresp>
<corresp id="c002">Shuo Wang <email>shuo.wang&#x00040;ia.ac.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>05</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>18</volume>
<elocation-id>1401075</elocation-id>
<history>
<date date-type="received">
<day>14</day>
<month>03</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>04</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Ge, Zhang, Cai, Lu, Wang, Hui and Wang.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Ge, Zhang, Cai, Lu, Wang, Hui and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>In recent years, the perceptual capabilities of robots have been significantly enhanced. However, the task execution of the robots still lacks adaptive capabilities in unstructured and dynamic environments.</p></sec>
<sec>
<title>Methods</title>
<p>In this paper, we propose an ontology based autonomous robot task processing framework (ARTProF), to improve the robot&#x00027;s adaptability within unstructured and dynamic environments. ARTProF unifies ontological knowledge representation, reasoning, and autonomous task planning and execution into a single framework. The interface between the knowledge base and neural network-based object detection is first introduced in ARTProF to improve the robot&#x00027;s perception capabilities. A knowledge-driven manipulation operator based on Robot Operating System (ROS) is then designed to facilitate the interaction between the knowledge base and the robot&#x00027;s primitive actions. Additionally, an operation similarity model is proposed to endow the robot with the ability to generalize to novel objects. Finally, a dynamic task planning algorithm, leveraging ontological knowledge, equips the robot with adaptability to execute tasks in unstructured and dynamic environments.</p></sec>
<sec>
<title>Results</title>
<p>Experimental results on real-world scenarios and simulations demonstrate the effectiveness and efficiency of the proposed ARTProF framework.</p></sec>
<sec>
<title>Discussion</title>
<p>In future work, we will focus on refining the ARTProF framework by integrating neurosymbolic inference.</p></sec></abstract>
<kwd-group>
<kwd>service robot</kwd>
<kwd>knowledge-enabled robot</kwd>
<kwd>ontology</kwd>
<kwd>knowledge representation</kwd>
<kwd>task planning</kwd>
</kwd-group>
<counts>
<fig-count count="15"/>
<table-count count="4"/>
<equation-count count="1"/>
<ref-count count="36"/>
<page-count count="16"/>
<word-count count="8010"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Benefiting from the rapid advancements in artificial intelligence and robotics, the perception capabilities of robots have been significantly improved in recent years. Robots are now able to accomplish basic tasks such as object recognition, navigation, and manipulation. However, the task execution of the robots still lacks adaptive capabilities in unstructured and dynamic environments. Consider the basic task of retrieving apples. If the robot can visually perceive the apple, the robot is able to successfully grasp it and execute the corresponding action. However, if the robot is in an indoor environment and the apple is placed in a box or drawer, the robot lacks the ability to reason about the task. The lack of cognitive and reasoning ability poses a critical bottleneck for the robot to accomplish the task. Specifically, robots lack the fundamental understanding of commonsense knowledge. Their cognitive abilities remain confined to basic object recognition, limiting their capacity to perform tasks in unstructured and dynamic environments.</p>
<p>Symbolism believes that cognition is a form of symbolic processing in ontology, suggesting that human thought processes can always be described through specific symbols. Ontology, which can effectively describe the hierarchical structures and semantics of different concepts, has become an important tool for robot&#x00027;s reasoning capacities (Olivares-Alarcos et al., <xref ref-type="bibr" rid="B17">2019</xref>; Paulius and Sun, <xref ref-type="bibr" rid="B18">2019</xref>). Suh et al. (<xref ref-type="bibr" rid="B27">2007</xref>) proposed an ontology-based multi-level robot knowledge framework (OMRKF), which achieves the semantic cognitive representation of robots by defining four knowledge types: perception, model, activity, and context. Ontological knowledge reasoning is achieved through defining knowledge axioms and rules, providing the ability to query semantic knowledge effectively. Tenorth et al. (<xref ref-type="bibr" rid="B30">2010</xref>), Tenorth and Beetz (<xref ref-type="bibr" rid="B28">2013</xref>), Tenorth and Beetz (<xref ref-type="bibr" rid="B29">2017</xref>) and Beetz et al. (<xref ref-type="bibr" rid="B1">2018</xref>) proposed an ontology-based knowledge processing system named KnowRob, which built a semantic framework integrating multi-source heterogeneous information. KnowRob has the capacity for both knowledge representation and reasoning. Leveraging ontology as the knowledge carrier enables the effective characterization of multiple and complex classes, attributes and relationships of knowledge. Based on the representation of classes and attributes, KnowRob can align knowledge with objects in the real scenario and generate a large number of instance descriptions through inheritance operations. Rule-based reasoning approaches empower the customization of rules tailored to specific application scenarios, thereby enabling user-defined reasoning processes. Based on KnowRob, Beetz et al. 
(<xref ref-type="bibr" rid="B3">2010</xref>) proposed the CRAM cognitive framework (Beetz et al., <xref ref-type="bibr" rid="B3">2010</xref>, <xref ref-type="bibr" rid="B2">2023</xref>). This framework addresses the challenge of missing information in daily tasks by utilizing CRAM Plan Language (CPL) to build action plans. By leveraging knowledge reasoning, CRAM fills in the gaps in action plans, which enables robots to execute daily operations effectively. ORO (Lemaignan et al., <xref ref-type="bibr" rid="B12">2010</xref>; Lemaignan, <xref ref-type="bibr" rid="B11">2013</xref>) proposed a general knowledge representation framework for autonomous robot-human interaction processes. It aims to enhance the robot&#x00027;s interaction capabilities in complex human living environments. The tasks include object recognition, natural language interaction, task planning, and collaboration with other robots or humans. The knowledge in ORO is grounded on an upper-level ontology built on OpenCyc (Lenat, <xref ref-type="bibr" rid="B13">1995</xref>), which allows for the addition of new ontologies on top of the upper-level ontology. ORO employs Pellet (Sirin et al., <xref ref-type="bibr" rid="B26">2007</xref>) for ontology knowledge query and reasoning. Li et al. (<xref ref-type="bibr" rid="B14">2017</xref>) introduced the Smart and Networking Underwater Robots in Cooperation Meshes (SWARMs), which aims to address information heterogeneity and facilitate uniform comprehension among robots regarding exchanged information.</p>
<p>The aforementioned knowledge frameworks use ontology as the basis for knowledge representation and reasoning, which could provide rich semantic information for robots. OMRKF (Suh et al., <xref ref-type="bibr" rid="B27">2007</xref>) addresses the low-level perception by storing SIFT visual features in a hierarchical symbolic architecture, making it difficult to extend to more complex entities or actions. KnowRob adopts an encyclopedia form to build the semantic knowledge model, which lacks the top-level design for tasks. CRAM focuses on completing task parameters through knowledge but lacks emphasis on dynamically generating action execution sequences in tasks. The ORO knowledge management system highlights the interaction between robots and humans. The ontology in SWARMs is specialized for unmanned underwater robots, limiting its applicability to other types of robot applications. Moreover, representative work of knowledge frameworks such as RoboEarth (Waibel et al., <xref ref-type="bibr" rid="B32">2011</xref>), OPEN-EASE (Beetz et al., <xref ref-type="bibr" rid="B4">2015</xref>), and RoboBrain (Saxena et al., <xref ref-type="bibr" rid="B24">2014</xref>) emphasize more on knowledge sharing among different robots. They do not offer task processing tailored for robot manipulations in dynamic environments.</p>
<p>Recently, deep learning has achieved remarkable breakthroughs in vision tasks such as object detection and recognition. R-CNN series from R-CNN to Mask R-CNN (Girshick, <xref ref-type="bibr" rid="B7">2015</xref>; Ren et al., <xref ref-type="bibr" rid="B23">2015</xref>; He et al., <xref ref-type="bibr" rid="B9">2017</xref>; Bharati and Pramanik, <xref ref-type="bibr" rid="B5">2020</xref>), YOLO series (Redmon et al., <xref ref-type="bibr" rid="B20">2016</xref>; Redmon and Farhadi, <xref ref-type="bibr" rid="B21">2017</xref>, <xref ref-type="bibr" rid="B22">2018</xref>; Jiang et al., <xref ref-type="bibr" rid="B10">2022</xref>), and SSD (Liu et al., <xref ref-type="bibr" rid="B15">2016</xref>; Zhai et al., <xref ref-type="bibr" rid="B36">2020</xref>) are representative works in deep learning-based object detection. Deep learning-based approaches have greatly improved the performance of object detection and recognition compared with manually designed features. Meanwhile, Robot Operating System (ROS), as a communication framework specifically designed for robot software development, has attracted much attention. ROS hosts a variety of algorithms such as Gmapping for laser-based SLAM (Simultaneous Localization and Mapping) and MoveIt for robotic arm motion planning. In a semantic knowledge-assisted robot, beyond achieving the dynamic update of the knowledge base by combining robot perception system with the knowledge-driven decision-making control, it is also necessary to address the easy deployment of new perception and control algorithms. To this end, this paper proposes an ontology based autonomous robot task processing framework (ARTProF). This framework seamlessly integrates knowledge representation, knowledge reasoning, and autonomous task planning and execution.</p>
<p>ARTProF is based on ontological knowledge representation and reasoning. In the proposed framework, the instances in the knowledge base are generated with neural network-based object detection algorithms. The proposed framework also defines ROS-based manipulation operators for the robot, which establishes the connections between the primitive actions of the robot and the objects it interacts with. The integration with the ROS system facilitates the relationship between the knowledge system and the robot. Moreover, the proposed framework includes an operation similarity model for different objects. When the robot is operating a novel object, the robot&#x00027;s action is selected autonomously according to the similarity model, which endows the robot with the ability to generalize its manipulation to novel objects. Furthermore, ARTProF achieves dynamic task planning by leveraging knowledge reasoning. The robot can autonomously and dynamically organize action sequences to complete tasks in diverse environmental conditions. Compared with existing knowledge frameworks, ARTProF offers the following advantages: (1) ARTProF addresses the task demands in dynamic and uncertain environments by supporting the representation and reasoning of common sense and task knowledge, dynamic knowledge generation, task planning and execution. These functionalities provide comprehensive support for robot task execution. (2) An operation similarity model is proposed to facilitate operation transfer among different objects. Objects with similar characteristics are manipulated in a similar manner. (3) A dynamic task planning algorithm is proposed based on the ARTProF framework. The generated plans satisfy the execution constraints defined in the prior knowledge during robot task execution.</p>
<p>The paper is organized as follows. Section 2 introduces the basic architecture, knowledge representation and reasoning of the ARTProF. Section 3 gives the design and implementation of the perception system, where the instances in the knowledge base are dynamically generated. Section 4 introduces the knowledge-guided manipulation operators. Section 5 introduces the dynamic task planning based on knowledge reasoning. Section 6 presents our experimental results and analysis on both real-world scenarios and simulations. Finally, we conclude the paper in Section 7.</p></sec>
<sec id="s2">
<title>2 The framework of ARTProF</title>
<p>As the knowledge processing and task execution system for autonomous robots, ARTProF is able to handle diverse knowledge types such as environment knowledge and task knowledge. ARTProF also presents capacities for flexible knowledge reasoning. Through integration of the perception and control systems, ARTProF achieves knowledge-based autonomous control, enabling the robot to execute everyday manipulation tasks. The framework of ARTProF is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>The architecture of ARTProF. The knowledge representation layer utilizes the OWL language to describe static and dynamic ontology knowledge related to the environment and tasks. The knowledge reasoning layer achieves ontology parsing and constructs inference rules grounded in logic programming. Additionally, it integrates perception algorithms to implement logical reasoning process. The task planning and control layer incorporates a knowledge-based dynamic planning algorithm to enable autonomous robot task execution.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0001.tif"/>
</fig>
<p>In ARTProF, the ontological knowledge base is constructed using the description logic (DL), which includes classes, attributes, and instances needed to describe the object. The ontological knowledge base is extensible, allowing the derivation of new classes from existing ones and incorporation of new object instances. The ontological knowledge is denoted as <italic>D</italic> &#x0003D; (<italic>T, A</italic>), where <italic>T</italic> is the TBox (Terminology Box) and <italic>A</italic> is the ABox (Assertional Box). The TBox represents static knowledge built from commonsense knowledge and task-specific background information. It defines concepts and relationships between concepts such as abstract classes, inherent attributes, and relationships between classes. In contrast, the ABox represents dynamic knowledge derived from the real-time data acquired by the robot perception system. This dynamic knowledge represents specific events such as object instances, size, pose, and state. We use Web Ontology Language (OWL; Motik et al., <xref ref-type="bibr" rid="B16">2009</xref>) to store the description logic knowledge in XML-based files. Originally developed for knowledge representation in the semantic web, OWL has now become a general knowledge representation format capable of describing all aspects of objects, actions, time events, attributes, and their relationships.</p>
<p>The semantic knowledge of robot task processing is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The &#x0201C;Environment&#x0201D; class describes information related to the environment where the robot is located. It includes &#x0201C;Object&#x0201D; for the classes of the operated object and its associated semantic attributes, &#x0201C;State&#x0201D; for defining the object&#x00027;s state, &#x0201C;Map&#x0201D; for representing the semantic layout of the task environment, &#x0201C;Time&#x0201D; for temporal concepts, &#x0201C;Math&#x0201D; for mathematical models and algorithms, and &#x0201C;User&#x0201D; for robot user-related knowledge including identity and task-specific information such as usernames, habits, interests, etc. The &#x0201C;Task&#x0201D; class defines basic primitive actions necessary for the robot&#x00027;s task execution. Upon receiving a task, semantic understanding, action decomposition, and dynamic planning are facilitated through the &#x0201C;Task&#x0201D; class. The &#x0201C;Robot&#x0201D; class describes the attributes of the robot itself. It includes &#x0201C;Capability&#x0201D; for function descriptions, &#x0201C;Component&#x0201D; for hardware configuration, &#x0201C;Type&#x0201D; for the robot category (e.g., industrial and service), and &#x0201C;Status&#x0201D; for describing the robot&#x00027;s operational states (e.g., working, shutdown, and charging).</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>The semantic knowledge of robot task processing in ARTProF. The nodes and edges in the graph correspond to classes (instances) and properties in the OWL language, respectively.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0002.tif"/>
</fig>
<p>There are many inference tasks implemented by DL inference engines, such as Racer (Haarslev and M&#x000F6;ller, <xref ref-type="bibr" rid="B8">2001</xref>), Pellet (Sirin et al., <xref ref-type="bibr" rid="B26">2007</xref>), and HermiT (Shearer et al., <xref ref-type="bibr" rid="B25">2008</xref>), which are operated by maintaining a complete knowledge base in memory. This reasoning mechanism requires reasoning on the entire knowledge base whenever there are changes, which is both time-consuming and not suitable for the reasoning in dynamic environments. In ARTProF, we choose a purely memory-based infrastructure for efficiency. The knowledge query and reasoning engine of ARTProF utilizes the semweb library (Wielemaker et al., <xref ref-type="bibr" rid="B33">2003</xref>) as an XML parser to convert the XML parse-tree from the OWL ontology file into a Prolog list of triples. We further employ the rule-based reasoning used in SWI-Prolog (Wielemaker et al., <xref ref-type="bibr" rid="B34">2012</xref>). SWI-Prolog allows the rules to be customized according to specific application scenarios, enabling reasoning through querying the predicates (Vassiliadis et al., <xref ref-type="bibr" rid="B31">2009</xref>).</p>
<p>When the robot is interacting with objects, the robot needs to understand the object&#x00027;s attributes, infer the object&#x00027;s location, and formulate the manipulation policies. These decisions necessitate relating the abstracted knowledge about objects to the physical entities in the environment. Moreover, the knowledge base is required to associate with the robot perception system. To seamlessly integrate the perceived visual information into the knowledge query and reasoning processes, the perceptual fusion model is designed in ARTProF. This model synchronously transforms the perceived visual information into dynamic knowledge.</p>
<p><xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the interaction between the knowledge base and the perception system within ARTProF. ARTProF provides two connection modes between the knowledge base and the robot perception system: (1) synchronous communication (request-response mode). This mode allows on-demand perception of objects while querying the knowledge base to generate object instances. It achieves synchronous updates of the knowledge base by incorporating the perceived object instances. (2) Asynchronous communication (channel broadcast mode). The knowledge base asynchronously updates by passively listening to the published object detection results. The generated object instances mainly include attributes such as ID, category, pose, size, material, etc. Since the pose of the object can be varied, an intermediate perception instance is added to bridge the object instance and its pose, capturing the object&#x00027;s state at a specific timestamp.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>The interaction process between the knowledge base and the perception system in ARTProF. ARTProF provides two connection modes: synchronous communication (request-response mode) and asynchronous communication (channel broadcast mode).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0003.tif"/>
</fig>
<p>The robot control system transforms the control decision into a knowledge reasoning task. The robot control is obtained by querying the knowledge base, where different primitive actions of the robot are combined as illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>. The dynamic task planning module employs a dynamic planning algorithm (<xref ref-type="table" rid="T5">Algorithm 1</xref>). Upon acquiring task definitions through knowledge base queries, it derives a sequence of primitive actions required for the task. The action control module also inquires the object characteristics through knowledge queries. It then determines suitable manipulation operators according to an operation similarity model. Subsequently, it executes the corresponding primitive actions leveraging the ROS communication mechanism. The interaction between the robot control system and the knowledge query and reasoning engine is realized through calling Json_prolog in high-level languages such as Python, C&#x0002B;&#x0002B;, and Java.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Knowledge-based control system in ARTProF. The robot control system transforms the control decision into a knowledge reasoning task. The robot control is obtained by querying the knowledge base, where different primitive actions of the robot are combined.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0004.tif"/>
</fig>
<table-wrap position="float" id="T5">
<label>Algorithm 1</label>
<caption><p>Dynamic task planning algorithm.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-i0001.tif"/>
</table-wrap>
</sec>
<sec id="s3">
<title>3 The perception system</title>
<p>In ARTProF, we present a unified communication interface between the knowledge base and the perception system. Recently, deep learning has revolutionized object detection. The performance of object detection has been improved significantly (Girshick, <xref ref-type="bibr" rid="B7">2015</xref>; Ren et al., <xref ref-type="bibr" rid="B23">2015</xref>; Redmon et al., <xref ref-type="bibr" rid="B20">2016</xref>; He et al., <xref ref-type="bibr" rid="B9">2017</xref>; Redmon and Farhadi, <xref ref-type="bibr" rid="B21">2017</xref>, <xref ref-type="bibr" rid="B22">2018</xref>; Bharati and Pramanik, <xref ref-type="bibr" rid="B5">2020</xref>; Jiang et al., <xref ref-type="bibr" rid="B10">2022</xref>). Moreover, the 6D pose of the detected objects can be obtained through PoseCNN (Xiang et al., <xref ref-type="bibr" rid="B35">2018</xref>), PVNet (Peng et al., <xref ref-type="bibr" rid="B19">2019</xref>), and SilhoNet (Billings and Johnson-Roberson, <xref ref-type="bibr" rid="B6">2019</xref>), etc. By defining a unified communication interface between the knowledge base and neural network-based perception algorithms, detection and recognition of objects at different abstraction levels can be achieved.</p>
<p><xref ref-type="fig" rid="F5">Figure 5</xref> illustrates the communication modes between the knowledge base and the perception system. The &#x0201C;comp_object_detect&#x0201D; and &#x0201C;object_detect_listener&#x0201D; are different communication modes: synchronous on-demand in a request-response manner (&#x0201C;comp_object_detect&#x0201D;) and asynchronous in a passive listening manner (&#x0201C;object_detect_listener&#x0201D;), respectively.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>The communication modes between the knowledge base and the perception system in ARTProF: synchronous on-demand in a request-response manner and asynchronous in a passive listening manner.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0005.tif"/>
</fig>
<p>In the synchronous communication mode, the object perception algorithm is encapsulated as a ROS service node. The connection is triggered by a custom Prolog predicate, which implements ROS service invocation, accesses the knowledge base, and processes the data returned by the perception system. When a user queries the knowledge base for a specific object class, &#x0201C;comp_object_detect&#x0201D; initiates the object detection and recognition request to the perception system. Then, the knowledge query and reasoning system filters the commonsense knowledge based on data returned by the perception system, identifying object instances that match the query or its subclasses. The data returned by the perception system includes object category labels. The filtering process involves determining whether the label is defined as a class in the knowledge base, and it is the process of querying either the category itself or its subclasses. This filtering process can be implemented using the Prolog built-in predicate &#x0201C;rdfs_subclass_of.&#x0201D; Taking the query for fruits as an example, when the perception system detects and locates objects such as apples, bananas, and plates, by evaluating whether the return value of &#x0201C;rdfs_subclass_of(L, &#x00027;Fruit&#x00027;)&#x0201D; is true (where L represents the object label returned by the perception system, and &#x0201C;Fruit&#x0201D; represents the queried fruit category), it can be determined that apples and bananas belong to the fruit category, thereby creating and returning instances of apples and bananas. Finally, all object instances belonging to the queried class are returned to the user. The synchronous communication pipeline is shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. Meanwhile, in the asynchronous communication mode, the perception algorithm is encapsulated as a ROS publisher node, and object instance generation is implemented by calling a custom Prolog predicate in subscriber nodes. 
&#x0201C;object_detect_listener&#x0201D; monitors sensory data from the perception system, detecting and generating object instances at regular intervals. The knowledge representation of object instances includes ID, class, timestamp, size, and pose, which can be extended according to task requirements and perception algorithms. <xref ref-type="fig" rid="F7">Figure 7</xref> displays the partial knowledge topology of &#x0201C;Instance1&#x0201D; of &#x0201C;Class1.&#x0201D; The &#x0201C;latestDetection&#x0201D; indicates the most recent detection results.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Synchronous communication. Querying a specific object through &#x0201C;comp_object_detect&#x0201D; in the knowledge base triggers the perception system to acquire instances of the queried object in the environment through synchronous communication.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0006.tif"/>
</fig>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Knowledge topology of the &#x0201C;Instance1.&#x0201D; The nodes in the graph are represented by classes or instances in OWL, and the edges are represented by properties in OWL.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0007.tif"/>
</fig>
<p>We choose YOLO v3 for object detection, which strikes a good balance between accuracy and efficiency. Here, YOLO v3 can be replaced with any object detection algorithm. In <xref ref-type="fig" rid="F8">Figure 8</xref>, three types of objects, namely apple, banana and box, are queried and recognized. It can be seen from <xref ref-type="fig" rid="F8">Figure 8</xref> that there is information exchanged between the knowledge base and the perception system, which can be extended to multi-class objects or objects of different abstraction levels. Deep learning-based object detection and recognition approaches improve the generalization and scalability of ARTProF for object perception. Consequently, the robot is able to associate the physical objects detected in the environment with the abstracted knowledge about object classes.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Object instance inquiry based on the knowledge base and perception system. Through the query interface, we can not only obtain instances of labeled objects in the perception system but also acquire instances of abstract concepts (such as &#x0201C;fruit&#x0201D;).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0008.tif"/>
</fig>
</sec>
<sec id="s4">
<title>4 Knowledge-guided manipulation operators</title>
<sec>
<title>4.1 Action knowledge representation</title>
<p>Action knowledge is used to describe the actions executed by robots in manipulation tasks. In ARTProF, the relationships between the actions and objects are defined in the constraint attributes {<italic>preActors, postActors</italic>}, where <italic>preActors</italic> are the pre-conditions necessary for the action, specifying the action properties that need to be satisfied before the action is executed. <italic>postActors</italic> define the post-effects of the action, describing the environment&#x00027;s state after successful execution of the action.</p>
<p>As shown in <xref ref-type="fig" rid="F9">Figure 9A</xref>, <italic>preActors</italic> include: {<italic>objectActedOn</italic>, <italic>performedBy</italic>, <italic>fromLocation</italic>, <italic>fromState</italic>}, representing the operated object, the action execution agent, initial position and the initial state of the operated object, respectively. The <italic>postActors</italic> include: {<italic>outputs</italic>, <italic>toLocation</italic>, <italic>toState</italic>}, representing the output object, target position and target state of the operated object, respectively. The action attribute is defined using OWL pseudo-code, as shown in <xref ref-type="fig" rid="F9">Figure 9B</xref>. <xref ref-type="fig" rid="F9">Figure 9C</xref> showcases the OWL pseudo-code of two basic actions: picking up and putting down.</p>
<fig id="F9" position="float">
<label>Figure 9</label>
<caption><p>Action knowledge representation. <bold>(A)</bold> The hierarchy of the action attributes. <bold>(B)</bold> The definition of the action attributes. <bold>(C)</bold> An example of the knowledge representation of the action.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0009.tif"/>
</fig>
</sec>
<sec>
<title>4.2 Manipulation operator</title>
<p>In ARTProF, manipulation operators based on ROS are designed to guide robot actions via knowledge. The integration with ROS is to maximize the utilization of existing robot operation algorithms. These operators are instances of the &#x0201C;Algorithm&#x0201D; class, which is a subclass of &#x0201C;Math&#x0201D; in the ontological knowledge base. The constraints for actions and objects are respectively fulfilled by &#x0201C;operatorAction&#x0201D; and &#x0201C;objectActedOn.&#x0201D; Basic operation algorithms on ROS are associated with ROS-related properties such as &#x0201C;serviceName,&#x0201D; &#x0201C;serviceReq,&#x0201D; &#x0201C;serviceRes,&#x0201D; and &#x0201C;serviceSrv.&#x0201D;</p>
<p>More specifically, as shown in <xref ref-type="fig" rid="F10">Figure 10</xref>, the &#x0201C;PrimitiveAction&#x0201D; denotes the primitive action class in the knowledge representation. The &#x0201C;Object&#x0201D; corresponds to the &#x0201C;Object&#x0201D; class in the knowledge base, which describes the category of the operated object and its related semantic attributes. &#x0201C;service name,&#x0201D; &#x0201C;request data,&#x0201D; &#x0201C;response data,&#x0201D; and &#x0201C;service data type&#x0201D; are the service name, function name for obtaining the requested data, response data name, and service data type defined in ROS, respectively.</p>
<fig id="F10" position="float">
<label>Figure 10</label>
<caption><p>Manipulation operator. Manipulation operator is defined as instances of the &#x0201C;Algorithm&#x0201D; class in OWL, guiding robot actions based on ROS.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0010.tif"/>
</fig>
<p>For example, in the task of &#x0201C;grasp an apple,&#x0201D; a manipulation operator named &#x0201C;PickingUpAnApple&#x0201D; is defined in ARTProF. The OWL pseudo-code of this operator is described as:</p>
<preformat>
Individual: PickingUpAnApple
&#x000A0;&#x000A0;type: Algorithm
&#x000A0;&#x000A0;operatorAction: PickingUpAnObject
&#x000A0;&#x000A0;objectActedOn: Apple
&#x000A0;&#x000A0;serviceName: &#x0201C;pickingUpApple&#x0201D;
&#x000A0;&#x000A0;serviceReq: &#x0201C;pickingUpAppleReq&#x0201D;
&#x000A0;&#x000A0;serviceRes: &#x0201C;status&#x0201D;
&#x000A0;&#x000A0;serviceSrv: &#x0201C;pickUpApple&#x0201D;
&#x000A0;
ObjectProperty: operatorAction
&#x000A0;&#x000A0;domain: Algorithm
&#x000A0;&#x000A0;range: Action
&#x000A0;
DatatypeProperty: serviceName
&#x000A0;&#x000A0;domain: Algorithm
&#x000A0;&#x000A0;range: string
...
</preformat>
<p>During task execution, the robot first queries the knowledge base using the primitive action &#x0201C;PickingUpAnObject&#x0201D; and the operated object &#x0201C;Apple&#x0201D; to determine the manipulation operator &#x0201C;PickingUpAnApple.&#x0201D; The &#x0201C;grasp&#x0201D; operation is then invoked through ROS-related attributes of the manipulation operator to finish the task. The process of searching and reasoning for the manipulation operator is represented by Prolog pseudo-code as follows:</p>
<preformat>
get_operator(Action, Object, Operator) :-
&#x000A0;&#x000A0;rdf(Operator, type, Algorithm),
&#x000A0;&#x000A0;rdf(Operator, operatorAction, Action),
&#x000A0;&#x000A0;rdf(Operator, objectActedOn, Object).
</preformat>
<p>If the &#x0201C;Action&#x0201D; (the primitive action) and the &#x0201C;Object&#x0201D; (the operated object) are determined, the manipulation operator &#x0201C;PickingUpAnApple&#x0201D; is obtained through the &#x0201C;get_operator&#x0201D; operator.</p>
</sec>
<sec>
<title>4.3 Object operation similarity model</title>
<p>In human experience, objects with similar characteristics can be manipulated in a similar manner. For example, apples and oranges, or water bottles and milk bottles, can be manipulated using the same manipulation operator. To endow the robot with similar flexibility, ARTProF introduces an object operation similarity model. This model determines a suitable manipulation operator for an unknown object based on object similarity. The object similarity is defined as:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:mi>S</mml:mi><mml:mi>I</mml:mi><mml:mi>M</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mi>A</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mi>k</mml:mi><mml:mo>*</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>SIM</italic> denotes the object similarity with values in the range [0, 1]. Higher values indicate larger object similarity. <italic>a</italic><sub>1</sub> and <italic>a</italic><sub>2</sub> are two object instances being compared. <italic>A</italic> is the set of object instances. <italic>N</italic> is the number of object features. <italic>S</italic><sub><italic>i</italic></sub> represents the similarity of objects based on feature <italic>i</italic>. <italic>k</italic> is used to adjust the slope of the sigmoid function. Decreasing the value of <italic>k</italic> will slow down the speed of approaching the limiting values.</p>
<p>The object similarity model extracts object features that directly influence the robot&#x00027;s manipulation of the object. The similarity of object features is then computed. The object features include: {category, shape, material, and size}. The influences of different features on the object similarity are manually defined. <xref ref-type="table" rid="T1">Table 1</xref> shows object similarities with the same, different or unknown conditions of category, shape, and material. <xref ref-type="table" rid="T2">Table 2</xref> shows object similarities of different sizes, measured using deviation intervals <italic>d</italic>. The volume deviation uses a unit of 10<italic>cm</italic><sup>3</sup>, while length, width, and height deviations use units of 2<italic>cm</italic>, 1<italic>cm</italic>, and 1<italic>cm</italic>, respectively. The object similarities are shown in <xref ref-type="table" rid="T3">Table 3</xref> [<italic>k</italic> is set to 0.3 in <xref ref-type="disp-formula" rid="E1">Equation (1)</xref>]. The results align with human cognition.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Category, shape, and material feature similarity measure.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><inline-graphic xlink:href="fnbot-18-1401075-i0002.tif"/></th>
<th valign="top" align="center"><bold>Same</bold></th>
<th valign="top" align="center"><bold>Different</bold></th>
<th valign="top" align="center"><bold>Unknown</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Category</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
</tr> <tr>
<td valign="top" align="left">Shape</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">-4</td>
<td valign="top" align="center">0</td>
</tr> <tr>
<td valign="top" align="left">Material</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">-1</td>
<td valign="top" align="center">0</td>
</tr></tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Size feature similarity measure.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><inline-graphic xlink:href="fnbot-18-1401075-i0003.tif"/></th>
<th valign="top" align="center"><bold>|<italic>x</italic>| &#x02264; 1<italic>d</italic></bold></th>
<th valign="top" align="center"><bold>1<italic>d</italic> &#x0003C; |<italic>x</italic>| &#x02264; 2<italic>d</italic></bold></th>
<th valign="top" align="center"><bold>2<italic>d</italic> &#x0003C; |<italic>x</italic>| &#x02264; 3<italic>d</italic></bold></th>
<th valign="top" align="center"><bold>3<italic>d</italic> &#x0003C; |<italic>x</italic>| &#x02264; 5<italic>d</italic></bold></th>
<th valign="top" align="center"><bold>|<italic>x</italic>|&#x0003E;5<italic>d</italic></bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Volume (cm<sup>3</sup>)</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">-1</td>
</tr> <tr>
<td valign="top" align="left">Length (cm)</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">-1</td>
<td valign="top" align="center">-1</td>
</tr> <tr>
<td valign="top" align="left">Width (cm)</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">-1</td>
<td valign="top" align="center">-2</td>
<td valign="top" align="center">-2</td>
</tr> <tr>
<td valign="top" align="left">Height (cm)</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">-1</td>
<td valign="top" align="center">-1</td>
</tr></tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Example of object similarity.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><inline-graphic xlink:href="fnbot-18-1401075-i0004.tif"/></th>
<th valign="top" align="center"><bold>Apple</bold></th>
<th valign="top" align="center"><bold>Orange</bold></th>
<th valign="top" align="center"><bold>Water bottle</bold></th>
<th valign="top" align="center"><bold>Milk bottle</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Apple</td>
<td valign="top" align="center">1.0</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.14</td>
<td valign="top" align="center">0.23</td>
</tr> <tr>
<td valign="top" align="left">Orange</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">1.0</td>
<td valign="top" align="center">0.14</td>
<td valign="top" align="center">0.23</td>
</tr> <tr>
<td valign="top" align="left">Water bottle</td>
<td valign="top" align="center">0.14</td>
<td valign="top" align="center">0.14</td>
<td valign="top" align="center">1.0</td>
<td valign="top" align="center">0.77</td>
</tr> <tr>
<td valign="top" align="left">Milk bottle</td>
<td valign="top" align="center">0.23</td>
<td valign="top" align="center">0.23</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">1.0</td>
</tr></tbody>
</table>
</table-wrap>
<p>The category and size features can be obtained in real-time through perceptual algorithms such as YOLO v3 and SSD. The shape and material are static features defined directly in the knowledge base through attributes &#x0201C;shapeOfObject&#x0201D; and &#x0201C;materialOfObject.&#x0201D; The manipulation operator is then searched based on the object operation similarity model. All manipulation operators associated with the basic action are identified. The similarity between the current operated object and the object in each manipulation operator is calculated. Finally, the manipulation operator with the highest similarity is selected, and the corresponding action is executed.</p></sec>
</sec>
<sec id="s5">
<title>5 Dynamic task planning</title>
<sec>
<title>5.1 Task knowledge representation</title>
<p>A robotic task can often be decomposed into several low-level primitive actions. The aim of the task planning is to rearrange the primitive actions with preconditions and effects to achieve the task. The knowledge representation in ARTProF adopts a hierarchical structure. The OWL pseudo-code for retrieving an apple is described as:</p>
<preformat>
Class: TakeAnAppleToPlate
&#x000A0;&#x000A0;subClassOf: PuttingSomethingSomewhere
&#x000A0;&#x000A0;objectActedOn some Apple
&#x000A0;&#x000A0;toLocation in Plate
&#x000A0;
Class: PuttingSomethingSomewhere
&#x000A0;&#x000A0;subClassOf: Action
&#x000A0;&#x000A0;subAction some PickingUpAnObject
&#x000A0;&#x000A0;subAction some PuttingDownAnObject
&#x000A0;
Class: PickingUpAnObject
&#x000A0;&#x000A0;subClassOf: Action
&#x000A0;&#x000A0;objectActedOn some Object
&#x000A0;
Class: PuttingDownAnObject
&#x000A0;&#x000A0;subClassOf: Action
&#x000A0;&#x000A0;objectActedOn some Object
&#x000A0;&#x000A0;toLocation some Place
...
</preformat>
<p>The task of &#x0201C;TakeAnAppleToPlate&#x0201D; is defined as a subclass of &#x0201C;PuttingSomethingSomewhere.&#x0201D; The predicates &#x0201C;objectActedOn&#x0201D; and &#x0201C;toLocation&#x0201D; are used to define the operated object and its target location. The &#x0201C;PuttingSomethingSomewhere&#x0201D; class as a subclass of &#x0201C;Action&#x0201D; comprises sub-actions &#x0201C;PickingUpAnObject&#x0201D; and &#x0201C;PuttingDownAnObject.&#x0201D; The sub-action &#x0201C;PickingUpAnObject,&#x0201D; as a subclass of &#x0201C;Action,&#x0201D; is constrained by the predicate &#x0201C;objectActedOn.&#x0201D; The sub-action &#x0201C;PuttingDownAnObject,&#x0201D; which is also a subclass of &#x0201C;Action,&#x0201D; is constrained by the predicates &#x0201C;objectActedOn&#x0201D; and &#x0201C;toLocation.&#x0201D;</p>
</sec>
<sec>
<title>5.2 Task execution</title>
<p>Task knowledge guides the process of robotic task execution. Upon receiving a task execution command, the robot initiates the task planning process by inquiring the task knowledge from the knowledge base to create the task instance. The action execution sequence is then obtained. The action instances are generated following the constraints of the task (e.g., preconditions and effects). Both task instances and action instances are stored in memory as temporary knowledge using a Prolog-based representation. The pseudo-code of the generated task and action example of retrieving an apple is described as:</p>
<preformat>
Instance: TakeAnAppleToPlate_001
&#x000A0;&#x000A0;type: TakeAnAppleToPlate
&#x000A0;&#x000A0;objectActedOn: Apple
&#x000A0;&#x000A0;toLocation: in Plate
&#x000A0;&#x000A0;subAction : PickingUpAnObject_001
&#x000A0;&#x000A0;subAction : PuttingDownAnObject_001
&#x000A0;
Instance: PickingUpAnObject_001
&#x000A0;&#x000A0;type: PickingUpAnObject
&#x000A0;&#x000A0;objectActedOn: Apple
&#x000A0;
Instance: PuttingDownAnObject_001
&#x000A0;&#x000A0;type: PuttingDownAnObject
&#x000A0;&#x000A0;objectActedOn: Apple
&#x000A0;&#x000A0;toLocation: in Plate
...
</preformat>
<p>The generated task instance &#x0201C;TakeAnAppleToPlate_001&#x0201D; includes two action instances &#x0201C;PickingUpAnObject_001&#x0201D; and &#x0201C;PuttingDownAnObject_001.&#x0201D; The action instance &#x0201C;PickingUpAnObject_001&#x0201D; is subject to task constraints. The abstract operated object &#x0201C;Object&#x0201D; is replaced by the specific object &#x0201C;Apple.&#x0201D; Similarly, the abstract position &#x0201C;Some Place&#x0201D; is replaced by &#x0201C;in Plate&#x0201D; in &#x0201C;PuttingDownAnObject_001.&#x0201D;</p>
</sec>
<sec>
<title>5.3 Dynamic task planning</title>
<p>In ARTProF, we introduce a dynamic task planning method called Action Primitive Conditional Exploration Dynamic Task Planning. The robot performs each primitive action based on its prior task knowledge. During task execution, if a primitive action fails to meet the execution preconditions, the system initiates a search for a new action sequence through knowledge reasoning. This process continues until a valid primitive action, meeting the execution preconditions, is successfully executed. The task is considered completed once all defined primitive actions have been carried out.</p>
<p>Suppose the task is denoted by <italic>T</italic>, and the action is denoted by <italic>a</italic>. The task can be represented as <italic>T</italic> &#x0003D; [<italic>a</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, <italic>a</italic><sub><italic>i</italic></sub>, &#x02026;, <italic>a</italic><sub><italic>n</italic></sub>], where <italic>a</italic><sub><italic>i</italic></sub> represents the <italic>i</italic>-th primitive action in the action sequence. This task comprises an ordered composition of <italic>n</italic> primitive actions. The description of the algorithm is shown in <xref ref-type="table" rid="T5">Algorithm 1</xref>.</p>
<p>In <xref ref-type="table" rid="T5">Algorithm 1</xref>, <italic>T</italic>&#x02032; denotes the set of actions generated, initialized to &#x02205;. <italic>len</italic> is the action depth, initialized to 0. When an action fails to meet the execution condition, the action depth is increased by 1, and this process iterates. The maximum action depth is denoted by <italic>MAXLEN</italic>. <italic>PS</italic> is the set of action sequences generated through knowledge reasoning represented by <italic>getPreActionsList</italic>. <italic>P</italic> denotes the action sequence that meets the execution conditions for the current action. The function <italic>actionExe</italic>(<italic>a</italic>) denotes the robot&#x00027;s execution of action <italic>a</italic>. By default, if action <italic>a</italic> fails during execution and still meets the execution conditions, it will be executed again until successful.</p>
<p>Assuming task <italic>T</italic> &#x0003D; [<italic>a</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, <italic>a</italic><sub><italic>i</italic></sub>, ..., <italic>a</italic><sub><italic>n</italic></sub>], the execution process of the dynamic task planning algorithm is shown in <xref ref-type="fig" rid="F11">Figure 11</xref>. Initially, the system executes <italic>a</italic><sub>1</sub> and plans <italic>a</italic><sub>2</sub> next. As <italic>a</italic><sub>2</sub> fails the execution conditions, the system explores alternative action sequences [<italic>b</italic><sub>1</sub>, <italic>b</italic><sub>2</sub>, <italic>a</italic><sub>2</sub>] and [<italic>c</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>] through knowledge reasoning. After checking that <italic>b</italic><sub>1</sub> does not meet the execution conditions, and the action depth of <italic>d</italic><sub>1</sub> is already at the maximum depth (<italic>MAXLEN</italic>=2), the system backtracks to execute [<italic>c</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>]. Upon evaluation, [<italic>e</italic><sub>1</sub>, <italic>e</italic><sub>2</sub>, <italic>c</italic><sub>1</sub>] are successfully executed, and the system implements <italic>a</italic><sub>2</sub>, <italic>a</italic><sub>3</sub>, etc.</p>
<fig id="F11" position="float">
<label>Figure 11</label>
<caption><p>The execution process of the dynamic task planning. The initial action sequence for the task is [<italic>a</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, <italic>a</italic><sub>3</sub>, ..., <italic>a</italic><sub><italic>n</italic>&#x02212;1</sub>, <italic>a</italic><sub><italic>n</italic></sub>], and during the task execution process, dynamic planning is triggered due to some actions not satisfying the execution conditions. The actual execution action sequence becomes [<italic>a</italic><sub>1</sub>, <italic>e</italic><sub>1</sub>, <italic>e</italic><sub>2</sub>, <italic>c</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, <italic>a</italic><sub>3</sub>, ..., <italic>a</italic><sub><italic>n</italic>&#x02212;1</sub>, <italic>f</italic><sub>1</sub>, <italic>a</italic><sub><italic>n</italic></sub>].</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0011.tif"/>
</fig>
<p>The dynamic task planning relies on task ontological knowledge as the prior knowledge. Throughout the execution of robotic tasks, new actions are explored by considering both depth and breadth. This strategy empowers the robot with adaptability to work in unstructured environments while avoiding the shortcomings of manually editing domain knowledge in traditional task planners.</p></sec>
</sec>
<sec id="s6">
<title>6 Experimental results and analysis</title>
<sec>
<title>6.1 Experiment 1: autonomous retrieval of manipulation operators with objects in the scene</title>
<p>The experiments are carried out in a laboratory environment shown in <xref ref-type="fig" rid="F12">Figure 12</xref>. The robotic system, named RedBot, is equipped with two 6-degrees-of-freedom (DOF) manipulators, a mobile platform, two single line radars, an industrial computer, and an RGB-D camera (Intel RealSense D435i). The software modules include mapping, navigation, obstacle avoidance, object detection and recognition, grasping, etc. These modules communicate through ROS messages. The robot is controlled by a back-end server. The back-end platform utilizes containers to isolate distinct software environments. The data exchange between the robot and the back-end platform is facilitated through Ethernet.</p>
<fig id="F12" position="float">
<label>Figure 12</label>
<caption><p>Experimental environment for autonomous retrieval of manipulation operators with objects.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0012.tif"/>
</fig>
<p>The robot demonstrates capability in grasping various objects such as oranges, cups, and footballs, as indicated in <xref ref-type="table" rid="T4">Table 4</xref>. Corresponding manipulation operators for these grasp actions are denoted as &#x0201C;pickingUpOrange,&#x0201D; &#x0201C;pickingUpCup,&#x0201D; and &#x0201C;pickingUpFootball.&#x0201D; However, real-world scenarios often introduce novel objects to the robot. In our experiments, we also conduct grasping experiments involving apples, water bottles, and watermelons, of which the manipulation operators are not specified in the knowledge base.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Features of objects to be grasped.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Object</bold></th>
<th valign="top" align="left"><bold>Category</bold></th>
<th valign="top" align="left"><bold>Shape</bold></th>
<th valign="top" align="left"><bold>Material</bold></th>
<th valign="top" align="left"><bold>Volume (<italic>cm</italic><sup>3</sup>)</bold></th>
<th valign="top" align="left"><bold>Length (<italic>cm</italic>)</bold></th>
<th valign="top" align="left"><bold>Width (<italic>cm</italic>)</bold></th>
<th valign="top" align="left"><bold>Height (<italic>cm</italic>)</bold></th>
<th valign="top" align="left"><bold>Manipulation operator</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Orange</td>
<td valign="top" align="left">Orange</td>
<td valign="top" align="left">Sphere</td>
<td valign="top" align="left">Fruit</td>
<td valign="top" align="left">512</td>
<td valign="top" align="left">8</td>
<td valign="top" align="left">8</td>
<td valign="top" align="left">8</td>
<td valign="top" align="left">pickingUpOrange</td>
</tr> <tr>
<td valign="top" align="left">Cup</td>
<td valign="top" align="left">Cup</td>
<td valign="top" align="left">Cylinder</td>
<td valign="top" align="left">Metal</td>
<td valign="top" align="left">540</td>
<td valign="top" align="left">6</td>
<td valign="top" align="left">6</td>
<td valign="top" align="left">15</td>
<td valign="top" align="left">pickingUpCup</td>
</tr> <tr>
<td valign="top" align="left">Football</td>
<td valign="top" align="left">Football</td>
<td valign="top" align="left">Sphere</td>
<td valign="top" align="left">Leather</td>
<td valign="top" align="left">8,000</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">pickingUpFootball</td>
</tr> <tr>
<td valign="top" align="left">Apple</td>
<td valign="top" align="left">Apple</td>
<td valign="top" align="left">Sphere</td>
<td valign="top" align="left">Fruit</td>
<td valign="top" align="left">512</td>
<td valign="top" align="left">8</td>
<td valign="top" align="left">8</td>
<td valign="top" align="left">8</td>
<td valign="top" align="left">pickingUpOrange</td>
</tr> <tr>
<td valign="top" align="left">Waterbottle</td>
<td valign="top" align="left">Bottle</td>
<td valign="top" align="left">Cylinder</td>
<td valign="top" align="left">Plastic</td>
<td valign="top" align="left">720</td>
<td valign="top" align="left">6</td>
<td valign="top" align="left">6</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">pickingUpCup</td>
</tr> <tr>
<td valign="top" align="left">Watermelon</td>
<td valign="top" align="left">Watermelon</td>
<td valign="top" align="left">Sphere</td>
<td valign="top" align="left">Fruit</td>
<td valign="top" align="left">8,000</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">pickingUpFootball</td>
</tr></tbody>
</table>
</table-wrap>
<p>In our grasping experiment, the robot is placed in front of a table where the objects are placed. As there is no manipulation operator for grasping apples, water bottles, and watermelons in our knowledge base, the robot selects the manipulation operators by evaluating the similarities between objects. The object&#x00027;s shape and material are obtained from the knowledge base as prior knowledge. The perception system of ARTProF provides measurements for the length, width, and height of the objects, from which the volume is calculated as <italic>V</italic> &#x0003D; <italic>L</italic>&#x000D7;<italic>W</italic>&#x000D7;<italic>H</italic>. The similarity between objects is calculated as shown in <xref ref-type="fig" rid="F13">Figure 13A</xref>. Darker boxes indicate lower similarity. The robot grasping of different objects and the corresponding manipulation operators are illustrated in <xref ref-type="fig" rid="F13">Figure 13B</xref>. The experiments highlight the robot&#x00027;s ability not only to autonomously match manipulation operators for objects in the scene but also to generalize operators across different objects.</p>
<fig id="F13" position="float">
<label>Figure 13</label>
<caption><p>The experimental results of operation transfer. <bold>(A)</bold> The similarities of the objects to be grasped. Darker cells indicate lower similarities between objects. <bold>(B)</bold> The robot grasping of different objects and the corresponding manipulation operators. The grabbing of apples and oranges uses the same manipulation operator &#x0201C;pickingUpOrange.&#x0201D; Similarly, the grabbing of the cup and water bottle uses the same manipulation operator &#x0201C;pickingUpCup,&#x0201D; and the grabbing of the football and watermelon uses the same manipulation operator &#x0201C;pickingUpFootball&#x0201D;.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0013.tif"/>
</fig>
</sec>
<sec>
<title>6.2 Experiment 2: dynamic task planning</title>
<p>This experiment takes retrieving an apple as an example to achieve task planning in unknown environments. The dynamic task planning experiment is conducted in the simulation platform CoppeliaSim (version 4.1.0). The indoor experimental setting, as shown in <xref ref-type="fig" rid="F14">Figure 14A</xref>, includes a YouBot by KUKA, an apple, a plate, a box, a cabinet with drawers, and two tables. The semantic map of the task-related background knowledge is shown in <xref ref-type="fig" rid="F14">Figure 14B</xref>, where the dark nodes are task knowledge, and the light nodes are environmental knowledge. The grasping task is defined as two actions: picking up and putting down. The knowledge representation of the task is presented in Section 5.1. In scene 1 and scene 2, the apple is placed on the table, as shown in <xref ref-type="fig" rid="F15">Figures 15a1</xref>, <xref ref-type="fig" rid="F15">b1</xref>. In scene 3, the apple is in the box, as shown in <xref ref-type="fig" rid="F15">Figure 15c1</xref>. In scene 4, the apple is in the drawer, as shown in <xref ref-type="fig" rid="F15">Figure 15d1</xref>.</p>
<fig id="F14" position="float">
<label>Figure 14</label>
<caption><p>The scenario and the background knowledge of the dynamic task planning. <bold>(A)</bold> The scenario of the dynamic task planning. The location of the apple is uncertain: it could be placed on the table near the robot or in a box or drawer far from the robot. <bold>(B)</bold> The semantic map of the task-related background knowledge. The dark nodes are task knowledge, and the light nodes are environmental knowledge.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0014.tif"/>
</fig><fig id="F15" position="float">
<label>Figure 15</label>
<caption><p>Screenshots of the robot&#x00027;s dynamic task planning experiment. <bold>(a1&#x02013;a5)</bold> The apple is placed on the table, and there are no instances of action execution failure. <bold>(b1&#x02013;b7)</bold> The apple is placed on the table, and the first attempt at the &#x0201C;PickingUpAnObject&#x0201D; action fails. <bold>(c1&#x02013;c9)</bold> The apple is in the box. <bold>(d1&#x02013;d12)</bold> The apple is in the drawer.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1401075-g0015.tif"/>
</fig>
<p>In scene 1, the &#x0201C;pick-up&#x0201D; action is executed successfully. However, the subsequent &#x0201C;put-down&#x0201D; action fails because the &#x0201C;toLocation&#x0201D; (plate location) is beyond the robot arm&#x00027;s operational range. According to the dynamic task planning, the &#x0201C;navigatingTo&#x0201D; action is triggered. Once the navigation is completed, fulfilling the conditions for the &#x0201C;put-down&#x0201D; action, the &#x0201C;put-down&#x0201D; action is executed accordingly. The final action sequence involves picking up the apple &#x02192; navigating to the plate &#x02192; putting down the apple, as illustrated in <xref ref-type="fig" rid="F15">Figures 15a2</xref>&#x02013;<xref ref-type="fig" rid="F15">a5</xref>.</p>
<p>In scene 2, during the execution of the &#x0201C;pick-up&#x0201D; action, the apple falls, resulting in the failure of the action execution. However, since the conditions for executing the &#x0201C;pick-up&#x0201D; action are still met at this point, the &#x0201C;pick-up&#x0201D; action is repeated. After successful execution of the pick-up action, the subsequent task execution process follows the same pattern as in scene 1. The final action execution sequence involves picking up the apple (performed twice) &#x02192; navigating to the plate &#x02192; putting down the apple, as illustrated in <xref ref-type="fig" rid="F15">Figures 15b2</xref>&#x02013;<xref ref-type="fig" rid="F15">b7</xref>. In scene 3, the robot fails to perceive the apple, leading to the unmet condition for the &#x0201C;pick-up&#x0201D; action. After the knowledge reasoning in ARTProF, the robot infers the potential presence of the apple within the box. Consequently, it performs the &#x0201C;openingABox&#x0201D; operation. Once the apple is found, the subsequent actions are similar to those of scene 1, resulting in the final action sequence: navigating to the box &#x02192; opening the box &#x02192; picking up the apple &#x02192; navigating to the plate &#x02192; putting down the apple, as shown in <xref ref-type="fig" rid="F15">Figures 15c2</xref>&#x02013;<xref ref-type="fig" rid="F15">c9</xref>.</p>
<p>In scene 4, the apple is not present in the box. According to dynamic task planning, the robot infers the potential presence of the apple within the drawer. Since the drawer is not within the operating range of the manipulator, the robot first navigates to the drawer. The final action sequence of the robot is: navigating to the box &#x02192; opening the box &#x02192; navigating to the drawer &#x02192; opening the drawer &#x02192; picking up the apple &#x02192; navigating to the plate &#x02192; putting down the apple. The execution process is shown in <xref ref-type="fig" rid="F15">Figures 15d2</xref>&#x02013;<xref ref-type="fig" rid="F15">d12</xref>. Experimental details are demonstrated in the Supplementary video.<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> Experimental results show the effectiveness of the proposed ARTProF in improving the robot&#x00027;s adaptability within unstructured and dynamic environments.</p></sec>
</sec>
<sec sec-type="conclusions" id="s7">
<title>7 Conclusions</title>
<p>In this paper, we proposed an Ontology based Autonomous Robot Task Processing Framework (ARTProF) to improve the robot&#x00027;s adaptability within unstructured and dynamic environments. ARTProF includes key functionalities such as knowledge representation, knowledge reasoning, and task planning and control. The interface between the knowledge base and the neural network-based object detection algorithms augments the perception capabilities of the robots. To bridge the gap between the knowledge base and robot actions, the framework defines ROS based manipulation operators. ARTProF also introduces an operation similarity model, enabling the robot to generalize operations to novel objects effectively. A dynamic task planning method based on knowledge reasoning is further proposed for autonomous task planning. Experimental results showcase improvements of ARTProF in the robot&#x00027;s environmental perception, generalization abilities, and autonomous task execution within unstructured and dynamic environments. Ongoing research is focused on refining the ARTProF framework by integrating neurosymbolic inference.</p>
</sec>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding authors.</p></sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>YG: Writing &#x02013; original draft. SZ: Validation, Writing &#x02013; review &#x00026; editing. YC: Writing &#x02013; review &#x00026; editing. TL: Writing &#x02013; review &#x00026; editing. HW: Validation, Writing &#x02013; review &#x00026; editing. XH: Validation, Writing &#x02013; review &#x00026; editing. SW: Conceptualization, Funding acquisition, Methodology, Writing &#x02013; review &#x00026; editing.</p></sec>
</body>
<back>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This study was funded by the National Natural Science Foundation of China under Grants U23B2038 and 62273342.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/m9.figshare.25531045.v2">https://doi.org/10.6084/m9.figshare.25531045.v2</ext-link></p></fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Beetz</surname> <given-names>M.</given-names></name> <name><surname>Be&#x000DF;ler</surname> <given-names>D.</given-names></name> <name><surname>Haidu</surname> <given-names>A.</given-names></name> <name><surname>Pomarlan</surname> <given-names>M.</given-names></name> <name><surname>Bozcuo&#x0011F;lu</surname> <given-names>A. K.</given-names></name> <name><surname>Bartels</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Know rob 2.0&#x02014;a 2nd generation knowledge processing framework for cognition-enabled robotic agents,&#x0201D;</article-title> in <source>2018 IEEE International Conference on Robotics and Automation (ICRA)</source> (<publisher-loc>Brisbane, QLD</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>512</fpage>&#x02013;<lpage>519</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Beetz</surname> <given-names>M.</given-names></name> <name><surname>Kazhoyan</surname> <given-names>G.</given-names></name> <name><surname>Vernon</surname> <given-names>D.</given-names></name></person-group> (<year>2023</year>). <article-title>The CRAM cognitive architecture for robot manipulation in everyday activities</article-title>. <source>arXiv:2304.14119 [cs]</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2304.14119</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Beetz</surname> <given-names>M.</given-names></name> <name><surname>M&#x000F6;senlechner</surname> <given-names>L.</given-names></name> <name><surname>Tenorth</surname> <given-names>M.</given-names></name></person-group> (<year>2010</year>). <article-title>&#x0201C;CRAM&#x02014;a cognitive robot abstract machine for everyday manipulation in human environments,&#x0201D;</article-title> in <source>2010 IEEE/RSJ International Conference on Intelligent Robots and Systems</source> (<publisher-loc>Taipei</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1012</fpage>&#x02013;<lpage>1017</lpage>.</citation>
</ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Beetz</surname> <given-names>M.</given-names></name> <name><surname>Tenorth</surname> <given-names>M.</given-names></name> <name><surname>Winkler</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Open-ease,&#x0201D;</article-title> in <source>2015 IEEE International Conference on Robotics and Automation (ICRA)</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1983</fpage>&#x02013;<lpage>1990</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bharati</surname> <given-names>P.</given-names></name> <name><surname>Pramanik</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Deep learning techniques&#x02014;R-CNN to mask R-CNN: a survey,&#x0201D;</article-title> in <source>Computational Intelligence in Pattern Recognition, Vol. 999. Advances in Intelligent Systems and Computing</source>, eds. A. K. Das, J. Nayak, B. Naik, S. K. Pati, and D. Pelusi (Singapore: Springer Singapore), <fpage>657</fpage>&#x02013;<lpage>668</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Billings</surname> <given-names>G.</given-names></name> <name><surname>Johnson-Roberson</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>SilhoNet: an RGB method for 6D object pose estimation</article-title>. <source>IEEE Robot. Automat. Lett</source>. <volume>4</volume>, <fpage>3727</fpage>&#x02013;<lpage>3734</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1809.06893</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Fast R-CNN,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Conference on Computer Vision</source> (<publisher-loc>Santiago</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1440</fpage>&#x02013;<lpage>1448</lpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Haarslev</surname> <given-names>V.</given-names></name> <name><surname>M&#x000F6;ller</surname> <given-names>R.</given-names></name></person-group> (<year>2001</year>). <article-title>&#x0201C;Description of the RACER system and its applications,&#x0201D;</article-title> in <source>International Workshop on Description Logics (DL-2001)</source> (<publisher-loc>Stanford, CA</publisher-loc>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.researchgate.net/publication/220957323_Description_of_the_RACER_System_and_its_Applications">https://www.researchgate.net/publication/220957323_Description_of_the_RACER_System_and_its_Applications</ext-link></citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Gkioxari</surname> <given-names>G.</given-names></name> <name><surname>Doll&#x000E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Mask R-CNN,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Conference on Computer Vision</source> (<publisher-loc>Venice</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2961</fpage>&#x02013;<lpage>2969</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jiang</surname> <given-names>P.</given-names></name> <name><surname>Ergu</surname> <given-names>D.</given-names></name> <name><surname>Liu</surname> <given-names>F.</given-names></name> <name><surname>Cai</surname> <given-names>Y.</given-names></name> <name><surname>Ma</surname> <given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>A review of Yolo algorithm developments</article-title>. <source>Proc. Comput. Sci</source>. <volume>199</volume>, <fpage>1066</fpage>&#x02013;<lpage>1073</lpage>. <pub-id pub-id-type="doi">10.1016/j.procs.2022.01.135</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lemaignan</surname> <given-names>S.</given-names></name></person-group> (<year>2013</year>). <article-title>Grounding the interaction: knowledge management for interactive robots: dissertation abstract</article-title>. <source>K&#x000FC;nstliche Intelligenz</source> <volume>27</volume>, <fpage>183</fpage>&#x02013;<lpage>185</lpage>. <pub-id pub-id-type="doi">10.1007/s13218-013-0246-3</pub-id></citation>
</ref>
<ref id="B12">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lemaignan</surname> <given-names>S.</given-names></name> <name><surname>Ros</surname> <given-names>R.</given-names></name> <name><surname>M&#x000F6;senlechner</surname> <given-names>L.</given-names></name> <name><surname>Alami</surname> <given-names>R.</given-names></name> <name><surname>Beetz</surname> <given-names>M.</given-names></name></person-group> (<year>2010</year>). <article-title>&#x0201C;ORO, a knowledge management platform for cognitive architectures in robotics,&#x0201D;</article-title> in <source>2010 IEEE/RSJ International Conference on Intelligent Robots and Systems</source> (<publisher-loc>Taipei</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3548</fpage>&#x02013;<lpage>3553</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lenat</surname> <given-names>D. B.</given-names></name></person-group> (<year>1995</year>). <article-title>CYC: a large-scale investment in knowledge infrastructure</article-title>. <source>Commun. ACM</source> <volume>38</volume>, <fpage>33</fpage>&#x02013;<lpage>38</lpage>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Bilbao</surname> <given-names>S.</given-names></name> <name><surname>Mart&#x000ED;n-Wanton</surname> <given-names>T.</given-names></name> <name><surname>Bastos</surname> <given-names>J.</given-names></name> <name><surname>Rodriguez</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>SWARMs ontology: a common information model for the cooperation of underwater robots</article-title>. <source>Sensors</source> <volume>17</volume>:<fpage>569</fpage>. <pub-id pub-id-type="doi">10.3390/s17030569</pub-id><pub-id pub-id-type="pmid">28287468</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Anguelov</surname> <given-names>D.</given-names></name> <name><surname>Erhan</surname> <given-names>D.</given-names></name> <name><surname>Szegedy</surname> <given-names>C.</given-names></name> <name><surname>Reed</surname> <given-names>S.</given-names></name> <name><surname>Fu</surname> <given-names>C.-Y.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>&#x0201C;SSD: single shot multibox detector,&#x0201D;</article-title> in <source>Computer Vision&#x02014;ECCV 2016, vol. 9905</source>, eds. B. Leibe, J. Matas, N. Sebe, and M. Welling (Cham: Springer International Publishing), <fpage>21</fpage>&#x02013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B16">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Motik</surname> <given-names>B.</given-names></name> <name><surname>Patel-Schneider</surname> <given-names>P. F.</given-names></name> <name><surname>Parsia</surname> <given-names>B.</given-names></name> <name><surname>Bock</surname> <given-names>C.</given-names></name> <name><surname>Fokoue</surname> <given-names>A.</given-names></name> <name><surname>Haase</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>OWL 2 web ontology language: structural specification and functional-style syntax</article-title>. <source>W3C Recommend</source>. <volume>27</volume>:<fpage>159</fpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://www.w3.org/TR/owl2-syntax/">http://www.w3.org/TR/owl2-syntax/</ext-link></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Olivares-Alarcos</surname> <given-names>A.</given-names></name> <name><surname>Be&#x000DF;ler</surname> <given-names>D.</given-names></name> <name><surname>Khamis</surname> <given-names>A.</given-names></name> <name><surname>Goncalves</surname> <given-names>P.</given-names></name> <name><surname>Habib</surname> <given-names>M. K.</given-names></name> <name><surname>Bermejo-Alonso</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>A review and comparison of ontology-based approaches to robot autonomy</article-title>. <source>Knowl. Eng. Rev</source>. 34:e29. <pub-id pub-id-type="doi">10.1017/S0269888919000237</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Paulius</surname> <given-names>D.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>A survey of knowledge representation in service robotics</article-title>. <source>Robot. Autonom. Syst</source>. <volume>118</volume>, <fpage>13</fpage>&#x02013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1807.02192</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Peng</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>Q.</given-names></name> <name><surname>Zhou</surname> <given-names>X.</given-names></name> <name><surname>Bao</surname> <given-names>H.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;PVNet: Pixel-wise voting network for 6DoF pose estimation,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4561</fpage>&#x02013;<lpage>4570</lpage>.<pub-id pub-id-type="pmid">33360984</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Redmon</surname> <given-names>J.</given-names></name> <name><surname>Divvala</surname> <given-names>S.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Farhadi</surname> <given-names>A.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;You only look once: unified, real-time object detection,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>779</fpage>&#x02013;<lpage>788</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Redmon</surname> <given-names>J.</given-names></name> <name><surname>Farhadi</surname> <given-names>A.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;YOLO9000: better, faster, stronger,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Honolulu, HI</publisher-loc>), <fpage>7263</fpage>&#x02013;<lpage>7271</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Redmon</surname> <given-names>J.</given-names></name> <name><surname>Farhadi</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>YOLOv3: an incremental improvement</article-title>. <source>arXiv:1804.02767 [cs]</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1804.02767</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Faster R-CNN: towards real-time object detection with region proposal networks</article-title>. <source>Adv. Neural Inform. Process. Syst</source>. <volume>28</volume>:<fpage>1497</fpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1506.01497</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Saxena</surname> <given-names>A.</given-names></name> <name><surname>Jain</surname> <given-names>A.</given-names></name> <name><surname>Sener</surname> <given-names>O.</given-names></name> <name><surname>Jami</surname> <given-names>A.</given-names></name> <name><surname>Misra</surname> <given-names>D. K.</given-names></name> <name><surname>Koppula</surname> <given-names>H. S.</given-names></name></person-group> (<year>2014</year>). <article-title>RoboBrain: large-scale knowledge engine for robots</article-title>. <source>arXiv preprint arXiv:1412.0691</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1412.0691</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Shearer</surname> <given-names>R. D.</given-names></name> <name><surname>Motik</surname> <given-names>B.</given-names></name> <name><surname>Horrocks</surname> <given-names>I.</given-names></name></person-group> (<year>2008</year>). <article-title>&#x0201C;Hermit: a highly-efficient OWL reasoner,&#x0201D;</article-title> in <source>Proceedings of the 5th International Workshop on OWL: Experiences and Directions (OWLED 2008)</source> (<publisher-loc>Aachen</publisher-loc>: <ext-link ext-link-type="uri" xlink:href="https://Ceur-ws.org">Ceur-ws.org</ext-link>).</citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sirin</surname> <given-names>E.</given-names></name> <name><surname>Parsia</surname> <given-names>B.</given-names></name> <name><surname>Grau</surname> <given-names>B. C.</given-names></name> <name><surname>Kalyanpur</surname> <given-names>A.</given-names></name> <name><surname>Katz</surname> <given-names>Y.</given-names></name></person-group> (<year>2007</year>). <article-title>Pellet: a practical OWL-DL reasoner</article-title>. <source>J. Web Semant</source>. <volume>5</volume>, <fpage>51</fpage>&#x02013;<lpage>53</lpage>. <pub-id pub-id-type="doi">10.1016/j.websem.2007.03.004</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Suh</surname> <given-names>I. H.</given-names></name> <name><surname>Lim</surname> <given-names>G. H.</given-names></name> <name><surname>Hwang</surname> <given-names>W.</given-names></name> <name><surname>Suh</surname> <given-names>H.</given-names></name> <name><surname>Choi</surname> <given-names>J.-H.</given-names></name> <name><surname>Park</surname> <given-names>Y.-T.</given-names></name></person-group> (<year>2007</year>). <article-title>&#x0201C;Ontology-based multi-layered robot knowledge framework (OMRKF) for robot intelligence,&#x0201D;</article-title> in <source>2007 IEEE/RSJ International Conference on Intelligent Robots and Systems</source> (<publisher-loc>San Diego, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>429</fpage>&#x02013;<lpage>436</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tenorth</surname> <given-names>M.</given-names></name> <name><surname>Beetz</surname> <given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>KnowRob: a knowledge processing infrastructure for cognition-enabled robots</article-title>. <source>Int. J. Robot. Res</source>. <volume>32</volume>, <fpage>566</fpage>&#x02013;<lpage>590</lpage>. <pub-id pub-id-type="doi">10.1177/0278364913481635</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tenorth</surname> <given-names>M.</given-names></name> <name><surname>Beetz</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>Representations for robot knowledge in the KnowRob framework</article-title>. <source>Artif. Intell</source>. <volume>247</volume>, <fpage>151</fpage>&#x02013;<lpage>169</lpage>. <pub-id pub-id-type="doi">10.1016/j.artint.2015.05.010</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tenorth</surname> <given-names>M.</given-names></name> <name><surname>Kunze</surname> <given-names>L.</given-names></name> <name><surname>Jain</surname> <given-names>D.</given-names></name> <name><surname>Beetz</surname> <given-names>M.</given-names></name></person-group> (<year>2010</year>). <article-title>&#x0201C;Knowrob-map-knowledge-linked semantic object maps,&#x0201D;</article-title> in <source>2010 10th IEEE-RAS International Conference on Humanoid Robots</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>430</fpage>&#x02013;<lpage>435</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Vassiliadis</surname> <given-names>V.</given-names></name> <name><surname>Wielemaker</surname> <given-names>J.</given-names></name> <name><surname>Mungall</surname> <given-names>C.</given-names></name></person-group> (<year>2009</year>). <article-title>&#x0201C;Processing OWL2 ontologies using thea: an application of logic programming,&#x0201D;</article-title> in <source>Proceedings of the 6th International Workshop on OWL: Experiences and Directions (OWLED 2009)</source> (<publisher-loc>Chantilly, VA</publisher-loc>: <ext-link ext-link-type="uri" xlink:href="https://CEUR-WS.org">CEUR-WS.org</ext-link>).</citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Waibel</surname> <given-names>M.</given-names></name> <name><surname>Beetz</surname> <given-names>M.</given-names></name> <name><surname>Civera</surname> <given-names>J.</given-names></name> <name><surname>d&#x00027;Andrea</surname> <given-names>R.</given-names></name> <name><surname>Elfring</surname> <given-names>J.</given-names></name> <name><surname>Galvez-Lopez</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Roboearth</article-title>. <source>IEEE Robot. Automat. Mag</source>. <volume>18</volume>, <fpage>69</fpage>&#x02013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1109/MRA.2011.941632</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wielemaker</surname> <given-names>J.</given-names></name> <name><surname>Schreiber</surname> <given-names>A. T.</given-names></name> <name><surname>Wielinga</surname> <given-names>B. J.</given-names></name></person-group> (<year>2003</year>). <article-title>&#x0201C;Prolog-based infrastructure for RDF: performance and scalability,&#x0201D;</article-title> in <source>The Semantic Web-Proceedings ISWC&#x00027;03, Sanibel Island, Florida</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer Verlag</publisher-name>), <fpage>644</fpage>&#x02013;<lpage>658</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wielemaker</surname> <given-names>J.</given-names></name> <name><surname>Schrijvers</surname> <given-names>T.</given-names></name> <name><surname>Triska</surname> <given-names>M.</given-names></name> <name><surname>Lager</surname> <given-names>T.</given-names></name></person-group> (<year>2012</year>). <article-title>SWI-Prolog</article-title>. <source>Theory Pract. Log. Progr</source>. <volume>12</volume>, <fpage>67</fpage>&#x02013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1011.5332</pub-id><pub-id pub-id-type="pmid">28486579</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xiang</surname> <given-names>Y.</given-names></name> <name><surname>Schmidt</surname> <given-names>T.</given-names></name> <name><surname>Narayanan</surname> <given-names>V.</given-names></name> <name><surname>Fox</surname> <given-names>D.</given-names></name></person-group> (<year>2018</year>). <article-title>PoseCNN: a convolutional neural network for 6D object pose estimation in cluttered scenes</article-title>. <source>arXiv:1711.00199 [cs]</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1711.00199</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhai</surname> <given-names>S.</given-names></name> <name><surname>Shang</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Dong</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>DF-SSD: an improved SSD object detection algorithm based on DenseNet and feature fusion</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>24344</fpage>&#x02013;<lpage>24357</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.2971026</pub-id></citation>
</ref>
</ref-list>
</back>
</article>