<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1732004</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2026.1732004</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Osiris<sup>&#x2b;&#x2b;</sup>: hierarchical representations for robotic-enabled precision agriculture</article-title>
<alt-title alt-title-type="left-running-head">Mukuddem et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2026.1732004">10.3389/frobt.2026.1732004</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Mukuddem</surname>
<given-names>Adam</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3344313"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Speed-Andrews</surname>
<given-names>Adam</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3270657"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Maweni</surname>
<given-names>Thabisa</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Nanyaro</surname>
<given-names>Imannuel</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sojen</surname>
<given-names>Ritvik</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3273681"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hsiao</surname>
<given-names>Venny</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Amayo</surname>
<given-names>Paul</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3254933"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>African Robotics Unit, University of Cape Town</institution>, <city>Cape Town</city>, <country country="ZA">South Africa</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Adam Mukuddem, <email xlink:href="mailto:mkdada001@myuct.ac.za">mkdada001@myuct.ac.za</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-26">
<day>26</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>13</volume>
<elocation-id>1732004</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>08</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Mukuddem, Speed-Andrews, Maweni, Nanyaro, Sojen, Hsiao and Amayo.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Mukuddem, Speed-Andrews, Maweni, Nanyaro, Sojen, Hsiao and Amayo</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-26">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>There has been significant development in agricultural robotics over the past few years in the pursuit of optimising efficiency and addressing issues such as labour shortages and the need for humans to perform hazardous and arduous tasks. Despite this, human&#x2013;robot interaction in the agricultural sector remains largely unchanged, often requiring technical expertise, which hinders wide-scale adoption. This problem is particularly pronounced in the African context, where limited technical exposure and linguistic diversity pose significant barriers to the adoption of these technologies. While alternative means for human&#x2013;robot collaboration have been developed, these methods are currently limited to indoor structured environments. In this work, we introduce Osiris&#x2b;&#x2b;, a flexible approach designed to allow seamless communication between robots and humans on an array of precision agriculture tasks. We validate and evaluate the performance of Osiris&#x2b;&#x2b; in real-world agricultural environments, demonstrating that the system can create accurate and useful scene graphs that aid in solving the assigned tasks. This paves the way for natural language instructions, including those in African languages, to be issued to robots within the agricultural sector.</p>
</abstract>
<kwd-group>
<kwd>field robotics</kwd>
<kwd>human&#x2013;robot interaction</kwd>
<kwd>perception</kwd>
<kwd>precision agriculture</kwd>
<kwd>robotics</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work received funding from the Google.Org AI Collaborative on Food Security. The funder was not involved in the study design, collection, analysis, interpretation of data, the writing of this article, or the decision to submit it for publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="12"/>
<table-count count="0"/>
<equation-count count="10"/>
<ref-count count="38"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Field Robotics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>There has been sustained global interest in human&#x2013;robot collaboration within the robotics community, as this approach maximises the unique capabilities of both humans and robots while minimising the perceived threat of human displacement in the workplace (<xref ref-type="bibr" rid="B2">Ajoudani et al., 2018</xref>). Effective human&#x2013;robot interfaces remain an open problem, particularly in the agricultural sector, where many workers have minimal technological training (<xref ref-type="bibr" rid="B19">Lytridis et al., 2021</xref>) and, therefore, require user-friendly interfaces.</p>
<p>Conventional farming methodologies are characterised by high labour intensity and, in the face of increasing shortages of skilled labour and rising costs, may prove inadequate for achieving optimal efficiency and productivity in agricultural practices (<xref ref-type="bibr" rid="B4">Bai et al., 2023</xref>). Agricultural vehicle operators must function with considerable precision to reduce areas of omission and overlap. This task imposes significant demands on humans, who must constantly supervise the traversed paths and make real-time trajectory adjustments (<xref ref-type="bibr" rid="B4">Bai et al., 2023</xref>). Although autonomous agricultural robots and vehicles have the potential to perform such demanding tasks and mitigate the efficiency and productivity concerns associated with human labour (<xref ref-type="bibr" rid="B14">Duckett et al., 2018</xref>), the development of a comprehensive autonomous navigation system remains an unresolved challenge in the field of robotics. The integration of human&#x2013;robot collaboration, which uses the distinct capabilities of humans and robots while mitigating any perceived threats, might present a viable solution to this problem.</p>
<p>For human&#x2013;robot interfaces to be effective, they must, at a minimum, represent information that is understandable by both humans and robots over different scales. Hierarchical scene graphs have recently emerged as a powerful and human-understandable way of representing complex 3D environments. These describe environments through a layered or hierarchical graph where the nodes represent different spatial concepts (from low-level geometry to higher-level scene-scale reasoning) and the edges between them represent relationships.</p>
<p>We developed the notion of a scene graph for an agricultural outdoor environment in our prior work Osiris (<xref ref-type="bibr" rid="B22">Mukuddem and Amayo, 2024</xref>), which, while powerful, allowed only the construction of a scene graph in an open loop from a single robot. In this work, we specifically investigate the formulation of a scene graph for precision agricultural tasks such as robotic navigation, mapping, and interaction. We demonstrate that the flexibility of our approach allows the hierarchical scene graph to be applied to a variety of precision agricultural needs.</p>
<p>We summarise the main contributions in this study as follows:<list list-type="bullet">
<list-item>
<p>We modify the 3D scene graph structure of <italic>Osiris</italic> to more explicitly encode and allow precision agricultural tasks through the inclusion of a <italic>robot</italic> layer.</p>
</list-item>
<list-item>
<p>We demonstrate the utility of this flexible formulation across navigation, mapping, and interaction tasks.</p>
</list-item>
<list-item>
<p>We validate and evaluate the performance of this formulation using real-world data collected in agricultural settings.</p>
</list-item>
</list>
</p>
<p>The remainder of this article is organised as follows. Related work is discussed in <xref ref-type="sec" rid="s2">Section 2</xref>. An overview of the hierarchical graph formulation is presented in <xref ref-type="sec" rid="s3">Section 3</xref>. <xref ref-type="sec" rid="s4">Sections 4</xref>&#x2013;<xref ref-type="sec" rid="s6">6</xref> present the three variants of hierarchical graphs for precision agriculture: Osiris-Nav, Osiris-Map, and Neuro-Osiris. <xref ref-type="sec" rid="s7">Section 7</xref> presents the performance of each of these formulations on different tasks using real-world farm data and field experiments. Finally, conclusions are drawn in <xref ref-type="sec" rid="s8">Section 8</xref>.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2-1">
<label>2.1</label>
<title>Cooperative robotics in agriculture</title>
<p>There has been a growing interest in robotics research in the agricultural field in the pursuit of optimising efficiency for precision agriculture. Most agricultural robots are task-specific systems designed for tasks such as monitoring, spraying, harvesting, and transportation (<xref ref-type="bibr" rid="B19">Lytridis et al., 2021</xref>) with varying levels of autonomy. Cooperative robotics refers to the process in which humans and robots act as a team to achieve a goal and is highly dependent on effective exchange of information between robots and humans (<xref ref-type="bibr" rid="B7">Benos et al., 2023</xref>). Particularly in the agriculture sector, collaborative human&#x2013;robot systems have the potential to enhance productivity (<xref ref-type="bibr" rid="B36">V&#xe1;sconez and Auat Cheein, 2022</xref>) and improve the quality of services (<xref ref-type="bibr" rid="B7">Benos et al., 2023</xref>). However, the use of cooperative robotics in the agricultural sector is an emerging trend (<xref ref-type="bibr" rid="B19">Lytridis et al., 2021</xref>).</p>
<p>Conventional methods for human&#x2013;robot interaction in agriculture typically involve devices such as keyboards, mice, pens, and touch screens (<xref ref-type="bibr" rid="B19">Lytridis et al., 2021</xref>). These interfaces require detailed inputs, for example, providing GPS coordinates for tasks such as transportation. However, a significant drawback is that the users must possess certain technical skills, which are often lacking in Africa (<xref ref-type="bibr" rid="B19">Lytridis et al., 2021</xref>). To address these limitations, vision-based interfaces have been explored for human&#x2013;robot collaboration (<xref ref-type="bibr" rid="B21">Moysiadis et al., 2024</xref>; <xref ref-type="bibr" rid="B30">Tagarakis et al., 2021</xref>; <xref ref-type="bibr" rid="B1">Aivazidou and Tsolakis, 2023</xref>). <xref ref-type="bibr" rid="B21">Moysiadis et al. (2024)</xref> created a system that utilises vision technology to interpret various human gestures and translate them into corresponding robot actions, thereby facilitating coordination between humans and robots in farming settings. This system was successfully tested in an actual orchard setting; however, it faced challenges such as difficulty in consistently detecting the intended human gestures in the agricultural field and the need for meticulous obstacle-avoidance programming, as it relied on a complete GPS-enabled Robot Operating System (ROS) navigation stack.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>3D scene graphs</title>
<p>3D scene graphs have recently emerged as a powerful and human-understandable way of representing complex 3D environments (<xref ref-type="bibr" rid="B22">Mukuddem and Amayo, 2024</xref>). These represent environments through a layered or hierarchical graph, where nodes correspond to different spatial concepts (ranging from low-level geometry to higher-level scene-scale reasoning) and the edges between them represent relationships (<xref ref-type="bibr" rid="B16">Hughes et al., 2022</xref>). The choice of layers in the graph and the attributes of its nodes is based on the structure of the environment and is designed with consideration of task- and motion-planning queries (<xref ref-type="bibr" rid="B17">Hughes et al., 2023</xref>). Most work on 3D scene graphs has been developed for indoor scenes, but <xref ref-type="bibr" rid="B22">Mukuddem and Amayo (2024)</xref> created a 3D scene graph for an outdoor agricultural environment. <xref ref-type="bibr" rid="B9">Chang et al. (2023)</xref> developed a 3D scene graph system that takes inputs from multiple robots simultaneously to generate a single coherent 3D scene graph. In this work, we adapt the open-loop 3D scene graph structure developed by <xref ref-type="bibr" rid="B22">Mukuddem and Amayo (2024)</xref> to accommodate more specific precision agricultural tasks.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Osiris overview</title>
<p>This work demonstrates a flexible and adaptable hierarchical graph system that is responsive to different sensor modalities and precision agricultural tasks performed by mobile robots. In particular, we introduce three variants encapsulated within this framework.<list list-type="bullet">
<list-item>
<p>
<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mi mathvariant="bold">i</mml:mi>
<mml:mi mathvariant="bold">r</mml:mi>
<mml:mi mathvariant="bold">i</mml:mi>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">N</mml:mi>
<mml:mi mathvariant="bold">a</mml:mi>
<mml:mi mathvariant="bold">v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: A hierarchical graph system that enables real-time multi-robot coordination and creation of a hierarchical scene graph from visual input.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mi mathvariant="bold">i</mml:mi>
<mml:mi mathvariant="bold">r</mml:mi>
<mml:mi mathvariant="bold">i</mml:mi>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">M</mml:mi>
<mml:mi mathvariant="bold">a</mml:mi>
<mml:mi mathvariant="bold">p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: A hierarchical graph system that enables high-precision mapping of agricultural areas using LiDAR and inertial sensors.</p>
</list-item>
<list-item>
<p>
<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="bold">N</mml:mi>
<mml:mi mathvariant="bold">e</mml:mi>
<mml:mi mathvariant="bold">u</mml:mi>
<mml:mi mathvariant="bold">r</mml:mi>
<mml:mi mathvariant="bold">o</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">O</mml:mi>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mi mathvariant="bold">i</mml:mi>
<mml:mi mathvariant="bold">r</mml:mi>
<mml:mi mathvariant="bold">i</mml:mi>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>: A hierarchical graph folded into a neural field for high-resolution interaction with agricultural areas using visual input.</p>
</list-item>
</list>
</p>
<p>Although the specific construction details of each variant may differ, they share a common construction flow, as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The following section details the construction of the common hierarchical graph, breaking it down layer by layer.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Process flow.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g001.tif">
<alt-text content-type="machine-generated">Diagram showing a five-stage data processing pipeline with labeled boxes: Sensor Data (Cameras, LiDAR, Depth); Robotic Perception (Instance Segmentation, Ego-Motion Estimation, Robot Layer); 3D Individuation (Plant/Fruit Detection); Scene-Level Reconstruction (Planting Line Detection, Row Detection); and Language Encoding (Planting Line Detection), connected by arrows.</alt-text>
</graphic>
</fig>
<p>The layers of <italic>Osiris</italic>
<sup>
<italic>&#x2b;&#x2b;</italic>
</sup> include farm sections, rows, planting lines, plants, <italic>robots</italic>, and a 3D perception layer. <italic>Layer 1</italic> is a perception layer that captures the environment, and <italic>Layer 2</italic> is a topo-metric graph of the robot and the selected sensor data for precision robotic tasks. <italic>Layer 3</italic> is a subgraph of the plant objects detected within the farm. <italic>Layer 4</italic> is a subgraph of the planting lines. Planting lines represent the lines in which the crops are planted. <italic>Layer 5</italic> is a subgraph of the rows within the farm. A row is defined as a robot-navigable area located between two planting lines; in this way, two planting lines form the lower layer of a specific row. <italic>Layer 6</italic> is a subgraph representing the sections within the farm. Sections are areas of the farm in which the crops are of a single type, as a farm can contain multiple crop types.</p>
<p>Formally defined, the hierarchical graph obtained through <italic>Osiris</italic>
<sup>
<italic>&#x2b;&#x2b;</italic>
</sup> is <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">V,E</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the set of nodes and <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi mathvariant="script">E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the set of edges between nodes <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The set of nodes <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can be divided into the six layers of the graph as follows: <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x222a;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
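<p>As an illustration only, a minimal Python sketch of one possible in-memory representation of this layered node set and edge set is shown below; all class and field names are our own and are not part of the published implementation.</p>
<code language="python">from dataclasses import dataclass, field

@dataclass
class Node:
    layer: int            # 1 = perception, 2 = robot, ..., 6 = section
    node_id: int
    position: tuple       # (x, y, z) centroid in the map frame
    attributes: dict = field(default_factory=dict)

@dataclass
class SceneGraph:
    nodes: dict = field(default_factory=dict)   # node_id -> Node
    edges: set = field(default_factory=set)     # undirected {(a, b)} pairs

    def add_node(self, node):
        self.nodes[node.node_id] = node

    def add_edge(self, a, b):
        self.edges.add((min(a, b), max(a, b)))

    def layer(self, i):
        """Return the nodes belonging to layer i of the hierarchy."""
        return [n for n in self.nodes.values() if n.layer == i]</code>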
<sec id="s3-1">
<label>3.1</label>
<title>Perception layer</title>
<p>
<italic>Layer 1</italic> of the <italic>Osiris</italic>
<sup>
<italic>&#x2b;&#x2b;</italic>
</sup> scene graph is a 3D perception layer that captures the environment at the resolution required for the particular navigation task. This layer is intentionally designed to be flexible and can be represented using a mesh, point cloud, or neural point cloud. This flexibility allows computational cost to be traded off against accuracy when high-fidelity perception is not required. An example perception layer is detailed for each precision agricultural task in its corresponding section.</p>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Robot layer</title>
<p>The robot&#x2019;s movement within the agricultural environment is encapsulated in the robot layer, <italic>Layer 2</italic>. This layer is generated while the robot traverses the agricultural environment and contains a spatio-temporal graph. This layer is characterised as an undirected graph consisting of the pose and spatial vertices <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and their temporal and spatial edges.</p>
<p>Pose vertices correspond to the initial position of the robot and to its ego-motion estimates, which are regularly sub-sampled to create keyframes. Formally, each pose vertex is defined as <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is an incremented index as a new keyframe is observed and <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a unique identification number given to each robot to facilitate further localisation tasks. Temporal edges are created between vertices, representing <bold>SE</bold>(3) transformations derived from odometry data, and they closely resemble the classical pose graphs commonly utilised in <bold>SLAM</bold>.</p>
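<p>The keyframe sub-sampling that generates pose vertices can be sketched as follows, reusing the SceneGraph and Node classes above. The translation and rotation thresholds are illustrative placeholders rather than the values used by the system, and the image and descriptor payloads of a full pose vertex are omitted.</p>
<code language="python">import numpy as np

def maybe_add_pose_vertex(graph, robot_id, pose, last_kf,
                          t_thresh=0.5, r_thresh=0.26):
    """Sub-sample ego-motion into keyframe pose vertices (single-robot sketch).

    pose and last_kf are 4x4 homogeneous SE(3) matrices; the thresholds
    (metres, radians) are illustrative only. Returns the pose of the
    keyframe currently in force.
    """
    rel = np.linalg.inv(last_kf) @ pose                  # relative SE(3) motion
    dist = np.linalg.norm(rel[:3, 3])
    ang = np.arccos(np.clip((np.trace(rel[:3, :3]) - 1.0) / 2.0, -1.0, 1.0))
    if dist &lt; t_thresh and ang &lt; r_thresh:
        return last_kf                                   # motion too small
    vid = len(graph.nodes)                               # sequential ids assumed
    graph.add_node(Node(layer=2, node_id=vid,
                        position=tuple(pose[:3, 3]),
                        attributes={"robot": robot_id, "R": pose[:3, :3]}))
    if vid:
        graph.add_edge(vid - 1, vid)                     # temporal edge
    return pose</code>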
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Object individuation layer</title>
<p>An object detection algorithm is used in <italic>Layer 1</italic> to obtain the objects in <italic>Layer 3</italic>. Objects are detected and updated upon the addition of a new keyframe. The specific implementation of this layer varies by task.</p>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Planting line layer</title>
<p>The construction of the first three layers of the scene graph only considers the information introduced in each keyframe. However, as proposed by <xref ref-type="bibr" rid="B22">Mukuddem and Amayo (2024)</xref>, the detection of planting lines considers the entirety of the objects in the scene. This is the first task in the <italic>Osiris</italic>
<sup>
<italic>&#x2b;&#x2b;</italic>
</sup> pipeline that reasons at the entire scene level, rather than solely at the keyframe level. This is important as the system is required to extract the underlying structure of the row-crop systems. Planting lines are, therefore, generated from the object nodes <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> belonging to <italic>Layer 3</italic>.</p>
<p>A two-dimensional (2D) bird&#x2019;s-eye view of plant objects is obtained as a preprocessing step to better understand the structure of row-system crops. This top&#x2013;down perspective not only reveals the underlying structure of the crop rows but also simplifies the detection of planting lines. By reducing the problem to a 2D multi-line estimation task, the computational complexity is significantly decreased. A reduced set, <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, is created, where <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of plant objects, <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the coordinates of the plant objects found in the original set <inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
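<p>A minimal sketch of this preprocessing step is given below; it assumes a gravity-aligned map frame, so the bird&#x2019;s-eye view is obtained by simply dropping the height coordinate of each <italic>Layer 3</italic> plant node.</p>
<code language="python">import numpy as np

def birds_eye_points(graph):
    """Build the reduced 2D point set P from the Layer-3 plant nodes."""
    plants = graph.layer(3)
    return np.array([[n.position[0], n.position[1]] for n in plants])</code>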
<p>Planting line detection is reduced to the fitting of multiple geometric line models to the set of plants <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The convex relaxation algorithm (CORAL) (<xref ref-type="bibr" rid="B3">Amayo et al., 2018</xref>) can be used for this. CORAL is a method to fit multiple geometric models to multi-structured data via convex relaxation (<xref ref-type="bibr" rid="B3">Amayo et al., 2018</xref>). The result of CORAL is a set of labels <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, which assigns each plant object in the set <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to a geometric line model <inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf25">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the 2D equation of a line and <inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of labels.</p>
<p>CORAL approaches the multi-model estimation as an energy minimisation problem. In this way, an energy functional that aims for a solution that is geometrically accurate, spatially smooth, and compact is created, as shown below:<disp-formula id="e1">
<mml:math id="m27">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mo>&#x222b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mo>&#x222b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The initial term of this energy functional represents the geometric cost of assigning a plant object to a particular line model. Here, <inline-formula id="inf27">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the label currently assigned to the object. The residual function <inline-formula id="inf28">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> gives the Euclidean distance between the plant object&#x2019;s position and the geometric line model <inline-formula id="inf29">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. It, therefore, penalises label assignments that poorly correspond to the underlying data.</p>
<p>The second term of the energy functional represents a smoothness cost. We expect that in rowed crops, plants that are physically close together have a high probability of belonging to the same planting line. The function <inline-formula id="inf30">
<mml:math id="m31">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is formulated to penalise neighbouring plant objects that do not share the same label. This is calculated using <inline-formula id="inf31">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which calculates the gradient of the current label assignment between neighbouring plant objects. <inline-formula id="inf32">
<mml:math id="m33">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> trades off the smoothness cost against the geometric cost, while the weighting function <inline-formula id="inf33">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> allows for finer control of the influence of neighbouring points; in this case, weighting is reduced as the distance to a plant&#x2019;s neighbour increases. Here, <inline-formula id="inf34">
<mml:math id="m35">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of neighbours that every plant object will have.</p>
<p>The final term promotes compactness by favouring label assignments that explain the data using as few planting line models as possible. This reduces redundant models and ensures that the solution more closely resembles the underlying structure. The value <inline-formula id="inf35">
<mml:math id="m36">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> trades off compactness against smoothness and geometric cost.</p>
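<p>For concreteness, a discrete form of this energy can be evaluated for a candidate labelling as sketched below. The sketch only scores a labelling against <xref ref-type="disp-formula" rid="e1">Equation 1</xref>; it does not implement the convex-relaxation solver itself, and the neighbour weights are assumed to be precomputed so that they decrease with distance.</p>
<code language="python">def line_residual(p, model):
    """Distance from point p to the line ax + by + c = 0 with a^2 + b^2 = 1."""
    a, b, c = model
    return abs(a * p[0] + b * p[1] + c)

def coral_energy(P, labels, models, neighbours, lam=1.0, beta=1.0):
    """Evaluate the discrete form of Equation 1 for one labelling.

    P: sequence of 2D plant positions; labels: model index per plant;
    models: normalised line parameters (a, b, c); neighbours: iterable of
    (i, j, w) pairs whose weights w decrease with distance. The paper
    minimises this energy via convex relaxation rather than enumeration.
    """
    geometric = sum(line_residual(P[i], models[labels[i]])
                    for i in range(len(P)))
    smoothness = sum(w for i, j, w in neighbours if labels[i] != labels[j])
    compactness = beta * len(set(labels))
    return geometric + lam * smoothness + compactness</code>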
<p>The optimal label assignment <inline-formula id="inf36">
<mml:math id="m37">
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can be obtained through the minimisation of <xref ref-type="disp-formula" rid="e1">Equation 1</xref>. CORAL (<xref ref-type="bibr" rid="B3">Amayo et al., 2018</xref>) can simultaneously minimise the geometric, smoothness, and compactness terms. CORAL is defined in <xref ref-type="disp-formula" rid="e2">Equations 2</xref>, <xref ref-type="disp-formula" rid="e3">3</xref>:<disp-formula id="e2">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x222b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mo>&#x222b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>&#x2207;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m39">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>.</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>.</mml:mo>
<mml:mspace width="1em"/>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2264;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf37">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>:</mml:mo>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is known as the dual function of <inline-formula id="inf38">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">u</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and both <inline-formula id="inf39">
<mml:math id="m42">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf40">
<mml:math id="m43">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are dual norms. The optimal values of <inline-formula id="inf41">
<mml:math id="m44">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf42">
<mml:math id="m45">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a8;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are obtained through first-order primal-dual optimisation.</p>
<p>CORAL presents a solution that is geometrically sound, spatially smooth, and free of redundant labels. Additionally, due to its highly parallel nature, CORAL&#x2019;s time performance does not degrade significantly even as the number of plant objects within the scene <inline-formula id="inf43">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> increases. This allows it to reason over large scenes without incurring a large time penalty (<xref ref-type="bibr" rid="B22">Mukuddem and Amayo, 2024</xref>).</p>
<p>Planting lines are, therefore, detected through CORAL on all plant objects within the scene after every keyframe has been processed. The corresponding labelling <inline-formula id="inf44">
<mml:math id="m47">
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> yields edges between the plant object nodes <inline-formula id="inf45">
<mml:math id="m48">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and the line models <inline-formula id="inf46">
<mml:math id="m49">
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, which create line model nodes. Line model nodes <inline-formula id="inf47">
<mml:math id="m50">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are, thus, added to the set of nodes <inline-formula id="inf48">
<mml:math id="m51">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and added to the scene graph <inline-formula id="inf49">
<mml:math id="m52">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. To obtain <inline-formula id="inf50">
<mml:math id="m53">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, the mean of all the positions of plant objects&#x2019; nodes assigned to the corresponding line model is used.</p>
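<p>This averaging step can be sketched in a few lines, assuming the CORAL labelling is available as a per-plant array of model indices:</p>
<code language="python">import numpy as np

def line_node_position(plant_positions, labels, line_label):
    """Centroid (x, y, z) of the plant nodes assigned to one line model."""
    pts = np.asarray([p for p, l in zip(plant_positions, labels)
                      if l == line_label])
    return pts.mean(axis=0)</code>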
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Row layer</title>
<p>Within this framework, rows are defined as traversable sections that provide access to planting lines and, consequently, the planted objects. Rows, therefore, occur between planting lines, and the geometric line models <inline-formula id="inf51">
<mml:math id="m54">
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> obtained while extracting the plant line nodes in <italic>Layer 4</italic> can be used for the row detection.</p>
<p>Given a pair of line models, <inline-formula id="inf52">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf53">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, a gradient check is first performed to ensure that they are within a tolerance threshold <inline-formula id="inf54">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of being parallel. If the gradient check passes, the distance between the adjacent planting lines is calculated. A row node, <inline-formula id="inf55">
<mml:math id="m58">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, is then added between two planting lines if the calculated distance falls within a specified range, which is determined by the minimum and maximum tolerance values that are informed by the physical structure of the farm. Edges between this node <inline-formula id="inf56">
<mml:math id="m59">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and the two plant line nodes <inline-formula id="inf57">
<mml:math id="m60">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are also added to the graph. To obtain <inline-formula id="inf58">
<mml:math id="m61">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, the mean of all the positions of plant line nodes that are assigned to the row node is used.</p>
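<p>Assuming, for illustration, that each line model is parameterised in slope&#x2013;intercept form, the pairwise row test can be sketched as follows; the tolerance values shown are placeholders for the farm-specific thresholds described above.</p>
<code language="python">import numpy as np

def detect_rows(models, t_par=0.1, d_min=0.5, d_max=2.0):
    """Propose row nodes between near-parallel, suitably spaced lines.

    models: planting-line parameters (m, c) in slope-intercept form
    (an assumed parameterisation); t_par, d_min, and d_max stand in for
    the farm-specific tolerances described in the text.
    """
    rows = []
    for i in range(len(models)):
        for j in range(i + 1, len(models)):
            m1, c1 = models[i]
            m2, c2 = models[j]
            if abs(m1 - m2) > t_par:        # gradient (parallelism) check
                continue
            m = 0.5 * (m1 + m2)             # mean gradient of the pair
            d = abs(c2 - c1) / np.sqrt(1.0 + m * m)   # line separation
            if d_min &lt;= d &lt;= d_max:     # spacing consistent with a row
                rows.append((i, j))
    return rows</code>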
<p>It is important to note that in this way, a plant line node <inline-formula id="inf59">
<mml:math id="m62">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> can belong to more than one row node <inline-formula id="inf60">
<mml:math id="m63">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. This differs from most definitions of hierarchical graphs, where current approaches typically ensure that each node at a lower layer has only a single edge to a node in the layer above it (<xref ref-type="bibr" rid="B29">Strader et al., 2024</xref>; <xref ref-type="bibr" rid="B16">Hughes et al., 2022</xref>; <xref ref-type="bibr" rid="B17">Hughes et al., 2023</xref>; <xref ref-type="bibr" rid="B27">Rosinol et al., 2021</xref>). Enforcing this would create a level of duplication when dealing with shared spatial concepts, which does not faithfully represent the underlying structure. By retaining shared spatial concepts, in a manner similar to S-graphs (<xref ref-type="bibr" rid="B6">Bavle et al., 2022</xref>; <xref ref-type="bibr" rid="B5">Bavle et al., 2023</xref>), where &#x2018;walls&#x2019; can be shared by &#x2018;rooms,&#x2019; <italic>Osiris</italic> can maintain a high-fidelity representation of the underlying environment.</p>
</sec>
<sec id="s3-6">
<label>3.6</label>
<title>Section layer</title>
<p>The scene graph consolidator then creates the final layer by consolidating the row nodes <inline-formula id="inf61">
<mml:math id="m64">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> into a section node <inline-formula id="inf62">
<mml:math id="m65">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> that is added to the set of nodes <inline-formula id="inf63">
<mml:math id="m66">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> corresponding to the section layer and the scene graph <inline-formula id="inf64">
<mml:math id="m67">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Edges between this node and the row nodes are also added to the graph.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Osiris-Nav</title>
<p>This section describes the concept and construction of a 3D scene graph that not only enables the integration of multiple sessions and multiple robot inputs but is also primed for robot navigation. Vision-based navigation remains a highly favourable option for mobile robots and has been widely used for autonomy in various fields. This, coupled with the relatively low cost of cameras compared to other sensors, makes cameras a prime candidate for precision agricultural tasks. Osiris-Nav extensively uses image-based techniques to construct a 3D scene graph that advances towards this goal.</p>
<p>As mentioned in <xref ref-type="sec" rid="s3">Section 3</xref>, each variant of <inline-formula id="inf65">
<mml:math id="m68">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>Osiris</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> only modifies the three base layers; following object detection, all subsequent layers of the hierarchical graph remain the same.</p>
<sec id="s4-1">
<label>4.1</label>
<title>Perception layer</title>
<p>Building upon this, the perception layer, namely, <italic>Layer 1</italic>, of Osiris-Nav is developed in real-time from an image data stream, which is subsequently transformed into a 3D metric-semantic mesh. A keyframe-based construction approach was utilised to enhance system performance and ensure the usability of the resulting system in robotic applications. From the image data stream, ORB-SLAM3 (<xref ref-type="bibr" rid="B8">Campos et al., 2021</xref>) is used to estimate the ego-motion of the camera. Upon detecting a significant alteration in motion, a new keyframe is introduced to incorporate the updated scene information. Following keyframe extraction, depth estimation using a depth network and semantic labelling are performed.</p>
<p>A two-stage approach was employed for instance segmentation. It consists of prompt-guided bounding-box detection, followed by high-quality semantic segmentation within the bounding boxes. Agricultural environments contain a wide range of objects, some of which are directly relevant to agricultural tasks. To effectively handle this diversity, Osiris-Nav leverages a prompt-guided approach. This approach facilitates generalisation, allowing the system to accurately represent rowed crops in a semantic mesh. Furthermore, it ensures that the graph representation is selective, including only objects relevant to agricultural tasks while filtering out irrelevant elements, thereby maintaining a focused and efficient representation of the agricultural environment (<xref ref-type="bibr" rid="B22">Mukuddem and Amayo, 2024</xref>).</p>
<p>Grounding DINO (<xref ref-type="bibr" rid="B18">Liu et al., 2023</xref>) was used to obtain bounding boxes using keywords such as &#x2018;fruits,&#x2019; &#x2018;plants,&#x2019; &#x2018;bushes,&#x2019; and &#x2018;paths.&#x2019; These bounding boxes were fed into &#x201c;Towards Real-Time Segment Anything&#x201d; (<xref ref-type="bibr" rid="B37">Wang et al., 2023</xref>) for semantic segmentation of stereo images. The resulting semantic images and depth images were combined to create a semantically labelled point cloud. This point cloud is then used with Voxblox (<xref ref-type="bibr" rid="B24">Oleynikova et al., 2017</xref>) to generate a truncated signed distance field (TSDF) and a Euclidean signed distance field (ESDF) using data within a certain radius of the robot. The 3D metric-semantic mesh is extracted using Voxblox&#x2019;s marching cubes implementation.</p>
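<p>To make the final fusion step concrete, the following minimal sketch back-projects a keyframe depth image and its semantic label image into the labelled point cloud that is handed to Voxblox. The function name, pinhole camera model, and array layout are illustrative assumptions rather than the actual interfaces of the system.</p>
<preformat preformat-type="code">
import numpy as np

def semantic_point_cloud(depth, labels, fx, fy, cx, cy):
    """Back-project a depth image into a semantically labelled point cloud.

    depth  : (H, W) metric depth from the depth network
    labels : (H, W) integer semantic IDs from the segmentation stage
    fx, fy, cx, cy : pinhole intrinsics of the keyframe camera
    """
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    z = depth
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy
    valid = z > 0                       # discard pixels without depth
    points = np.stack([x[valid], y[valid], z[valid]], axis=-1)
    return points, labels[valid]
</preformat>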
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Robot layer</title>
<p>The robot&#x2019;s movement within the agricultural environment is encapsulated in the robot layer, namely, <italic>Layer 2</italic>. This layer is generated while the robot traverses the agricultural environment and contains a spatio-temporal graph. In Osiris-Nav, the pose vertices in this layer correspond to the keyframe positions obtained through ORB-SLAM3 and store the associated image and bag-of-words descriptors from each keyframe. The addition of the image and descriptors is critical for enabling localisation, subsequent aggregation, and autonomy.</p>
<p>Similar to Kimera-Multi (<xref ref-type="bibr" rid="B34">Tian et al., 2022</xref>), Osiris-Nav augments the pose graph with a sub-sampling of the semantic mesh in <italic>Layer 1</italic>, allowing joint optimisation of both the pose graph and the underlying mesh. Joint optimisation both counteracts odometry drift, which accumulates from inaccuracies in visual odometry over long distances, and enables the incorporation of multiple robot scene graphs into a single coherent scene graph.</p>
<p>The sub-sampled mesh is then simplified using a vertex clustering method that stores these vertices in an octree and then merges vertices belonging to the same voxel as the map grows. These merged vertices thus become the spatial vertices of the robot layer and are immediately connected to the pose vertex of their associated keyframe through a spatial edge. Each of these spatial vertices is defined as <inline-formula id="inf66">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf67">
<mml:math id="m70">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is initialised to the identity, and <inline-formula id="inf68">
<mml:math id="m71">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is the 3D position of the merged mesh vertices.</p>
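<p>The simplification step can be sketched as follows, with a flat voxel hash standing in for the octree used in practice and a one-shot batch formulation replacing the incremental merging that occurs as the map grows; both are simplifications for illustration.</p>
<preformat preformat-type="code">
import numpy as np

def cluster_mesh_vertices(vertices, voxel_size):
    """Merge mesh vertices that share a voxel into spatial vertices.

    Each surviving centroid becomes a spatial vertex S_k of the robot
    layer. A flat voxel hash replaces the octree for brevity, and the
    whole mesh is processed in one batch rather than incrementally.
    """
    keys = np.floor(vertices / voxel_size).astype(np.int64)
    merged = {}
    for key, vert in zip(map(tuple, keys), vertices):
        merged.setdefault(key, []).append(vert)
    return np.array([np.mean(vs, axis=0) for vs in merged.values()])
</preformat>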
<sec id="s4-2-1">
<label>4.2.1</label>
<title>Robot atlas</title>
<p>The atlas integrates a virtually unlimited number of scene graphs generated from multiple robots and sessions. The atlas then establishes connections between scene graphs that observe the same location. This capability is enabled by a database of DBoW descriptors within the atlas, populated from the robot layers of each scene graph, thus facilitating place recognition across the entire collection of open-loop scene graphs.</p>
<p>This centralised place recognition database, which is continuously updated with new robot layer pose vertices, enables a single consolidated scene graph to be created from the collection of open-loop scene graphs through factor graph optimisation using GTSAM (<xref ref-type="bibr" rid="B10">Dellaert, 2012</xref>). This is achieved by adding edges between robot layers that observe common regions, identified through DBoW descriptor matching, to create the following objective function for minimisation.<disp-formula id="e4">
<mml:math id="m72">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>arg min</mml:mi>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>Robot&#x2009;Layers</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>&#x2b;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:munder>
<mml:mrow>
<mml:munder accentunder="false">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x23df;</mml:mo>
</mml:munder>
</mml:mrow>
<mml:mrow>
<mml:mtext>Scene&#x2009;Graph&#x2009;Atlas</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Optimisation of the objective function yields the target set of optimised poses <inline-formula id="inf69">
<mml:math id="m73">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">X</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. R is the residual function between two poses, <inline-formula id="inf70">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of poses in a particular scene graph, and <inline-formula id="inf71">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the total number of scene graphs to be aggregated. <inline-formula id="inf72">
<mml:math id="m76">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf73">
<mml:math id="m77">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> refer to different scene graphs over which the scene graph atlas detects a common region.</p>
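<p>A minimal sketch of the factor graph behind <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, written with the GTSAM Python bindings, is shown below: two robot layers are chained with odometry factors, and a single inter-graph factor stands in for a DBoW place-recognition match. The key scheme, noise magnitudes, and relative poses are illustrative assumptions rather than the settings used in the system.</p>
<preformat preformat-type="code">
import gtsam
import numpy as np

# Two robot layers (scene graphs) with sequential odometry factors and
# one inter-graph factor standing in for a DBoW place-recognition match.
graph = gtsam.NonlinearFactorGraph()
values = gtsam.Values()
noise = gtsam.noiseModel.Diagonal.Sigmas(np.array([0.1] * 6))

def key(n, i):
    # Pose i of scene graph n; 'a', 'b', ... index the graphs.
    return gtsam.symbol(chr(ord('a') + n), i)

# Intra-graph residuals R(X_i^n, X_{i+1}^n): odometry between poses.
for n in range(2):
    values.insert(key(n, 0), gtsam.Pose3())
    for i in range(4):
        odom = gtsam.Pose3(gtsam.Rot3(), np.array([1.0, 0.0, 0.0]))
        graph.add(gtsam.BetweenFactorPose3(key(n, i), key(n, i + 1), odom, noise))
        values.insert(key(n, i + 1), gtsam.Pose3())

# Anchor the first graph so the problem is well-posed.
prior = gtsam.noiseModel.Diagonal.Sigmas(np.array([1e-6] * 6))
graph.add(gtsam.PriorFactorPose3(key(0, 0), gtsam.Pose3(), prior))

# Inter-graph residual R(X_i^alpha, X_j^beta) from a descriptor match.
graph.add(gtsam.BetweenFactorPose3(key(0, 2), key(1, 2), gtsam.Pose3(), noise))

result = gtsam.LevenbergMarquardtOptimizer(graph, values).optimize()
</preformat>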
<p>Similar to Kimera-Multi (<xref ref-type="bibr" rid="B34">Tian et al., 2022</xref>), once the target poses are obtained, local optimisation is performed to deform the underlying mesh, ensuring accurate alignment with the trajectory defined by the target poses. We direct the reader to <xref ref-type="bibr" rid="B34">Tian et al. (2022)</xref> for further technical details on implementation.</p>
</sec>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>3D object detection</title>
<p>An object-detection algorithm is applied to the 3D metric-semantic mesh generated in <italic>Layer 1</italic> to obtain the objects in <italic>Layer 3</italic>. Objects are detected through Euclidean clustering (<xref ref-type="bibr" rid="B28">Rusu and Cousins, 2011</xref>) of mesh vertices that were updated on the addition of the new keyframe. The Euclidean clustering algorithm groups together vertices that are within a specified distance threshold, effectively identifying individual objects within the scene. A key parameter of Euclidean clustering is <italic>cluster tolerance</italic>, the maximum distance at which two vertices are grouped into the same cluster: vertices are joined together if they are less than <italic>cluster tolerance</italic> apart. The optimal value for the <italic>cluster tolerance</italic> parameter is determined empirically through experiments conducted on the robot within its operational environment.</p>
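<p>A minimal sketch of this clustering step is given below, using a k-d tree and union-find in place of the PCL implementation cited above; the <italic>min_size</italic> threshold is an illustrative assumption.</p>
<preformat preformat-type="code">
import numpy as np
from scipy.spatial import cKDTree

def euclidean_cluster(points, cluster_tolerance, min_size=10):
    """Single-linkage Euclidean clustering of 3D vertices or points.

    Two entries end up in the same cluster if a chain of neighbours,
    each closer than cluster_tolerance (metres), connects them.
    """
    tree = cKDTree(points)
    pairs = tree.query_pairs(r=cluster_tolerance)
    parent = np.arange(len(points))   # union-find over the neighbour graph
    def find(i):
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i
    for i, j in pairs:
        parent[find(i)] = find(j)
    roots = np.array([find(i) for i in range(len(points))])
    clusters = [np.where(roots == r)[0] for r in np.unique(roots)]
    return [c for c in clusters if len(c) >= min_size]
</preformat>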
<p>Euclidean clustering is further restricted to semantic classes corresponding to agricultural produce as defined during the construction of <italic>Layer 1</italic>. This ensures that only plant-related objects are identified and propagated through the scene graph. The clustering process yields a set of vertex clusters, each of which is associated with a bounding box and its centroid. For each object obtained, the centroid position <inline-formula id="inf74">
<mml:math id="m78">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is used to create the object node and is added to the set of nodes <inline-formula id="inf75">
<mml:math id="m79">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> representing the object layer. At the same time, an edge is added between the centroid of the bounding box and the clustered vertices of the detected object. Finally, when objects are detected, edges are added between the object centroids and the pose vertices of the robot layer.</p>
<p>Any updates to the pose vertices through optimisation of the scene atlas will trigger a corresponding update to the positions of the object nodes to match the pose vertices. If this optimisation results in overlapping object nodes associated with the same semantic label, the nodes are reconciled into a single node representing their union, ensuring that agricultural entities are not duplicated in the graph.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Osiris-Map</title>
<p>This section describes the concept and construction of a 3D scene graph that preserves high-precision environmental information while providing users with a human-understandable representation. Osiris-Map leverages a combination of LiDAR and inertial sensors to create its perception and robot layers. The accuracy of LiDAR significantly exceeds that of visual sensors, particularly in outdoor environments, making it a natural choice for high-precision mapping, especially as the agricultural environment expands.</p>
<sec id="s5-1">
<label>5.1</label>
<title>Perception layer</title>
<p>The perception layer, namely, <italic>Layer 1</italic>, of Osiris-Map is developed in real-time from a stream of laser and inertial data, which is subsequently transformed into a 3D semantic point cloud. From the combined streams, Fast-LIO2 (<xref ref-type="bibr" rid="B38">Xu et al., 2022</xref>) is used to estimate the ego-motion of the laser. For instance segmentation, an adapted version of SegMatch (<xref ref-type="bibr" rid="B12">Dub&#xe9; et al., 2017</xref>) was used for plant identification. Unlike in Osiris-Nav, where a flexible segmentation target can be used, Osiris-Map relies on naturally delineated objects such as trees in agricultural regions. However, a similar keyframe approach is used to accumulate consecutive point clouds into a keyframe point cloud.</p>
</sec>
<sec id="s5-2">
<label>5.2</label>
<title>Robot layer</title>
<p>Similar to Osiris-Nav, the pose vertices in Osiris-Map&#x2019;s robot layer correspond to the keyframe positions obtained through LiDAR odometry. These pose vertices are enriched with a density image of the corresponding keyframe point cloud to enable subsequent localisation. The creation of the density image follows the approach proposed in MapClosures (<xref ref-type="bibr" rid="B15">Gupta et al., 2024</xref>). While SegMatch segment descriptors are useful for place recognition in many robotic scenes, as demonstrated in SegMap (<xref ref-type="bibr" rid="B13">Dub&#xe9; et al., 2018</xref>), their performance deteriorates when applied to the homogeneous scenes common in agricultural environments, which often contain imperceptible differences between plant instances. Density images, formed by accumulating multiple point clouds into a single projection, are more robust in this setting, as they capture not only individual instances but also scene-level relationships.</p>
<p>For Osiris-Map, a bird&#x2019;s-eye view <inline-formula id="inf76">
<mml:math id="m80">
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> projection of the keyframe point cloud was created and then discretised using a voxel grid. This projection <inline-formula id="inf77">
<mml:math id="m81">
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> contains points in the bounds shown in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>: <disp-formula id="e5">
<mml:math id="m82">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>The discretisation creates a grid <inline-formula id="inf78">
<mml:math id="m83">
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> of resolution <inline-formula id="inf79">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> metres per cell, with W and H as shown in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>: <disp-formula id="e6">
<mml:math id="m85">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>Each cell in the grid stores the number of points that fall within it after discretisation, which allows the definition of the greyscale density image <inline-formula id="inf80">
<mml:math id="m86">
<mml:mrow>
<mml:mi mathvariant="script">I</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as shown in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>:<disp-formula id="e7">
<mml:math id="m87">
<mml:mrow>
<mml:mi mathvariant="script">I</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
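<p>Equations 5&#x2013;7 can be summarised in a few lines of NumPy, as sketched below; the handling of border points and the absence of a guard against constant-count grids are simplifications for illustration.</p>
<preformat preformat-type="code">
import numpy as np

def density_image(points, v_res):
    """Greyscale BEV density image of a keyframe point cloud (Equations 5-7).

    points : (N, 3) keyframe point cloud; v_res : metres per cell
    """
    b_min = points[:, :2].min(axis=0)           # B_min = [x_min, y_min]
    b_max = points[:, :2].max(axis=0)           # B_max = [x_max, y_max]
    w, h = np.ceil((b_max - b_min) / v_res).astype(int)   # W, H (Equation 6)
    u, v = ((points[:, :2] - b_min) / v_res).astype(int).T
    u = np.clip(u, 0, w - 1)                    # keep border points in-grid
    v = np.clip(v, 0, h - 1)
    counts = np.zeros((w, h))
    np.add.at(counts, (u, v), 1)                # N(u, v): points per cell
    # Min-max normalisation (Equation 7); assumes a non-constant grid.
    return (counts - counts.min()) / (counts.max() - counts.min())
</preformat>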
<sec id="s5-2-1">
<label>5.2.1</label>
<title>Robot atlas</title>
<p>In Osiris-Nav, the scene graph atlas was powered by a database of DBoW descriptors from keyframe RGB images. In Osiris-Map, this formulation is adapted to use ORB (Oriented FAST and Rotated BRIEF) features extracted from the keyframe density images for place recognition. These features allow for place recognition across all keyframes presented to the atlas. Following place recognition, the pose graph constructed in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> can be optimised to produce a refined set of poses.</p>
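<p>A minimal sketch of this adaptation, using OpenCV&#x2019;s ORB implementation on the greyscale density image of <xref ref-type="disp-formula" rid="e7">Equation 7</xref>, is given below; the feature budget is an illustrative assumption.</p>
<preformat preformat-type="code">
import cv2
import numpy as np

def orb_descriptors(density_image):
    """ORB features on a greyscale BEV density image (cf. MapClosures).

    density_image : the [0, 1] image produced by Equation 7.
    """
    img = (density_image * 255).astype(np.uint8)   # 8-bit greyscale
    orb = cv2.ORB_create(nfeatures=500)            # illustrative budget
    keypoints, descriptors = orb.detectAndCompute(img, None)
    return keypoints, descriptors
</preformat>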
<p>Unlike Osiris-Nav, which required mesh deformation after optimisation, Osiris-Map can be regenerated on the fly without further optimisation by transforming keyframe point clouds to their updated positions.</p>
</sec>
</sec>
<sec id="s5-3">
<label>5.3</label>
<title>3D object detection</title>
<p>A similar Euclidean clustering approach is used for the detection of objects from the semantic point cloud produced in <italic>Layer 1</italic> to obtain the objects in <italic>Layer 3</italic>. Unlike Osiris-Nav, this process operates on the semantic point cloud rather than on mesh vertices, grouping together points that fall within the specified distance threshold.</p>
<p>The clustering process similarly yields a set of point clusters, each associated with a bounding box and its centroid. For each object obtained, the centroid position <inline-formula id="inf81">
<mml:math id="m88">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is used to create the object node and is added to the set of nodes <inline-formula id="inf82">
<mml:math id="m89">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> representing the object layer. At the same time, an edge is added between the centroid of the bounding box and the clustered points of the detected object. Finally, when objects are detected, edges are added between the object centroids and the pose vertices of the robot layer. After optimisation, object-node reconciliation is carried out similarly to Osiris-Nav, ensuring that trees or plants in the scene graph are not duplicated.</p>
</sec>
</sec>
<sec id="s6">
<label>6</label>
<title>NeurOsiris</title>
<p>Camera-based approaches, while powerful, often struggle with lighting conditions when creating dense reconstructions of the traversed areas. LiDAR-based approaches, on the other hand, provide more accurate information for the reconstruction of large-scale agricultural environments, as demonstrated in Osiris-Map, but their measurements are both sparser than camera images and devoid of colour information. These limitations restrict further interaction with the reconstructions, as they do not accurately reflect the observed scenes.</p>
<p>There has been a surge of popularity in the use of neural radiance fields (<xref ref-type="bibr" rid="B20">Mildenhall et al., 2021</xref>) for reconstructions, with approaches such as SiLVr (<xref ref-type="bibr" rid="B32">Tao and Fallon, 2025</xref>) and Mega-NeRF (<xref ref-type="bibr" rid="B35">Turki et al., 2022</xref>) showing the potential for the creation of large-scale, accurate, and interactable reconstructions. In this section, we present a novel formulation, NeurOsiris, which combines the reconstruction capabilities of modern neural radiance fields with a generated scene graph to enable further interaction.</p>
<sec id="s6-1">
<label>6.1</label>
<title>Perception layer</title>
<p>The perception layer, namely, <italic>Layer 1</italic>, of NeurOsiris is developed offline using a combination of visual inputs and LiDAR information. This differs from the incremental Osiris-Nav and Osiris-Map in that the entire scene&#x2019;s information is available for constructing the perception layer, which in this case consists of a trainable neural network for rendering. The rendering process for each pixel from an image of known pose follows an integral equation that is approximated using quadrature as follows:<disp-formula id="equ1">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>u</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>This expression calculates pixel colour as the sum of radiance values along the ray, weighted by the opacity of each point along the ray, with the weights calculated using:<disp-formula id="equ2">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The colours <inline-formula id="inf83">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and densities <inline-formula id="inf84">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are obtained by querying the neural network at discrete points along the ray.</p>
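<p>The quadrature above can be sketched in NumPy as follows, where the per-sample spacings, densities, and colours are assumed to be given; the array shapes and names are illustrative.</p>
<preformat preformat-type="code">
import numpy as np

def render_pixel(colours, sigmas, deltas):
    """Quadrature approximation of the volume rendering integral.

    colours : (N, 3) radiance c_i at the samples along the ray
    sigmas  : (N,)   densities sigma_i at those samples
    deltas  : (N,)   spacing delta_i between consecutive samples
    """
    alpha = 1.0 - np.exp(-deltas * sigmas)      # local opacity per sample
    # Transmittance: exp of the negative running sum of delta_j * sigma_j
    # over the samples that precede each sample i.
    trans = np.exp(-np.cumsum(np.concatenate([[0.0], deltas[:-1] * sigmas[:-1]])))
    weights = trans * alpha                     # the w_i defined above
    return (weights[:, None] * colours).sum(axis=0)   # rendered pixel colour
</preformat>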
<p>In classical NeRF models, the network is trained solely using a photometric loss between the rendered and ground-truth images. Photometric loss alone is insufficient for high-fidelity 3D reconstruction; therefore, we utilised a LiDAR depth-based supervision loss to encourage densities along rays to form unimodal distributions that peak around surface intersections, avoiding the &#x2018;floaters&#x2019; that commonly occur when this supervisory signal is absent. We adopted the depth loss used in urban radiance fields (<xref ref-type="bibr" rid="B26">Rematas et al., 2022</xref>):<disp-formula id="equ3">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">depth</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x222b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Since the ideal distribution of density along a ray is a Dirac delta centred at the depth measurement, the loss measures the deviation of the weight function from this ideal distribution. To make this computationally tractable, the Dirac delta is approximated using a normal distribution, and the integral is split into three sections so that samples away from the measured depth are encouraged to contain no 3D surface.</p>
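<p>A sketch of this loss in PyTorch is given below: a narrow Gaussian centred at the LiDAR depth stands in for the Dirac delta, and the squared deviation of the ray weights from it is penalised. The Gaussian width and the normalisation are illustrative choices, and the three-way split of the integral is omitted for brevity.</p>
<preformat preformat-type="code">
import torch

def depth_loss(weights, t_mid, depth, eps=0.05):
    """Sketch of the line-of-sight depth loss (Rematas et al., 2022).

    weights : (R, N) ray weights w_i from volume rendering
    t_mid   : (R, N) sample positions along each ray
    depth   : (R, 1) LiDAR depth measurement per ray
    eps     : width of the Gaussian standing in for the Dirac delta
    """
    target = torch.exp(-0.5 * ((t_mid - depth) / eps) ** 2)
    target = target / (target.sum(dim=-1, keepdim=True) + 1e-8)  # normalise
    return ((weights - target) ** 2).sum(dim=-1).mean()
</preformat>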
<p>Our method uses Nerfacto as a base model, which employs hash-grid encoding to ensure fast training and to generate a neural point cloud. As a single network usually does not have sufficient capacity, we follow the method of <xref ref-type="bibr" rid="B31">Tancik et al. (2022)</xref> and partition the scene into blocks for which point clouds can be individually extracted and fused to generate the complete perception layer. Similarly to language embedding radiance fields (LERF), we used the Adam optimiser for the proposal networks and fields with weight decay 10<sup>&#x2212;9</sup> and an exponential learning rate scheduler from 10<sup>&#x2212;2</sup> to 10<sup>&#x2212;3</sup> over the first 5,000 training steps, followed by another 10,000 training steps.</p>
</sec>
<sec id="s6-2">
<label>6.2</label>
<title>Robot layer</title>
<p>As mentioned in the section above, the training of the radiance field is conditional on images with known poses. As all images are present, the Global Structure-from-Motion approach (GLOMAP) (<xref ref-type="bibr" rid="B25">Pan et al., 2024</xref>) was used to obtain camera poses via bundle adjustment. These poses were scaled using LiDAR odometry to ensure that the renderings maintained the scene scale.</p>
</sec>
<sec id="s6-3">
<label>6.3</label>
<title>3D object detection</title>
<p>Semantic segmentation of images captured by NeurOsiris reveals object locations in the image plane, and with poses provided by GLOMAP and depths obtained from the LiDAR point cloud, a semantic point cloud of the entire scene can be generated. Euclidean clustering of this scene point cloud reveals the detected objects from which the subsequent scene graph layers can be obtained.</p>
</sec>
<sec id="s6-4">
<label>6.4</label>
<title>Scene graph grounding</title>
<p>The development of the NeurOsiris system has, thus far, not deviated significantly from the Osiris-Nav and Osiris-Map systems, apart from the integration of visual and LiDAR data, which enhances the accuracy and resolution of <italic>Layer 1</italic> within the scene graph. Notably, NeurOsiris incorporates the capability to anchor higher layers of the scene graph into the neural rendering. Our methodology parallels that of LERF, whereby we enhance the neural radiance field (NeRF) outputs with a language embedding denoted as <inline-formula id="inf85">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">lang</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. This embedding function accepts an input position <inline-formula id="inf86">
<mml:math id="m96">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and a physical scale <inline-formula id="inf87">
<mml:math id="m97">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, producing a <inline-formula id="inf88">
<mml:math id="m98">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional language embedding that is invariant to the viewpoint, thus synthesising information from multiple perspectives.</p>
<p>LERF adopts a multi-patch approach for language supervision and takes the CLIP embeddings of patches centred at pixels that rays originate from. Although this method is useful for open-vocabulary queries such as &#x201c;Show me a plant,&#x201d; it struggles with certain precision agricultural queries such as &#x201c;show me the plants in the first row.&#x201d; Therefore, we adopt a different embedding approach, using text embeddings of the numerical identifier that relates an object to a line, row, or section. We achieve this by creating a hierarchical semantic image, enhanced with object, line, and row IDs. The text embeddings of these numbers are then used to supervise the language embedding, allowing us to deal more naturally with homogeneous scenes and promote further interaction. A further benefit of these numerical embeddings is that, unlike open-world queries, where there is a substantial performance gap between African languages and English (<xref ref-type="bibr" rid="B23">Ojo et al., 2025</xref>), the use of Arabic numerals and digits is widespread across Africa. While the context surrounding these specific numerals is still necessary, the key embedding remains largely language-agnostic.</p>
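<p>A minimal sketch of how such numerical supervision targets could be produced with the OpenAI CLIP package is shown below; the prompt template and model variant are illustrative assumptions rather than the exact supervision used here.</p>
<preformat preformat-type="code">
import torch
import clip  # OpenAI CLIP package, as used in LERF-style pipelines

# Sketch: text embeddings of numerical IDs used to supervise the
# language field. The prompt template and model variant are assumptions.
model, _ = clip.load("ViT-B/32")
prompts = [f"row {i}" for i in range(1, 9)]     # numerical row IDs
tokens = clip.tokenize(prompts)
with torch.no_grad():
    text_emb = model.encode_text(tokens)
text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)  # unit-norm targets
</preformat>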
</sec>
</sec>
<sec id="s7">
<label>7</label>
<title>Experimental evaluation</title>
<p>We design our experiments to evaluate the capability of our system to perform navigation, mapping, and interaction in the context of precision agriculture. To ensure reproducibility and enable fair comparison, we utilise the CitrusFarm dataset (<xref ref-type="bibr" rid="B33">Teng et al., 2023</xref>), which is illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. This real-world dataset was collected in a commercial citrus orchard using a Clearpath Jackal mobile robot equipped with a Zed2i RGB-D camera and calibrated LiDAR, IMU, and GPS sensors. The dataset is challenging primarily due to its spatial extent: the orchard is approximately 250 m in length, and the traversal distances of the evaluated sequences range from 1.5 to 2 km. Details on the robot and the dataset collection are presented in <xref ref-type="bibr" rid="B33">Teng et al. (2023)</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Experimental area of the CitrusFarm dataset. The coloured lines represent the paths along which the robot was driven through the farm.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g002.tif">
<alt-text content-type="machine-generated">Aerial photograph of an orchard with rows of trees and colored paths overlaid, each path in a different color, including red, blue, green, purple, yellow, pink, brown, gray, and light blue, outlining separate sections.</alt-text>
</graphic>
</fig>
<p>The sequences used for this work&#x2019;s evaluation were recorded at different times of the day. We consider two separate recordings as two distinct robot input sessions and additionally subdivide one recording into ten partially overlapping segments, as depicted in <xref ref-type="fig" rid="F2">Figure 2</xref>. To evaluate accuracy at higher levels of the scene graph hierarchy, we generate ground-truth annotations by manually segmenting plant instances from the ground-truth LiDAR point cloud data. Axis-aligned bounding boxes were manually placed around individual plant objects by a domain expert using a custom graphical user interface (GUI). The annotated plant instances were subsequently manually grouped into planting lines, which were then aggregated into rows.</p>
<sec id="s7-1">
<label>7.1</label>
<title>Osiris-Nav</title>
<p>The key innovations of Osiris-Nav over the base Osiris introduced in this work were its multi-session performance and its visual navigation capabilities. The following section details their evaluation.</p>
<p>To characterise multi-session performance, particularly to assess the utility of the scene graph atlas, we construct a &#x201c;ground-truth&#x201d; single-session scene graph for the full sequences using <italic>Osiris</italic>. We then generate a second scene graph using multiple sessions as inputs to Osiris-Nav; the resulting multi-session scene graph is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. The multi-session inputs correspond to a subdivision of the full sequence used as input to <italic>Osiris</italic>, as outlined in <xref ref-type="fig" rid="F2">Figure 2</xref>. We compute the metrics described below for both the single-session and multi-session configurations, and we compare these results to quantify the effectiveness of the proposed multi-session capability.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>3D scene graph of a citrus farm. The layers of <italic>Osiris-Nav</italic> are (from lowest to highest) the 3D metric semantic mesh; the robot layer, which enables autonomy; the plant object layer, which identifies plant objects from the mesh; the planting line layer, which groups plant objects into planting lines; the rows layer, which groups two planting lines to form a row; and, finally, the farm section layer, which groups all rows together.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g003.tif">
<alt-text content-type="machine-generated">Diagram showing a hierarchical structure of a farm section, with nodes labeled farm section at the top, followed by rows, planting lines, plant objects, and metric semantic mesh at the robot layer, connected by colored lines.</alt-text>
</graphic>
</fig>
<p>For plant objects in Osiris-Nav, accuracy was calculated by the percentage of objects in the ground-truth annotated point cloud that have an object estimated by Osiris and Osiris-Nav with the correct semantic label within a specified radius (&#x201c;% found&#x201d;) and the percentage of objects in the estimated scene graph that have a ground-truth object with the correct semantic label within a specified radius (&#x201c;% correct&#x201d;).</p>
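<p>These two metrics can be sketched as follows for a single semantic class, with ground-truth and estimated object centroids matched within a radius; the helper name and the radius parameter are illustrative.</p>
<preformat preformat-type="code">
import numpy as np
from scipy.spatial import cKDTree

def found_and_correct(gt_centroids, est_centroids, radius):
    """Sketch of the '% found' and '% correct' metrics for one class.

    gt_centroids  : (N, 3) annotated ground-truth object centroids
    est_centroids : (M, 3) centroids estimated by the scene graph
    radius        : match radius in metres (illustrative threshold)
    """
    gt_tree, est_tree = cKDTree(gt_centroids), cKDTree(est_centroids)
    found = np.mean([len(est_tree.query_ball_point(p, radius)) > 0
                     for p in gt_centroids])
    correct = np.mean([len(gt_tree.query_ball_point(p, radius)) > 0
                       for p in est_centroids])
    return 100.0 * found, 100.0 * correct
</preformat>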
<p>From <xref ref-type="fig" rid="F4">Figure 4</xref>, it can be observed that <italic>Osiris</italic> achieves adequate levels of objects found and objects correctly identified; however, these results are not exceptional, which can be attributed to the accuracy of depth estimation from the visual sensors. <italic>Osiris-Nav</italic> has a slightly lower number of objects that are both correctly identified and found. This is typical of multi-merging techniques, with a slight penalty incurred, particularly due to overlapping mesh faces at the edges of merged maps, which can result in missed detections.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Graph showing objects found and correctly identified objects for a single robot (Osiris) vs. a multi-robot (Osiris-Nav) system.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g004.tif">
<alt-text content-type="machine-generated">Bar chart compares single and multi conditions on objects metric. Single bars are higher for both percent found and percent correct, with percent found being the highest overall. Legend indicates single in blue and multi in orange.</alt-text>
</graphic>
</fig>
<p>However, when higher levels of the scene graph were queried, specifically, lines and rows, as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, similar values for Osiris-Nav and the base Osiris were reported, indicating that Osiris-Nav accurately merges multiple sessions. Qualitatively, the scene-level perception accurately captures the structure of the environment, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. The contrast between the 3D perception layer metrics and the scene-level perception metrics indicates that scene graph merging has less impact at the higher levels of the graph.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Graph showing line detection precision and recall and row detection precision and recall for a single robot (Osiris) vs. a multi-robot (Osiris-Nav) system. Variable scene-level perception parameters were kept the same for both systems.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g005.tif">
<alt-text content-type="machine-generated">Bar chart comparing precision and recall for line and row detection metrics with two methods: Single (blue) and Multi (orange). Single outperforms Multi in both metrics, as shown by higher bars.</alt-text>
</graphic>
</fig>
<p>To assess the autonomy capability, we construct a teach pass using sequence 1 from the CitrusFarm dataset (<xref ref-type="bibr" rid="B33">Teng et al., 2023</xref>). While the accuracy metrics in <xref ref-type="fig" rid="F4">Figure 4</xref> suggest that localisation occurs across shared regions, scene reconstruction tasks generally require only sparse localisations. In contrast, navigation&#x2014;particularly under a visual teach and repeat (VT&#x26;R) framework&#x2014;demands more frequent localisations to minimise drift relative to the teach pass. Adopting the method proposed by <xref ref-type="bibr" rid="B11">Dequaire et al. (2016)</xref>, we define a localisation envelope to explicitly estimate the localisation performance of the robot layer for VT&#x26;R. In this approach, a Gaussian process (GP) is trained to predict the localisation envelope, including in areas that have not yet been revisited. The GP is trained on a sequence from the same citrus farm that overlaps with part of the teach pass, allowing us to test whether the system can reliably localise and provide high-level guidance on the robot&#x2019;s required orientation to follow the teach path. The results in <xref ref-type="fig" rid="F6">Figure 6</xref> demonstrate that the scene graph, via its robot layer, can successfully localise during autonomous operation, indicating that the proposed framework can enable ground-robot autonomy in agricultural settings.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Predicted vs. actual localisation performance using the robot layer.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g006.tif">
<alt-text content-type="machine-generated">Side-by-side scatter plots show actual and predicted localisation data points colored by a value scale from approximately zero point seven five to zero point nine. Both plots display a similar distribution and structure, with vertical alignment and a horizontal cluster at higher y-values. Color bars indicating value ranges are included for each plot.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s7-2">
<label>7.2</label>
<title>Osiris-Map</title>
<p>The key innovation of Osiris-Map over the base Osiris was its retention of higher-precision information for mapping purposes. For the evaluation of Osiris-Map, three sequences from the Field-One dataset, shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, were used. The first sequence consisted of data taken in a lawnmower configuration where every row was traversed once. This was then compared to the second sequence, in which the robot traversed every fifth row, and the third sequence, in which the robot went through every other row and then looped back until all the rows had been traversed.</p>
<p>The evaluation commenced with an analysis of <italic>Layer 1</italic> of Osiris-Map, specifically, the accumulated point cloud generated throughout the sequence, to assess its precision and its preservation of fine-grained geometric information. The accumulated point clouds from sequences 2 and 3 were thus compared with the point cloud generated in sequence 1 to compute the accuracy metrics shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. The error histograms show that, even across the large observed area, the point clouds generated are largely self-similar, with significant overlap, despite the three divergent sequences starting from different locations.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Error histogram between the accumulated point clouds of sequences 2 and 1 on the left, along with sequences 3 and 1 on the right.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g007.tif">
<alt-text content-type="machine-generated">Two side-by-side histograms display C2C absolute distances with counts on the y-axis and distance values on the x-axis. Both charts show a steep decline with most counts clustered near zero. The left plot includes 1,130,791 values; the right plot includes 1,128,329 values. Each uses 256 histogram bins and demonstrates similar distributions, emphasizing that most C2C distances are small.</alt-text>
</graphic>
</fig>
<p>Attention was then directed to the object detection performance of Osiris-Map, as shown in <xref ref-type="fig" rid="F8">Figures 8</xref>, <xref ref-type="fig" rid="F9">9</xref>, with the manually labelled accumulated point cloud for each sequence serving as the ground-truth. In all three sequences, only a single tree of the 282 trees in the citrus farm was not detected. While this could be expected in sequences consisting of well-separated citrus trees, given the depth accuracy of LiDAR scans, the low odometry drift and strong object-merging ability of Osiris-Map were still able to prevent over-segmentation of the scene when it was approached from different viewpoints and over long loops. The accuracy of the lower levels of Osiris-Map propagated to the higher levels, with perfect precision and recall recorded for the lines and rows layers.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Object detections in Osiris-Map. The point cloud generated from Osiris-Map is overlaid with RGB colour information for visualisation. Osiris-Map is able to create a distinct bounding box for each tree in this point cloud to form its <italic>Layer 3</italic>.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g008.tif">
<alt-text content-type="machine-generated">Computer-generated 3D visualization shows blue wireframe boxes representing obstacles and colored arrows indicating paths and vectors through a simulated environment, likely related to robotic navigation or path planning research.</alt-text>
</graphic>
</fig>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Zoomed-out view of the scene captured through Osiris-Map. In this, the point cloud generated from Osiris-Map is overlaid with RGB colour information for visualisation, with object detection bounding boxes also being showcased.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g009.tif">
<alt-text content-type="machine-generated">Three-dimensional point cloud visualization of a tree plantation with blue rectangular boxes outlining each tree, showing rows of vegetation on brown soil against a plain dark background.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s7-3">
<label>7.3</label>
<title>NeurOsiris</title>
<p>The key innovation of NeurOsiris, compared to the base Osiris, lies in its increased photorealism and scene-level grounding. For the evaluation of this approach, five sample rows from sequence 1 of the CitrusFarm dataset were selected for training and evaluation.</p>
<p>To evaluate the photorealism of NeurOsiris, the peak signal-to-noise ratio (PSNR) and structural similarity index measure (SSIM) were used. For NeurOsiris, this resulted in PSNR and SSIM scores of 17.18 and 4.04, respectively, when averaged across the row samples. The base Nerfacto model returned scores of 19.09 and 5.05 across the same scenes. To enable natural-language queries, similarly to LERF, NeurOsiris employs a larger hash grid than pure RGB approaches, which affects its raw photorealism performance, as shown by the comparison with Nerfacto. However, as shown in <xref ref-type="fig" rid="F10">Figure 10</xref>, the outputs exhibit a significant increase in photorealism compared to the previous base Osiris variants.</p>
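<p>For reference, both metrics can be computed with scikit-image as sketched below, assuming the rendered and captured views are float images in the range [0, 1].</p>
<preformat preformat-type="code">
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

def photorealism_metrics(rendered, ground_truth):
    """PSNR and SSIM between a rendered view and the captured image.

    Both inputs are float arrays in [0, 1] of shape (H, W, 3).
    """
    psnr = peak_signal_noise_ratio(ground_truth, rendered, data_range=1.0)
    ssim = structural_similarity(ground_truth, rendered,
                                 channel_axis=-1, data_range=1.0)
    return psnr, ssim
</preformat>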
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Novel view rendering from NeurOsiris.</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g010.tif">
<alt-text content-type="machine-generated">Dirt road bordered by trees under a clear blue sky, rendered with a blurred effect that gives the scene a dreamlike or abstract quality.</alt-text>
</graphic>
</fig>
<p>To evaluate language grounding, relevance maps for object and row queries, as shown in <xref ref-type="fig" rid="F11">Figures 11</xref>, <xref ref-type="fig" rid="F12">12</xref>, were generated for the sample rows. These were compared to the semantic segmentation images used to train NeurOsiris, and the mean intersection over union (mIoU) was calculated. For objects, an mIoU of 67% was achieved, while a slightly higher mIoU of 72% was obtained for rows, demonstrating the grounding of the scene graph within the neural rendering.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Novel view rendering and the accompanying relevance map for the prompt &#x201c;Show me the fourth plant.&#x201d;</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g011.tif">
<alt-text content-type="machine-generated">Two-panel comparison showing a dirt road bordered by trees under a blue sky on the left, and a corresponding false-color heatmap visualization on the right highlighting objects with colors such as blue, green, orange, and purple.</alt-text>
</graphic>
</fig>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Novel view rendering and the accompanying relevance map for the prompt &#x201c;Show me the plants in row 1.&#x201d;</p>
</caption>
<graphic xlink:href="frobt-13-1732004-g012.tif">
<alt-text content-type="machine-generated">Left panel shows a blurry landscape photograph of a tree-lined dirt road under a blue sky; right panel presents a false-color segmented version highlighting trees in blue and background in purple.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s8">
<label>8</label>
<title>Conclusions and future work</title>
<p>In this article, we present a systematic approach that expands hierarchical graph approaches for different precision agricultural tasks. We develop the notion of a flexible scene-graph approach and modify it for navigation, mapping, and interaction tasks while overcoming the challenge of the homogeneous nature of agricultural environments. While Osiris&#x2b;&#x2b; demonstrates the utility of hierarchical scene graphs for precision agriculture, several avenues remain for extending its capabilities, particularly in the context of diverse agricultural environments.</p>
<sec id="s8-1">
<label>8.1</label>
<title>Multilingual human&#x2013;robot interaction</title>
<p>A primary motivation of this work is to lower the barrier to the adoption of agricultural robotics in Africa. Although NeurOsiris uses numerical embeddings to bypass some of the challenges in multilingual queries, it can currently only understand the context around the numerical embeddings in English. Future work will support the expansion of this framework to explicitly accommodate local African languages such as isiXhosa, isiZulu, and Swahili. This will enable field workers to issue commands (e.g., &#x201c;Ndibonise izityalo kumqolo 1&#x201d;/&#x201c;Show me the plants in row 1&#x201d;) in their native tongue, which is a critical step towards inclusive technology adoption.</p>
</sec>
<sec id="s8-2">
<label>8.2</label>
<title>Closed-loop autonomy and navigation</title>
<p>In this work, we validated the autonomy capability of Osiris-Nav through predicted localisation envelopes. Future research will transition this to a fully closed-loop navigation system, utilising the VT&#x26;R framework directly on the robot hardware. This will involve quantifying real-time path-tracking error and obstacle avoidance performance in dynamic field conditions, moving beyond the current open-loop evaluations.</p>
</sec>
<sec id="s8-3">
<label>8.3</label>
<title>Spatio-temporal (4D) monitoring</title>
<p>Agricultural environments are highly dynamic, with crops undergoing significant changes over a season. The current system treats the environment as static within a session. We aim to extend the scene graph formulation to a 4D representation that can track individual plant growth, fruit yield, and disease progression over weeks or months. This &#x201c;long-term memory&#x201d; would allow the scene graph atlas to not only merge sessions but also highlight temporal anomalies for farmers.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s9">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material; further enquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s10">
<title>Author contributions</title>
<p>AM: Writing &#x2013; original draft. AS-A: Writing &#x2013; original draft. TM: Writing &#x2013; original draft. IN: Writing &#x2013; original draft. RS: Writing &#x2013; original draft. VH: Writing &#x2013; original draft. PA: Writing &#x2013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="s12">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s13">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. Generative AI was used for grammar and proofing purposes only.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s14">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aivazidou</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Tsolakis</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Transitioning towards human&#x2013;robot synergy in agriculture: a systems thinking perspective</article-title>. <source>Syst. Res. Behav. Sci.</source> <volume>40</volume>, <fpage>536</fpage>&#x2013;<lpage>551</lpage>. <pub-id pub-id-type="doi">10.1002/sres.2962</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ajoudani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zanchettin</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Ivaldi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Albu-Sch&#xe4;ffer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kosuge</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Khatib</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Progress and prospects of the human&#x2013;robot collaboration</article-title>. <source>Aut. Robots</source> <volume>42</volume>, <fpage>957</fpage>&#x2013;<lpage>975</lpage>. <pub-id pub-id-type="doi">10.1007/s10514-017-9677-2</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Amayo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pini&#xe9;s</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Paz</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Newman</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Geometric multi-model fitting with a convex relaxation algorithm</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>8138</fpage>&#x2013;<lpage>8146</lpage>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Diao</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Vision-based navigation and guidance for agricultural autonomous vehicles and robots: a review</article-title>. <source>Comput. Electron. Agric.</source> <volume>205</volume>, <fpage>107584</fpage>. <pub-id pub-id-type="doi">10.1016/j.compag.2022.107584</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bavle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Sanchez-Lopez</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Shaheer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Civera</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Voos</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Situational graphs for robot navigation in structured indoor environments</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>7</volume>, <fpage>9107</fpage>&#x2013;<lpage>9114</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2022.3189785</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bavle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Sanchez-Lopez</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Shaheer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Civera</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Voos</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>S-graphs&#x2b;: real-time localization and mapping leveraging hierarchical representations</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>8</volume> (<issue>8</issue>), <fpage>4927</fpage>&#x2013;<lpage>4934</lpage>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Benos</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Moysiadis</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kateris</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Tagarakis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Busato</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pearson</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Human&#x2013;robot interaction in agriculture: a systematic review</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>6776</fpage>. <pub-id pub-id-type="doi">10.3390/s23156776</pub-id>
<pub-id pub-id-type="pmid">37571559</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Campos</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Elvira</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Rodriguez</surname>
<given-names>J. J. G.</given-names>
</name>
<name>
<surname>Montiel</surname>
<given-names>J. M. M.</given-names>
</name>
<name>
<surname>Tardos</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Orb-slam3: an accurate open-source library for visual, visual&#x2013;inertial, and multimap slam</article-title>. <source>IEEE Trans. Robotics</source> <volume>37</volume> (<issue>6</issue>), <fpage>1874</fpage>&#x2013;<lpage>1890</lpage>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hughes</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ray</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Carlone</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Hydra-multi: collaborative online construction of 3d scene graphs with multi-robot teams</article-title>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dellaert</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Factor graphs and gtsam: a hands-on introduction</article-title>. <source>GA. Inst. Technol. Tech. Rep</source> <volume>2</volume>, <fpage>4</fpage>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Dequaire</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Churchill</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Posner</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Off the beaten track: predicting localisation performance in visual teach and repeat</article-title>,&#x201d; in <source>2016 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>795</fpage>&#x2013;<lpage>800</lpage>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Dub&#xe9;</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dugas</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Stumm</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Nieto</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Siegwart</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cadena</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Segmatch: segment based place recognition in 3d point clouds</article-title>,&#x201d; in <source>2017 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>5266</fpage>&#x2013;<lpage>5272</lpage>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dub&#xe9;</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cramariuc</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dugas</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Nieto</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Siegwart</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cadena</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>SegMap: 3D segment mapping using data-driven descriptors</article-title>,&#x201d; in <source>Robotics: science and systems online proceedings 14</source>.</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duckett</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Pearson</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Blackmore</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Grieve</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.-H.</given-names>
</name>
<name>
<surname>Cielniak</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Agricultural robotics: the future of robotic agriculture</article-title>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guadagnino</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Mersch</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Vizzo</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Stachniss</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Effectively detecting loop closures using point cloud density maps</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>10260</fpage>&#x2013;<lpage>10266</lpage>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Carlone</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Hydra: a real-time spatial perception system for 3d scene graph construction and optimization</article-title>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Talak</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Abdulhai</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Strader</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Foundations of spatial perception for robotics: hierarchical representations and real-time systems</article-title>. <comment>arXiv preprint arXiv:2305.07154</comment>.</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Grounding dino: marrying dino with grounded pre-training for open-set object detection</article-title>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lytridis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kaburlasos</surname>
<given-names>V. G.</given-names>
</name>
<name>
<surname>Pachidis</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Manios</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vrochidou</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kalampokas</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>An overview of cooperative robotics in agriculture</article-title>. <source>Agronomy</source> <volume>11</volume>, <fpage>1818</fpage>. <pub-id pub-id-type="doi">10.3390/agronomy11091818</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mildenhall</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>P. P.</given-names>
</name>
<name>
<surname>Tancik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Barron</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Ramamoorthi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Nerf: representing scenes as neural radiance fields for view synthesis</article-title>. <source>Commun. ACM</source> <volume>65</volume>, <fpage>99</fpage>&#x2013;<lpage>106</lpage>. <pub-id pub-id-type="doi">10.1145/3503250</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moysiadis</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Benos</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Karras</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kateris</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Peruzzi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Berruto</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Human&#x2013;robot interaction through dynamic movement recognition for agricultural environments</article-title>. <source>AgriEngineering</source> <volume>6</volume>, <fpage>2494</fpage>&#x2013;<lpage>2512</lpage>. <pub-id pub-id-type="doi">10.3390/agriengineering6030146</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Mukuddem</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Amayo</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Osiris: building hierarchical representations for agricultural environments</article-title>,&#x201d; in <source>2024 IEEE international conference on robotics and automation (ICRA)</source>, <fpage>15797</fpage>&#x2013;<lpage>15803</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA57147.2024.10610723</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ojo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ogundepo</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Oladipo</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ogueji</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Stenetorp</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). &#x201c;<article-title>Afrobench: how good are large language models on African languages?</article-title>,&#x201d; in <source>Findings of the association for computational linguistics: ACL 2025</source>, <fpage>19048</fpage>&#x2013;<lpage>19095</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Oleynikova</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Fehr</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Siegwart</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Nieto</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Voxblox: incremental 3d euclidean signed distance fields for on-board mav planning</article-title>,&#x201d; in <source>2017 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1366</fpage>&#x2013;<lpage>1373</lpage>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Pan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bar&#xe1;th</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Pollefeys</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sch&#xf6;nberger</surname>
<given-names>J. L.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Global structure-from-motion revisited</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>58</fpage>&#x2013;<lpage>77</lpage>.</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Rematas</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>P. P.</given-names>
</name>
<name>
<surname>Barron</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Tagliasacchi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Funkhouser</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Urban radiance fields</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>12932</fpage>&#x2013;<lpage>12942</lpage>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rosinol</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Violette</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Abate</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hughes</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Kimera: from slam to spatial perception with 3d dynamic scene graphs</article-title>. <source>Int. J. Robotics Res.</source> <volume>40</volume>, <fpage>1510</fpage>&#x2013;<lpage>1546</lpage>. <pub-id pub-id-type="doi">10.1177/02783649211056674</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Rusu</surname>
<given-names>R. B.</given-names>
</name>
<name>
<surname>Cousins</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>3d is here: point cloud library (pcl)</article-title>,&#x201d; in <source>2011 IEEE international conference on robotics and automation</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>4</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Strader</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Hughes</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Speranzon</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Carlone</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Informing 3d scene graph generation with common-sense spatial knowledge</article-title>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tagarakis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Benos</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Aivazidou</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Anagnostis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kateris</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bochtis</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Wearable sensors for identifying activity signatures in human-robot collaborative agricultural environments</article-title>. <source>Eng. Proc.</source> <volume>9</volume>, <fpage>5</fpage>. <pub-id pub-id-type="doi">10.1051/epjconf/202129102005</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tancik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Casser</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Pradhan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mildenhall</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>P. P.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Block-nerf: scalable large scene neural view synthesis</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>8248</fpage>&#x2013;<lpage>8258</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fallon</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Silvr: scalable lidar-visual radiance field reconstruction with uncertainty quantification</article-title>. <source>IEEE Trans. Robotics</source> <volume>42</volume>, <fpage>98</fpage>&#x2013;<lpage>114</lpage>.</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Teng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Karydis</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Multimodal dataset for localization, mapping and crop monitoring in citrus tree farms</article-title>,&#x201d; in <source>International symposium on visual computing</source>, <fpage>571</fpage>&#x2013;<lpage>582</lpage>.</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Herrera Arias</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Nieto-Granda</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>How</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Carlone</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Kimera-multi: robust, distributed, dense metric-semantic slam for multi-robot systems</article-title>. <source>IEEE Trans. Robotics</source> <volume>38</volume>, <fpage>2022</fpage>&#x2013;<lpage>2038</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2021.3137751</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Turki</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ramanan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Satyanarayanan</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Mega-nerf: scalable construction of large-scale nerfs for virtual fly-throughs</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>12922</fpage>&#x2013;<lpage>12931</lpage>.</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>V&#xe1;sconez</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Auat Cheein</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Workload and production assessment in the avocado harvesting process using human-robot collaborative strategies</article-title>. <source>Biosyst. Eng.</source> <volume>223</volume>, <fpage>56</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1016/j.biosystemseng.2022.06.010</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Repvit-sam: towards real-time segmenting anything</article-title>.</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Fast-lio2: fast direct lidar-inertial odometry</article-title>. <source>IEEE Trans. Robotics</source> <volume>38</volume>, <fpage>2053</fpage>&#x2013;<lpage>2073</lpage>. <pub-id pub-id-type="doi">10.1109/tro.2022.3141876</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/972984/overview">Johann Laconte</ext-link>, INRAE, France</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3174407/overview">George Adamides</ext-link>, Agricultural Research Institute, Cyprus</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3256123/overview">Zichen Huang</ext-link>, Zhejiang University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3257507/overview">Riccardo Bertoglio</ext-link>, INRAE, France</p>
</fn>
</fn-group>
</back>
</article>