<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title-group>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1644230</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2025.1644230</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Dense mapping from sparse visual odometry: a lightweight uncertainty-guaranteed depth completion method</article-title>
<alt-title alt-title-type="left-running-head">Yang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2025.1644230">10.3389/frobt.2025.1644230</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Daolong</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3094956"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Xudong</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Haoyuan</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Haoyang</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Chengcai</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xu</surname>
<given-names>Kun</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/979986"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ding</surname>
<given-names>Xilun</given-names>
</name>
<xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/789572"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>School of Mechanical Engineering and Automation, Beihang University</institution>, <city>Beijing</city>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Chengcai Wang, <email xlink:href="cc_wang@buaa.edu.cn">cc_wang@buaa.edu.cn</email>; Kun Xu, <email xlink:href="xk007@buaa.edu.cn">xk007@buaa.edu.cn</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-09-22">
<day>22</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1644230</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>06</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>08</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Yang, Zhang, Liu, Wu, Wang, Xu and Ding.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Yang, Zhang, Liu, Wu, Wang, Xu and Ding</copyright-holder>
<license>
<ali:license_ref start_date="2025-09-22">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Visual odometry (VO) has been widely deployed on mobile robots for spatial perception. While state-of-the-art VO offers robust localization, the maps it generates are often too sparse for downstream tasks due to insufficient depth data. Although depth completion methods can estimate dense depth from sparse data, the extreme sparsity and highly uneven distribution of depth signals in VO (&#x223C; 0.15% of the pixels in the depth image available) pose significant challenges.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address this issue, we propose a lightweight Image-Guided Uncertainty-Aware Depth Completion Network (IU-DC) for completing sparse depth from VO. This network integrates color and spatial information into a normalized convolutional neural network to tackle the sparsity issue and simultaneously outputs dense depth and associated uncertainty. The estimated depth is uncertainty-aware, allowing for the filtering of outliers and ensuring precise spatial perception.</p>
</sec>
<sec>
<title>Results</title>
<p>The superior performance of IU-DC compared to SOTA is validated across multiple open-source datasets in terms of depth and uncertainty estimation accuracy. In real-world mapping tasks, by integrating IU-DC with the mapping module, we achieve 50 &#x00D7; more reconstructed volumes and 78% coverage of the ground truth with twice the accuracy compared to SOTA, despite having only 0.6 M parameters (just 3% of the size of the SOTA).</p>
</sec>
<sec>
<title>Discussion</title>
<p>Our code will be released at <ext-link ext-link-type="uri" xlink:href="https://github.com/YangDL-BEIHANG/Dense-mapping-from-sparse-visual-odometry/tree/d5a11b4403b5ac2e9e0c3644b14b9711c2748bf9">https://github.com/YangDL-BEIHANG/Dense-mapping-from-sparse-visual-odometry/tree/d5a11b4403b5ac2e9e0c3644b14b9711c2748bf9</ext-link>.</p>
</sec>
</abstract>
<kwd-group>
<kwd>mapping</kwd>
<kwd>deep learning for visual perception</kwd>
<kwd>visual odometry</kwd>
<kwd>depth completion</kwd>
<kwd>uncertainty estimation</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>National Natural Science Foundation of China</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100001809</institution-id>
</institution-wrap>
</funding-source>
<award-id rid="sp1">T2121003 52375003 U22B2080</award-id>
</award-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by National Natural Science Foundation of China under Grant U22B2080, Grant T2121003 and Grant 52375003.</funding-statement>
</funding-group>
<counts>
<fig-count count="10"/>
<table-count count="5"/>
<equation-count count="18"/>
<ref-count count="38"/>
<page-count count="15"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Robot Vision and Artificial Perception</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Constructing a detailed and accurate map of the environment is a core task in the spatial perception of mobile robots (<xref ref-type="bibr" rid="B19">Malakouti-Khah et al., 2024</xref>). Visual odometry (VO) is widely used on mobile robots for perception due to its computational efficiency and adaptability to various environments (<xref ref-type="bibr" rid="B14">Labb&#xe9; and Michaud, 2022</xref>; <xref ref-type="bibr" rid="B2">Aguiar et al., 2022</xref>). While state-of-the-art VO provides accurate localization, the resulting sparse depth data often leads to incomplete maps with insufficient spatial information, posing challenges for downstream tasks (<xref ref-type="bibr" rid="B3">Araya-Martinez et al., 2025</xref>). With breakthroughs in the computer vision community, sparse depth data can be completed using depth completion approaches (<xref ref-type="bibr" rid="B20">Mathew et al., 2023</xref>; <xref ref-type="bibr" rid="B33">Wan et al., 2025</xref>), offering a pathway to achieving dense maps in VO. However, the extreme sparsity of depth in VO can only offer limited prior knowledge and still poses significant challenges for depth completion approaches to estimate accurate dense depth for mapping.</p>
<p>Recent developments in depth completion approaches have achieved high accuracy on datasets even with limited input data through carefully designed feature extraction mechanisms and sophisticated network architectures (<xref ref-type="bibr" rid="B4">Chen et al., 2023</xref>; <xref ref-type="bibr" rid="B16">Liu et al., 2023</xref>; <xref ref-type="bibr" rid="B15">Liu et al., 2022</xref>). However, the computational load and memory requirements hinder their practical implementation on mobile robots with limited memory capacity. Additionally, even approaches with high accuracy on datasets still produce a non-negligible number of outliers during inference, leading to false mapping of the environment for robots (<xref ref-type="bibr" rid="B29">Tao et al., 2022</xref>). Several previous works have attempted to estimate both dense depth and associated uncertainty within a lightweight network architecture, using the uncertainty to reevaluate depth estimation (<xref ref-type="bibr" rid="B29">Tao et al., 2022</xref>; <xref ref-type="bibr" rid="B18">Ma and Karaman, 2018</xref>). These works have demonstrated real-world applications in reconstruction, motion planning, and localization. However, most of these works primarily consider inputs from LIDAR or incomplete depth images from depth cameras, which tend to exhibit lower sparsity and a more uniform distribution compared to data obtained from VO.</p>
<p>Following this method, we propose a novel depth completion network inspired by the normalized convolutional neural network (NCNN) (<xref ref-type="bibr" rid="B5">Eldesokey et al., 2019</xref>) to complete the extremely sparse depth data from VO. The pipeline of our method is presented in <xref ref-type="fig" rid="F1">Figure 1</xref>. We name our approach Image-Guided Uncertainty-Aware Depth Completion Network (IU-DC). Our contributions can be summarized as follows:<list list-type="bullet">
<list-item>
<p>We introduce a Confidence Refine Block that integrates image features into the multi-resolution propagation of NCNN layers, effectively addressing the lack of priors in the sparse input from VO.</p>
</list-item>
<list-item>
<p>We propose using a map probability density function with the Inverse Sensor Model in the final uncertainty estimation after the last layer of NCNN, enhancing the spatial awareness of the outputs. The accurate uncertainty estimated by IU-DC can then be used to filter out outliers in the depth estimation, providing a more reliable input for mapping.</p>
</list-item>
<list-item>
<p>The superior performance of IU-DC has been validated against SOTA across multiple datasets in terms of depth and uncertainty estimation. We also conducted mapping experiments on both open-source datasets and our own sequences to support our claims. Our approach reconstructs <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> more volumes than VO, achieving <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mn>78</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> coverage of the ground truth with twice the accuracy compared to SOTA. Despite these improvements, IU-DC occupies only 2.76 MB of memory and can achieve near real-time performance on NVIDIA Xavier NX. We are planning to release the code to support future research.</p>
</list-item>
</list>
</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Pipeline of robot mapping with our approach. A dense map of the environment is constructed using only camera images and extremely sparse depth from VO with the proposed IU-DC. <bold>(a)</bold> RGB frame from the camera; <bold>(b)</bold> sparse depth from visual odometry; <bold>(c,d)</bold> dense depth and the associated uncertainty estimated by our network; <bold>(e)</bold> filtered depth obtained using the predicted uncertainty.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g001.tif">
<alt-text content-type="machine-generated">Flowchart depicting a visual odometry system workflow. Camera image (a) feeds into IU-DC, producing sparse depth (b) and dense depth (c). Uncertainty (d) is derived, and filtered depth (e) is routed through outlier rejection, feeding into a mapping module for dense and sparse maps. Below are images illustrating camera input, sparse depth, dense depth, and the filtered depth outputs.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2-1">
<label>2.1</label>
<title>Depth completion with uncertainty awareness</title>
<p>We first briefly review recent developments in depth completion approaches that address both depth and uncertainty estimation. A widely adopted approach involves introducing a second decoder to the original network to output uncertainty. <xref ref-type="bibr" rid="B22">Popovi&#x107; et al. (2021)</xref> and <xref ref-type="bibr" rid="B29">Tao et al. (2022)</xref> both employed dual decoders to output depth estimation and uncertainty, demonstrating applications in robot mapping and path planning. However, their input sparsity is much lower than that of VO. <xref ref-type="bibr" rid="B24">Qu et al. (2021)</xref> introduced a Bayesian Deep Basis Fitting approach that can be concatenated with a base model to generate high-quality uncertainty, even with sparse or no depth input. However, its performance is highly dependent on the base model, making it difficult to achieve in a lightweight network architecture. Additionally, approaches such as ensembling and MC-dropout can estimate uncertainty without modifying the original network (<xref ref-type="bibr" rid="B8">Gustafsson et al., 2020</xref>). However, these methods involve a time-consuming inference process, which hinders real-time performance on robots.</p>
<p>Another promising approach is based on the theory of confidence-equipped signals in normalized convolution. <xref ref-type="bibr" rid="B5">Eldesokey et al. (2019)</xref> proposed a normalized convolutional neural network (NCNN) that generates continuous confidence maps for depth completion using limited network parameters. They further refined their work to obtain a probabilistic version of NCNN in (<xref ref-type="bibr" rid="B6">Eldesokey et al., 2020</xref>). Though the NCNN demonstrates outstanding performance in both depth completion and uncertainty estimation, it can only be used in an unguided manner due to algebraic constraints. This limitation results in performance degradation when the input has high sparsity due to a lack of prior information (<xref ref-type="bibr" rid="B9">Hu et al., 2022</xref>). <xref ref-type="bibr" rid="B31">Teixeira et al. (2020)</xref> attempted to extend NCNN into an image-guided method by concatenating the image with the outputs from NCNN into another network to generate the final prediction. While this approach improved depth completion accuracy, the resulting uncertainty lacked the continuity inherently modeled by NCNN. In this work, our proposed IU-DC extends NCNN into an image-guided approach to address the sparsity issue while maintaining inherent continuity to generate precise uncertainty estimation.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Depth completion from sparse VO</title>
<p>Several recent works have addressed the challenge of completing sparse depth from VO (<xref ref-type="bibr" rid="B15">Liu et al., 2022</xref>; <xref ref-type="bibr" rid="B35">Wong et al., 2020</xref>; <xref ref-type="bibr" rid="B21">Merrill et al., 2021</xref>; <xref ref-type="bibr" rid="B34">Wofk et al., 2023</xref>). <xref ref-type="bibr" rid="B35">Wong et al. (2020)</xref> adopted an unsupervised approach, utilizing a predictive cross-modal criterion to train a network for inferring dense depth. <xref ref-type="bibr" rid="B15">Liu et al. (2022)</xref> adopted an adaptive knowledge distillation approach that allows the student model to leverage a blind ensemble of teacher models for depth prediction. <xref ref-type="bibr" rid="B34">Wofk et al. (2023)</xref> performed global scale and shift alignment with respect to sparse metric depth, followed by learning-based dense alignment, achieving state-of-the-art performance in depth completion accuracy. Although the sparsity issue of VO has been addressed in depth completion processes, few works are uncertainty-aware and demonstrate evaluations in mapping tasks.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<label>3</label>
<title>Methodology</title>
<sec id="s3-1">
<label>3.1</label>
<title>Overall network architecture</title>
<p>Our network comprises three main modules: the Input Confidence Estimation Network, which takes camera images and sparse depth as input and estimates the confidence mask input to the first NCNN layer; the Image-Guided Normalized Convolutional Neural Network, which uses NCNN as the backbone and refines the confidence output from NCNN layers at different resolutions with image features using the proposed Confidence Refine Block; and the Model-Based Uncertainty Estimation Network, which takes the estimated depth and the confidence output from the last NCNN layer to estimate the final output uncertainty for each datum. The overall architecture of our network is presented in <xref ref-type="fig" rid="F2">Figure 2</xref>, and the details of each module are explained in the following sections.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>An overview of the proposed IU-DC. <bold>(1)</bold> Input Confidence Estimation Network, <bold>(2)</bold> Model-Based Uncertainty Estimation Network. The middle section with the Confidence Refine Block (Conf. Refine Block) represents the Image-Guided Normalized Convolutional Neural Network.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g002.tif">
<alt-text content-type="machine-generated">Diagram of the Image-Guided Uncertainty-Aware Depth Completion Network (IU-DC). It shows the flow from an input camera image and sparse depth data through a series of convolutional layers. The network includes confidence refinement blocks, concatenation points, and up/down-sampling layers, leading to outputs of uncertainty and dense depth. Each block is annotated with kernel sizes and layer types, such as Conv2D and confidence-aware layers, illustrating the network's architecture for depth estimation.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Input confidence estimation network</title>
<p>In <xref ref-type="bibr" rid="B6">Eldesokey et al. (2020)</xref>, the initial confidence mask input into NCNN is learned from the sparse depth using a compact network. However, when the input data becomes sparser and more randomly distributed, confidence estimation may degrade because structure information, such as neighboring objects and sharp edges, is significantly missing (<xref ref-type="bibr" rid="B9">Hu et al., 2022</xref>). Sparse depth from VO is always calculated through the KLT sparse optical flow algorithm using corner features (<xref ref-type="bibr" rid="B23">Qin et al., 2018</xref>), which have a close correlation with the camera image. To compensate for the missing cues, we utilize both the image and sparse depth together to estimate the input confidence. In the Input Confidence Estimation Network, the image and sparse depth are first concatenated and then input into a compact UNet (<xref ref-type="bibr" rid="B25">Ronneberger et al., 2015</xref>) with a Softplus activation at the final layer to generate positive confidence estimations.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Image-guided normalized convolutional neural network</title>
<p>The motivation for adopting NCNN as our backbone lies in its inherent ability to explicitly model confidence propagation. Unlike conventional convolutional networks, NCNN operates on confidence-equipped signals and interpolates missing values in a mathematically principled manner. This capability is particularly valuable under extreme sparsity, such as in VO-derived depth inputs, where the lack of priors makes robust estimation difficult. Moreover, NCNN naturally facilitates uncertainty estimation through confidence propagation, which aligns well with our objective of producing uncertainty-aware depth maps.</p>
<p>Prior studies have shown that image features&#x2014;especially in regions such as reflective surfaces and occlusion boundaries&#x2014;often carry rich structural cues that can complement sparse or unreliable depth information (<xref ref-type="bibr" rid="B11">Kendall and Gal, 2017</xref>). These features serve as valuable priors for improving confidence estimation, particularly in scenarios where the input depth is extremely sparse and unevenly distributed, as in VO-based depth completion.</p>
<p>However, directly incorporating image features into NCNN is not straightforward. This is because normalized convolution enforces algebraic constraints that require a strict correspondence between the input signal and its associated confidence. Consequently, common practices in image-guided depth completion&#x2014;such as concatenating image features with the depth signal (<xref ref-type="bibr" rid="B29">Tao et al., 2022</xref>; <xref ref-type="bibr" rid="B22">Popovi&#x107; et al., 2021</xref>; <xref ref-type="bibr" rid="B5">Eldesokey et al., 2019</xref>)&#x2014;would violate these constraints and compromise the formulation of NCNN.</p>
<p>To leverage this potential without violating NCNN&#x2019;s constraints, we propose the Confidence Refine Block (CRB). The primary motivation behind CRB is to introduce image guidance indirectly, by refining the intermediate confidence maps produced by NCNN layers. Rather than altering the depth signal directly, CRB enhances the confidence propagation process using gated fusion mechanisms and attention-based refinement. This design preserves the integrity of normalized convolution while effectively injecting contextual priors from the image, leading to improved performance under extreme sparsity.</p>
<p>In this section, we first review the basic concepts of the NCNN, and then introduce the details of how the proposed CRB fits into NCNN.</p>
<sec id="s3-3-1">
<label>3.3.1</label>
<title>Normalized convolutional neural network</title>
<p>The fundamental idea of the normalized convolution is to project the confidence-equipped signal <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> to a new subspace spanned by a set of basis functions <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> using the signal with high confidence <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. Afterwards, the full signal is reconstructed from this subspace, where the less-confident areas are interpolated from their vicinity using a weighting kernel denoted as the applicability function <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. Thus the image of the signal under the subspace spanned by the basis is obtained as <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a matrix containing all the basis functions and <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a vector of coordinates. These coordinates can be estimated from a weighted least-squares problem (WLS) between the signal <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the basis <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B12">Knutsson and Westin, 1993</xref>):<disp-formula id="e1">
<mml:math id="m19">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mi>arg</mml:mi>
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:munder>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mi mathvariant="bold">r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mi mathvariant="bold">W</mml:mi>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>diag</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>diag</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>Finally, the WLS solution <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> can be used to estimate the signal:<disp-formula id="e2">
<mml:math id="m21">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>Instead of manually choosing the applicability function, the optimal <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in certain scenarios can be learned from NCNN (<xref ref-type="bibr" rid="B5">Eldesokey et al., 2019</xref>). This was achieved by using the na&#xef;ve basis which sets <inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e3">
<mml:math id="m24">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msubsup>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">cy</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x2329;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2299;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo stretchy="false">&#x232a;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x2329;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x232a;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a vector of ones, <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mo>&#x2299;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the Hadamard product, <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mo>.</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>.</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the scalar product, <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a scalar which is equivalent to the estimated value at the signal <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The superscripts <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> indicate the <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:math>
</inline-formula> layer of NCNN, respectively. The confidence is propagated as:<disp-formula id="e4">
<mml:math id="m34">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x2329;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x232a;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x2329;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x232a;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where the output from one layer is the input to the next layer. If the image feature is directly concatenated with the depth signal <inline-formula id="inf31">
<mml:math id="m35">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in (<xref ref-type="disp-formula" rid="e3">Equation 2</xref>) to construct a new signal <inline-formula id="inf32">
<mml:math id="m36">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, its dimensionality increases. Moreover, since <inline-formula id="inf33">
<mml:math id="m37">
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in (<xref ref-type="disp-formula" rid="e3">Equation 2</xref>) is the output of the previous NCNN layer and maintains a strict correspondence with each depth signal <inline-formula id="inf34">
<mml:math id="m38">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, its dimensions remain consistent. Consequently, the new signal <inline-formula id="inf35">
<mml:math id="m39">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>&#x2019;s dimensions do not match those of <inline-formula id="inf36">
<mml:math id="m40">
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, thereby preventing the application of the Hadamard operation. Another straightforward way to integrate the image feature with the depth signal <inline-formula id="inf37">
<mml:math id="m41">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is through convolution to form a new signal <inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Although this operation resolves the dimensional mismatch, it no longer guarantees the correspondence between <inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. These issues motivate us to design the Confidence Refine Block to integrate the image feature into NCNN without violating the signal-confidence correspondence.</p>
</sec>
<sec id="s3-3-2">
<label>3.3.2</label>
<title>Confidence refine block</title>
<p>We attempt to utilize the image features to refine, but not entirely alter, the confidence from the NCNN layers, as entirely altering it would severely violate the correspondence between confidence and signals. Since sparse depth from VO is primarily concentrated in high-texture areas (e.g., object contours) while being sparsely distributed in low-texture regions (e.g., flat walls), this disparity leads to varying contributions of image features to confidence estimation across different areas. Given these challenges, a vanilla convolution (a standard convolution operation with normalization and an activation function) that treats all inputs as valid values is not suitable.</p>
<p>Gated Convolution (<xref ref-type="bibr" rid="B37">Yu et al., 2019</xref>), which uses additional convolution kernels to generate gating masks for adaptive feature reweighting, is well-suited to our case. We modified the original form of gated convolution, which originally takes only one feature as input, to simultaneously consider both confidence and image features when calculating the gating signal, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. Although sophisticated modality fusion techniques have been proposed in recent years and can be adopted to fuse confidence features with image features (<xref ref-type="bibr" rid="B16">Liu et al., 2023</xref>), these methods often rely on complex convolution operations, which increase model complexity and go against our lightweight design. To address this issue, we adopt a straightforward yet effective strategy: first concatenating the two feature maps and encoding them with a lightweight convolution layer, then refining the fused representation using an efficient Self-Attention Module.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Detailed structure of proposed Confidence Refine Block.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g003.tif">
<alt-text content-type="machine-generated">Diagram of a Confidence Refine Block showing data flow from &#x22;img feat&#x22; through a convolution and ReLU function to &#x22;feat&#x22;. It undergoes a Self-Attention Module with Channel and Spatial components, followed by another convolution and Sigmoid function. The output combines with &#x22;conf&#x22; to produce &#x22;conf_refine&#x22;.</alt-text>
</graphic>
</fig>
<p>Denoting the confidence from the NCNN layer as <inline-formula id="inf41">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and image feature as <inline-formula id="inf42">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, they have the same size of <inline-formula id="inf43">
<mml:math id="m47">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> but with different channel numbers <inline-formula id="inf44">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf45">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. We first concatenate them into the tensor <inline-formula id="inf46">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> whose size is <inline-formula id="inf47">
<mml:math id="m51">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Then we use a <italic>Conv</italic> layer, which contains a 2D-convolution layer and a batch normalization layer with a leakyReLU activation layer (to avoid a substantial increase in the number of parameters during feature extraction while maintaining responsiveness to negative values), to encode the concatenated feature tensor:<disp-formula id="e5">
<mml:math id="m52">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">feat</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>However, <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf49">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> often exhibit large differences, posing a challenge for the encoding process to distinguish between weights from different features. For instance, at the lowest image resolution, <inline-formula id="inf50">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is two while <inline-formula id="inf51">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">img</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is 16. To address this limitation, we remap the feature map using a self-attention mechanism from (<xref ref-type="bibr" rid="B36">Woo et al., 2018</xref>). The feature map <inline-formula id="inf52">
<mml:math id="m57">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">feat</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is first inferred through a 1D channel attention map <inline-formula id="inf53">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">feat</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and then through a 2D spatial attention map <inline-formula id="inf54">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e6">
<mml:math id="m60">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2297;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2297;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf55">
<mml:math id="m61">
<mml:mrow>
<mml:mo>&#x2297;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the element-wise multiplication between two tensors. To calculate the final gating signal <inline-formula id="inf56">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we decode the remapped feature <inline-formula id="inf57">
<mml:math id="m63">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> using a <italic>Conv</italic> layer followed by a sigmoid activation layer. Finally, the refined confidence <inline-formula id="inf58">
<mml:math id="m64">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> can be obtained by applying element-wise multiplication:<disp-formula id="e7">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m66">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">conf</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>In IU-DC, we integrate one CRB after each confidence-aware down-sampling layer in the NCNN to learn the correlation between confidence and image at different resolutions, as shown in the upper section of <xref ref-type="fig" rid="F2">Figure 2</xref>. We also provide a visualization of the gating signals <inline-formula id="inf59">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="fig" rid="F4">Figure 4</xref>. It can be observed that <inline-formula id="inf60">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> effectively captures semantic features from the image to enhance the confidence map&#x2014;such as sharp edges and reflective surfaces. Consequently, regions with sparse input signals can be effectively interpolated. This visualization also unveils a hidden relationship between the input depth from VO and the internal propagation within NCNN&#x2014;depth signals located at more salient object contours tend to have a greater impact on the reconstruction process.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Visualization of the gating signal in the Confidence Refine Block. The first row presents the input image, the second row presents the input sparse depth from VO, and the third shows the corresponding gating signal.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g004.tif">
<alt-text content-type="machine-generated">A series of images depicting office equipment and environments, followed by three rows of corresponding thermal or sensor data visualizations. The bottom row highlights areas labeled as &#x22;edge&#x22; and &#x22;reflective surface.&#x22; Each visualization includes color gradients and dot patterns, indicating different surface properties or characteristics within the environment.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Model-based uncertainty estimation network</title>
<p>In NCNN, the confidence is propagated separately from the depth signal as shown in (<xref ref-type="disp-formula" rid="e4">Equation 3</xref>), which results in a lack of spatial information. For instance, neighbors estimated from larger depth values typically have higher uncertainty compared to those from smaller depth values, which cannot be distinguished by normalized convolution due to the fixed size of the applicability function <inline-formula id="inf61">
<mml:math id="m69">
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>To address this limitation, we assume that the dense depth output from NCNN forms an occupancy map in the camera frame and follows the probabilistic formulation of the Inverse Sensor Model (ISM) (<xref ref-type="bibr" rid="B1">Agha-Mohammadi et al., 2019</xref>). We integrate the confidence output from NCNN as a prior into this ISM-based probability model, thereby enabling the estimation of spatially-aware uncertainty. Furthermore, the entire module can be smoothly trained in an end-to-end manner using the loss function proposed in <xref ref-type="sec" rid="s3-5">Section 3.5</xref>.</p>
<p>The probability distribution of individual voxel <inline-formula id="inf62">
<mml:math id="m70">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> can be computed through Bayes&#x2019; rule in recursive manner as:<disp-formula id="e9">
<mml:math id="m71">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf63">
<mml:math id="m72">
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the measurement depth, <inline-formula id="inf64">
<mml:math id="m73">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the robot location, <inline-formula id="inf65">
<mml:math id="m74">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the iteration step. By integrating the ISM formulation,<disp-formula id="e10">
<mml:math id="m75">
<mml:mrow>
<mml:mtable class="align" columnalign="left">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x2248;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>which indicates the occupancy probability given a single measurement, into (<xref ref-type="disp-formula" rid="e9">Equation 4</xref>), and assuming that the robot&#x2019;s previous trajectory <inline-formula id="inf66">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> does not affect the map, we obtain:<disp-formula id="e11">
<mml:math id="m77">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>:</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>Assuming a binary occupancy model for voxels (i.e., each voxel is either occupied <inline-formula id="inf67">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> or free <inline-formula id="inf68">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) and considering only the occupancy map in the camera frame - where the occupancy probability is independent of the robot&#x2019;s motion, (<xref ref-type="disp-formula" rid="e11">Equation 5</xref>) can be simplified as:<disp-formula id="e12">
<mml:math id="m80">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>ISM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">prior</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf69">
<mml:math id="m81">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>ISM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> indicates the probability that the voxel is occupied given the ISM model with the measurement, and <inline-formula id="inf70">
<mml:math id="m82">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">prior</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> indicates the prior knowledge.</p>
<p>
<inline-formula id="inf71">
<mml:math id="m83">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>ISM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> can be approximated by (<xref ref-type="bibr" rid="B17">Loop et al., 2016</xref>):<disp-formula id="e13">
<mml:math id="m84">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>ISM</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>H</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where <inline-formula id="inf72">
<mml:math id="m85">
<mml:mrow>
<mml:mtext>H</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is a cubic curve function that maps the measurement into an occupancy probability and <inline-formula id="inf73">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a scalar. We initialize the <inline-formula id="inf74">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> using a clipping operator to constrain the estimated depth <inline-formula id="inf75">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> within an ideal range [in our case, between 0.1 and 8 in VOID (<xref ref-type="bibr" rid="B35">Wong et al., 2020</xref>)]. Next, we refine the scalar by leveraging the spatial dependencies of neighboring depth values, and finally, we map each input <inline-formula id="inf76">
<mml:math id="m89">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> into an occupancy probability by uniformly applying the function <inline-formula id="inf77">
<mml:math id="m90">
<mml:mrow>
<mml:mtext>H</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> across all inputs.</p>
<p>Typically, a fixed <inline-formula id="inf78">
<mml:math id="m91">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">prior</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for all voxels is assumed during deployment. However, due to various environmental factors, this assumption may not hold. We address this issue by adopting the confidence estimated from NCNN, which encodes both geometric and semantic features, as a strong prior. We construct the <inline-formula id="inf79">
<mml:math id="m92">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">prior</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for each voxel to represent the heteroscedastic uncertainty in the estimation by formulating WLS problem in (<xref ref-type="disp-formula" rid="e1">Equation 1</xref>) as a special case of the Generalized least-squares (GLS), which offers more flexibility in handling individual variances for each observation (<xref ref-type="bibr" rid="B6">Eldesokey et al., 2020</xref>):<disp-formula id="e14">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>GLS</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">y</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf80">
<mml:math id="m94">
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is chosen to ensure consistency with the solution in (<xref ref-type="disp-formula" rid="e3">Equation 2</xref>). Then, we utilize the GLS solution <inline-formula id="inf81">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>GLS</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to estimate the signal <inline-formula id="inf82">
<mml:math id="m96">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, and the uncertainty of <inline-formula id="inf83">
<mml:math id="m97">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> can be obtained as:<disp-formula id="e15">
<mml:math id="m98">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>cov</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mi>cov</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>GLS</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>cov</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold">r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>GLS</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msubsup>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="&#x27e8;" close="&#x27e9;">
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf84">
<mml:math id="m99">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the global variance for each signal. (<xref ref-type="disp-formula" rid="e15">Equation 8</xref>) indicates equal uncertainty for the entire neighborhood under a na&#xef;ve basis. Since each voxel grid corresponds to the signal center <inline-formula id="inf85">
<mml:math id="m100">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf86">
<mml:math id="m101">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>P</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">prior</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> can be represented as:<disp-formula id="e16">
<mml:math id="m102">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">prior</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>cov</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">last</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">last</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">last</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf87">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the stochastic noise variance, <inline-formula id="inf88">
<mml:math id="m104">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">last</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf89">
<mml:math id="m105">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">last</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represent the applicability function and confidence from the last NCNN layer respectively, as discussed in <xref ref-type="sec" rid="s3-3">Section 3.3</xref>. The noise variance <inline-formula id="inf90">
<mml:math id="m106">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be estimated from the confidence output of the last NCNN layer. By integrating (<xref ref-type="disp-formula" rid="e13">Equation 7</xref>) and (<xref ref-type="disp-formula" rid="e16">Equation 9</xref>) into (<xref ref-type="disp-formula" rid="e12">Equation 6</xref>) and extending it to all the pixels in the depth image, we can estimate the uncertainty using a mapping function <inline-formula id="inf91">
<mml:math id="m107">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> as follows:<disp-formula id="e17">
<mml:math id="m108">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">last</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>Our objective is to learn the mapping function <inline-formula id="inf92">
<mml:math id="m109">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> and the scalar <inline-formula id="inf93">
<mml:math id="m110">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in (<xref ref-type="disp-formula" rid="e17">Equation 10</xref>) by concatenating the depth estimation and confidence output from the last NCNN layer into a compact UNet (<xref ref-type="bibr" rid="B25">Ronnebe et al., 2015</xref>), as shown in the right section of <xref ref-type="fig" rid="F2">Figure 2</xref>. A direct comparison of the output uncertainty from the Model-based Uncertainty Estimation Network (ISM-net), the NCNN layer (NCNN), and the conventional ISM in (<xref ref-type="bibr" rid="B1">Agha-Mohammadi et al., 2019</xref>) (ISM) is presented in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Qualitative and quantitative evaluation of the effectiveness of the Model-based Uncertainty Estimation Network. The upper part of the figure presents the input image, sparse depth from VO, uncertainty estimation from the last NCNN layer, and the final uncertainty estimation from our network (ISM-net). The lower part of the figure illustrates the area under the sparsification error plots (<xref ref-type="bibr" rid="B10">Ilg et al., 2018</xref>), where curves closer to the <italic>oracle</italic> represent estimated uncertainty that more closely approximates the real error distribution. ISM-net significantly enhances the uncertainty estimation from NCNN and outperforms the ISM by a large margin.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g005.tif">
<alt-text content-type="machine-generated">Room interior photograph with a sofa and a small table. Three additional images show visual data: input from VO with color-coded points, and two uncertainty maps labeled for NCNN and ISM-net. A line graph below compares RMSE for Oracle, ISM-net, NCNN, and ISM across the percentage of removed pixels. Associated metrics are AUSE_ISM &#x3d; 0.69, AUSE_NCNN &#x3d; 0.37, and AUSE_ISM-net &#x3d; 0.14.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-5">
<label>3.5</label>
<title>Loss function and training strategy</title>
<p>To achieve the different functions of each module, we require a loss function that enables training the proposed network with uncertainty awareness. Following (<xref ref-type="bibr" rid="B6">Eldesokey et al., 2020</xref>) we assume a univariate distribution of each estimated signal under na&#xef;ve basis <inline-formula id="inf94">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf95">
<mml:math id="m112">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the depth estimation and <inline-formula id="inf96">
<mml:math id="m113">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the uncertainty estimation from IU-DC. The least squares solution in (<xref ref-type="disp-formula" rid="e3">Equation 2</xref>) can be formulated as a maximum likelihood problem of a Gaussian error model. Then the objective is defined as minimizing the negative log likelihood:<disp-formula id="e18">
<mml:math id="m114">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf97">
<mml:math id="m115">
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the network parameters.</p>
<p>During the training of our network, we find that initializing the network parameters randomly and training with the loss function <inline-formula id="inf98">
<mml:math id="m116">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> does not guarantee stable convergence. We assume that in the initial training stages, excessively large uncertainty estimations dominate the loss, causing the depth estimation to overcompensate. To address this issue, we adopt a multi-stage training strategy. Initially, we train the network with L2 loss until the network parameters stabilize. Subsequently, we fine-tune the uncertainty output using <inline-formula id="inf99">
<mml:math id="m117">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Evaluation on NYU and KITTI datasets</title>
<p>We use the standard error metrics of the KITTI depth completion challenge (<xref ref-type="bibr" rid="B32">Uhrig et al., 2017</xref>): the Root Mean Square Error (RMSE <italic>m</italic>), the Mean Absolute Error (MAE <italic>m</italic>), the Root Mean Squared Error of the Inverse depth (iRMSE <italic>1/km</italic>), the Mean Absolute Error of the Inverse depth (iMAE <italic>1/km</italic>), and the area under sparsification error plots (AUSE) (<xref ref-type="bibr" rid="B10">Ilg et al., 2018</xref>) as a measure of the accuracy of the uncertainty.</p>
<sec id="s4-1">
<label>4.1</label>
<title>Datasets and setup</title>
<p>
<italic>Outdoor</italic>: KITTI dataset (<xref ref-type="bibr" rid="B32">Uhrig et al., 2017</xref>) is a large outdoor autonomous driving dataset. We use KITTI depth completion dataset for evaluation, where the training set contains 86k frames, validation set contains 7k frames, and the test set contains 1k frames. The original input depth images have 5% of the pixels available. To simulate the input sparsity of VO, we randomly sample 1k pixels from the raw input depth image, representing approximately 0.2% of the pixels available.</p>
<p>
<italic>Indoor</italic>: NYU dataset (<xref ref-type="bibr" rid="B27">Silberman et al., 2012</xref>) is an RGB-D dataset for indoor scenes, captured with a Microsoft Kinect. We use the official split with roughly 48k RGB-D pairs for training and 654 pairs for testing. We randomly sample 500 pixels and 200 pixels from the ground truth depth image, representing available pixels of 0.7% and 0.2%, respectively.</p>
<p>
<italic>Setup</italic>: We implement all the networks in PyTorch and train them using the Adam optimizer with an initial learning rate of 0.001 that is decayed with a factor of <inline-formula id="inf100">
<mml:math id="m118">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> every 6 epochs, following the training strategy outlined in <xref ref-type="sec" rid="s3-5">Section 3.5</xref>. All datasets are preprocessed using the same cropping and data augmentation procedures as (<xref ref-type="bibr" rid="B6">Eldesokey et al., 2020</xref>).</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Comparison to the SOTA</title>
<p>
<italic>Baselines:</italic> We propose to obtain dense depth and uncertainty simultaneously, while also considering a lightweight network architecture with low memory consumption suitable for deployment on mobile robots. As baselines, we selected three state-of-the-art networks that meet the requirements: (i) NCONV-AERIAL (<xref ref-type="bibr" rid="B31">Teixeira et al., 2020</xref>) is an image guided approach that incorporates NCNN and fuses its output with images to estimate the final depth and uncertainty. (ii) S2D (<xref ref-type="bibr" rid="B29">Tao et al., 2022</xref>) is an image-guided approach that concatenates the image and sparse depth image in one encoder and outputs dense depth image and uncertainty from two separate decoders. (iii) PNCNN (<xref ref-type="bibr" rid="B6">Eldesokey et al., 2020</xref>) is an unguided approach that also utilizes NCNN as its backbone and has a network structure similar to our approach. We train PNCNN using their open-source code. For S2D, we follow their implementation as described in their paper since they did not release their code. As for NCONV-AERIAL, we use the best results reported in their paper and calculate AUSE using their open-source model.</p>
<p>We initially test all the methods on the KITTI test set with raw input sparsity and the NYU test set with 500 samples as input. We report the accuracy of depth and uncertainty estimation, as well as the number of network parameters in <xref ref-type="table" rid="T1">Table 1</xref>. S2D and NCONV-AERIAL demonstrate superior accuracy in depth completion compared to PNCNN on the KITTI, attributed to their integration of image features. However, on the NYU dataset where the input sparsity increases to <inline-formula id="inf101">
<mml:math id="m119">
<mml:mrow>
<mml:mn>0.7</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the integration of image features fails to enhance the depth completion performance, even underperforming compared to the unguided approach. Furthermore, both S2D and NCONV-AERIAL exhibit significantly higher AUSE, indicating that the uncertainty output from the networks is not tightly correlated with the actual error distribution. Our proposed IU-DC outperforms PNCNN in depth estimation accuracy and maintains accurate uncertainty estimation across both datasets. This indicates that our modifications enhance overall performance without compromising the uncertainty consistency.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Depth completion results on NYU and KITTI datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Algorithm</th>
<th align="center">RMSE</th>
<th align="center">MAE</th>
<th align="center">iRMSE</th>
<th align="center">iMAE</th>
<th align="center">AUSE<inline-formula id="inf102">
<mml:math id="m120">
<mml:mrow>
<mml:mi>&#x2193;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="6" align="left">KITTI test set</td>
</tr>
<tr>
<td align="left">
<italic>NCONV-AERIAL</italic>
</td>
<td align="center">1.01</td>
<td align="center">0.26</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2014;</td>
<td align="center">0.39</td>
</tr>
<tr>
<td align="left">
<italic>S2D</italic>
</td>
<td align="center">1.14</td>
<td align="center">0.40</td>
<td align="center">3.97</td>
<td align="center">1.92</td>
<td align="center">0.13</td>
</tr>
<tr>
<td align="left">
<italic>PNCNN</italic>
</td>
<td align="center">1.23</td>
<td align="center">0.28</td>
<td align="center">4.46</td>
<td align="center">1.07</td>
<td align="center">
<bold>0.05</bold>
</td>
</tr>
<tr>
<td align="left">
<italic>IU-DC</italic>
</td>
<td align="center">
<bold>0.94</bold>
</td>
<td align="center">
<bold>0.23</bold>
</td>
<td align="center">
<bold>2.72</bold>
</td>
<td align="center">
<bold>0.97</bold>
</td>
<td align="center">0.06</td>
</tr>
<tr>
<td colspan="6" align="left">KITTI-1000 samples</td>
</tr>
<tr>
<td align="left">
<italic>PNCNN</italic>
</td>
<td align="center">2.41</td>
<td align="center">0.70</td>
<td align="center">70.08</td>
<td align="center">2.56</td>
<td align="center">
<bold>0.05</bold>
</td>
</tr>
<tr>
<td align="left">
<italic>IU-DC</italic>
</td>
<td align="center">
<bold>1.59</bold>
</td>
<td align="center">
<bold>0.50</bold>
</td>
<td align="center">
<bold>4.87</bold>
</td>
<td align="center">
<bold>2.02</bold>
</td>
<td align="center">
<bold>0.05</bold>
</td>
</tr>
<tr>
<td colspan="6" align="left">NYU-500 samples</td>
</tr>
<tr>
<td align="left">
<italic>NCONV-AERIAL</italic>
</td>
<td align="center">0.22</td>
<td align="center">0.11</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2014;</td>
<td align="center">0.24</td>
</tr>
<tr>
<td align="left">
<italic>S2D</italic>
</td>
<td align="center">0.22</td>
<td align="center">0.16</td>
<td align="center">24.29</td>
<td align="center">16.90</td>
<td align="center">0.30</td>
</tr>
<tr>
<td align="left">
<italic>PNCNN</italic>
</td>
<td align="center">0.18</td>
<td align="center">0.07</td>
<td align="center">24.38</td>
<td align="center">8.77</td>
<td align="center">
<bold>0.06</bold>
</td>
</tr>
<tr>
<td align="left">
<italic>IU-DC</italic>
</td>
<td align="center">
<bold>0.11</bold>
</td>
<td align="center">
<bold>0.04</bold>
</td>
<td align="center">
<bold>14.28</bold>
</td>
<td align="center">
<bold>5.13</bold>
</td>
<td align="center">
<bold>0.06</bold>
</td>
</tr>
<tr>
<td colspan="6" align="left">NYU-200 samples</td>
</tr>
<tr>
<td align="left">
<italic>PNCNN</italic>
</td>
<td align="center">0.24</td>
<td align="center">0.10</td>
<td align="center">56.78</td>
<td align="center">13.82</td>
<td align="center">0.09</td>
</tr>
<tr>
<td align="left">
<italic>IU-DC</italic>
</td>
<td align="center">
<bold>0.16</bold>
</td>
<td align="center">
<bold>0.06</bold>
</td>
<td align="center">
<bold>20.71</bold>
</td>
<td align="center">
<bold>8.36</bold>
</td>
<td align="center">
<bold>0.09</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The number of parameters (&#x23;P) for each model is as follows: NCONV-AERIAL: 980 K; S2D: 12 M; PNCNN: 668 K; IU-DC: 689 K. Bold numbers indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To simulate the input data from VO with a sparsity of approximately 0.2%, we further test IU-DC and PNCNN on the KITTI dataset with 1,000 samples and the NYU dataset with 200 samples. The iRMSE of PNCNN significantly degraded, being up to 15 times greater than IU-DC in KITTI, indicating the presence of a large number of outliers. This result suggests that when the sparsity becomes extremely high, the neighborhood of the signal cannot be correctly estimated due to the limited receptive field when depth data is the only input source. In contrast, IU-DC achieves robust performance even with this extreme sparsity of input by enriching information around the signal through the integration of image features. This makes IU-DC more suitable for deployment in VO scenarios. To qualitatively observe the results, we present the depth maps estimated from PNCNN and IU-DC on the KITTI dataset in <xref ref-type="fig" rid="F6">Figure 6</xref>. IU-DC captures clearer edges and more detailed contours even when the input sparsity increases significantly.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Visualization of depth completion results on the KITTI dataset, where <inline-formula id="inf103">
<mml:math id="m121">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf104">
<mml:math id="m122">
<mml:mrow>
<mml:mn>0.2</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the percentage of available pixels in the depth image. PNCNN performs well under low sparsity but exhibits blurriness on the contours of objects and fails to capture most information under high sparsity. In contrast, IU-DC shows more robust performance in both cases.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g006.tif">
<alt-text content-type="machine-generated">Comparison of image processing techniques. The top row shows a street scene in color and its ground truth (GT). The middle row displays IU-DC results highlighting areas with red rectangles. The bottom row presents PNCNN results with similar red-highlighted areas, noting a smaller error percentage.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Dense mapping from sparse visual odometry</title>
<p>While <xref ref-type="sec" rid="s4">Section 4</xref> presents evaluations on standard benchmark datasets using synthetically downsampled sparse inputs, in this section we further evaluate IU-DC in real-world visual odometry (VO) scenarios, where the input sparsity and distribution better reflect robot deployment conditions. We additionally assess the impact of uncertainty-aware depth completion on mapping performance.</p>
<sec id="s5-1">
<label>5.1</label>
<title>Evaluation with VO input</title>
<sec id="s5-1-1">
<label>5.1.1</label>
<title>Dataset</title>
<p>VOID (<xref ref-type="bibr" rid="B35">Wong et al., 2020</xref>) provides real-world data collected using an Intel RealSense D435i camera and the VIO frontend (<xref ref-type="bibr" rid="B7">Fei et al., 2019</xref>), where metric pose and structure estimation are performed in a gravity-aligned and scaled reference frame using an inertial measurement unit (IMU). The dataset is more realistic in that no sensor measures depth at random locations. VOID contains <inline-formula id="inf105">
<mml:math id="m123">
<mml:mrow>
<mml:mn>47</mml:mn>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> training and 800 test samples, with varying levels of input depth density. We adopt 500 points, corresponding to <inline-formula id="inf106">
<mml:math id="m124">
<mml:mrow>
<mml:mn>0.15</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of the pixels in the depth image, and follow the published train-test split for evaluation. It is worth noting that our method can generalize to different forms of VO or VIO, as long as the front-end provides metric-scale sparse depth.</p>
</sec>
<sec id="s5-1-2">
<label>5.1.2</label>
<title>Comparison to the SOTA</title>
<p>As baselines, we select two methods that are designed to complete sparse depth from VO, similar to ours but without uncertainty estimation: (i) VOICED (<xref ref-type="bibr" rid="B35">Wong et al., 2020</xref>) is an unsupervised method that is among the first to tackle input from VO. (ii) VI-Depth (<xref ref-type="bibr" rid="B34">Wofk et al., 2023</xref>) integrates monocular depth estimation with VO to produce dense depth estimates with metric scale. Note that the open-source VI-Depth model was trained with a resolution of <inline-formula id="inf107">
<mml:math id="m125">
<mml:mrow>
<mml:mn>265</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>265</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. We also report the depth completion results at the raw resolution <inline-formula id="inf108">
<mml:math id="m126">
<mml:mrow>
<mml:mn>480</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>640</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in the VOID. The depth completion results are summarized in <xref ref-type="table" rid="T2">Table 2</xref>. The depth estimated by IU-DC demonstrates higher accuracy compared to VOICED and VI-Depth<inline-formula id="inf109">
<mml:math id="m127">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>480</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>640</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. However, IU-DC underperforms relative to VI-Depth<inline-formula id="inf110">
<mml:math id="m128">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>265</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>265</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, which is the resolution used for training the external monocular depth estimation network in VI-Depth. To demonstrate the effectiveness of accurate uncertainty estimation from the network, we further filter the top <inline-formula id="inf111">
<mml:math id="m129">
<mml:mrow>
<mml:mn>20</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of the most uncertain depth values in the depth image and evaluate its accuracy. This is denoted as IU-DC(filtered) in <xref ref-type="table" rid="T2">Table 2</xref>. After applying the uncertainty-aware filtering, the depth accuracy improves significantly and surpasses other SOTAs by a large margin, e.g., VOICED by <inline-formula id="inf112">
<mml:math id="m130">
<mml:mrow>
<mml:mn>48</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and VI-Depth <inline-formula id="inf113">
<mml:math id="m131">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>265</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>265</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> by <inline-formula id="inf114">
<mml:math id="m132">
<mml:mrow>
<mml:mn>32</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Depth completion results on VOID dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Method</th>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">iMAE</th>
<th align="center">iRMSE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<italic>VOICED</italic>
</td>
<td align="center">124.11</td>
<td align="center">217.43</td>
<td align="center">66.95</td>
<td align="center">121.23</td>
</tr>
<tr>
<td align="left">
<italic>VI-Depth</italic> <inline-formula id="inf115">
<mml:math id="m133">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>265</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>265</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">94.81</td>
<td align="center">164.36</td>
<td align="center">43.19</td>
<td align="center">
<bold>69.25</bold>
</td>
</tr>
<tr>
<td align="left">
<italic>VI-Depth</italic> <inline-formula id="inf116">
<mml:math id="m134">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>480</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>640</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">129.95</td>
<td align="center">210.39</td>
<td align="center">92.23</td>
<td align="center">61.68</td>
</tr>
<tr>
<td align="left">
<italic>IU-DC (raw)</italic>
</td>
<td align="center">102.04</td>
<td align="center">198.29</td>
<td align="center">54.66</td>
<td align="center">103.01</td>
</tr>
<tr>
<td align="left">
<italic>IU-DC (filtered)</italic>
</td>
<td align="center">
<bold>62.61</bold>
</td>
<td align="center">
<bold>111.32</bold>
</td>
<td align="center">
<bold>37.65</bold>
</td>
<td align="center">69.86</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="Tfn1">
<label>
<sup>a</sup>
</label>
<p>
<inline-formula id="inf117">
<mml:math id="m135">
<mml:mrow>
<mml:mn>265</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>265</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf118">
<mml:math id="m136">
<mml:mrow>
<mml:mn>480</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>640</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> denote the input resolutions. Bold numbers indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5-1-3">
<label>5.1.3</label>
<title>Runtime analysis and memory consumption</title>
<p>Runtime and memory consumption are both crucial for deployment on mobile robots to achieve real-time performance. IU-DC exhibits significantly lower parameter counts <inline-formula id="inf119">
<mml:math id="m137">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0.6</mml:mn>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> compared to VOICED <inline-formula id="inf120">
<mml:math id="m138">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>6.4</mml:mn>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and VI-Depth <inline-formula id="inf121">
<mml:math id="m139">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>21</mml:mn>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and only occupies 2.76 MB of memory. We further tested the runtime on NVIDIA GeForce RTX 3050 and NVIDIA Xavier NX with an input resolution of <inline-formula id="inf122">
<mml:math id="m140">
<mml:mrow>
<mml:mn>480</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>640</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. IU-DC runs at 17.5 FPS on the NVIDIA GeForce RTX 3050 and 5.5 FPS on the NVIDIA Xavier NX, while VI-Depth runs at 9 FPS and 3.5 FPS, respectively. IU-DC is nearly twice as fast as VI-Depth. We also tested IU-DC with a lower resolution of <inline-formula id="inf123">
<mml:math id="m141">
<mml:mrow>
<mml:mn>384</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>384</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, achieving 10 FPS on the NVIDIA Xavier NX, which guarantees the update rate for most keyframes in VO. The runtime of IU-DC can be further reduced with engineering enhancements and more advanced computational hardware, e.g., Jetson AGX Orin.</p>
</sec>
</sec>
<sec id="s5-2">
<label>5.2</label>
<title>Ablation study</title>
<sec id="s5-2-1">
<label>5.2.1</label>
<title>Effect of confidence refine</title>
<p>We first analyze the effect of our proposed Confidence Refinement Block (CRB) by introducing three baselines: (i) integrating image features using vanilla convolution instead of gated convolution (<italic>-w/o gated convolution</italic>); (ii) employing gated convolution without the self-attention module (<italic>-w/o self-attention</italic>); and (iii) using image features to refine the depth signal instead of confidence (<italic>-w/depth refine</italic>). The results are presented in <xref ref-type="table" rid="T3">Table 3</xref>. Our full model outperforms all baselines across all evaluation metrics, validating that gated convolution extracts more reliable features than vanilla convolution, thereby leading to improved accuracy in depth completion. Furthermore, integrating the self-attention module further enhances performance. We also find that removing the self-attention module in CRB significantly deteriorates the accuracy of uncertainty estimation, increasing the AUSE from 0.14 to 0.49. Moreover, refining confidence yields better results than the depth signal, highlighting the strong correlation between confidence in NCNN layers and images, which supports our motivation for designing CRB.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Ablation study on VOID dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="right">Model</th>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">iMAE</th>
<th align="center">iRMSE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="right">Full</td>
<td align="center">
<bold>102.04</bold>
</td>
<td align="center">
<bold>198.29</bold>
</td>
<td align="center">
<bold>54.66</bold>
</td>
<td align="center">
<bold>103.01</bold>
</td>
</tr>
<tr>
<td colspan="5" align="left">Confidence Refine</td>
</tr>
<tr>
<td align="right">
<italic>- w/o gated convolution</italic>
</td>
<td align="center">117.17</td>
<td align="center">200.18</td>
<td align="center">64.62</td>
<td align="center">112.18</td>
</tr>
<tr>
<td align="right">
<italic>- w/o self-attention</italic>
</td>
<td align="center">104.89</td>
<td align="center">203.85</td>
<td align="center">55.41</td>
<td align="center">107.67</td>
</tr>
<tr>
<td align="right">
<italic>- w/depth refine</italic>
</td>
<td align="center">119.12</td>
<td align="center">210.15</td>
<td align="center">66.72</td>
<td align="center">115.43</td>
</tr>
<tr>
<td colspan="5" align="left">ISM Network</td>
</tr>
<tr>
<td align="right">
<italic>- w/o ISM model</italic>
</td>
<td align="center">144.03</td>
<td align="center">244.00</td>
<td align="center">73.14</td>
<td align="center">123.60</td>
</tr>
<tr>
<td align="right">w/VO confidence init</td>
<td align="center">196.44</td>
<td align="center">448.85</td>
<td align="center">921.34</td>
<td align="center">2696.48</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold numbers indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>We further assess whether the multi-resolution integration of CRBs benefits the depth completion process and uncertainty estimation. The results are shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. The major difference between VOID and NYU lies in the input signal distribution. In NYU, the inputs are randomly sampled from the ground truth, whereas in VOID, the inputs are generated from the VO frontend. When integrating more CRBs during inference, we observe an improvement in both depth and uncertainty estimation accuracy. This indicates that CRBs enhance the depth completion process, with their effectiveness becoming more pronounced through multi-resolution integration. It&#x2019;s worth noting that when no CRBs are integrated into the network, the RMSE and MAE increase by <inline-formula id="inf124">
<mml:math id="m142">
<mml:mrow>
<mml:mn>132</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf125">
<mml:math id="m143">
<mml:mrow>
<mml:mn>208</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in NYU, but by <inline-formula id="inf126">
<mml:math id="m144">
<mml:mrow>
<mml:mn>924</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf127">
<mml:math id="m145">
<mml:mrow>
<mml:mn>1716</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in VOID. We attribute the substantial deterioration in VOID to the uneven distribution of sparse depth from VO, which validates the crucial role of CRBs in VO depth completion tasks.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Accuracy of multi-resolution integration of Confidence Refinement Blocks (one per resolution). <bold>(a)</bold> Results on the NYU dataset with 200 samples. <bold>(b)</bold> Results on the VOID dataset.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g007.tif">
<alt-text content-type="machine-generated">Two line graphs comparing RMSE, AUSE, and MAE metrics across different numbers of confidence refine blocks. Graph (a) shows RMSE, AUSE, and MAE decreasing steadily with each block. Graph (b) shows RMSE and MAE starting high and decreasing rapidly, while AUSE decreases more gradually.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s5-2-2">
<label>5.2.2</label>
<title>Effect of ISM model in uncertainty estimation</title>
<p>IU-DC follows the same uncertainty propagation method as NCNN during the depth completion process but is distinct in its output uncertainty by integrating a map probability density function with the ISM. We validate the role of the ISM model by training a network that only utilizes the confidence from the last NCNN layer for uncertainty estimation, the same method as in PNCNN. We evaluate the accuracy of uncertainty for different ranges of depth values and report the error bars in <xref ref-type="fig" rid="F8">Figure 8</xref>. By incorporating the ISM model, the uncertainty estimation improves across different ranges of depth signals, aligning with our motivation discussed in <xref ref-type="sec" rid="s3-4">Section 3.4</xref> and confirming that our approach yields more spatially accurate uncertainty outputs. This improvement is consistent whether the input comes from random sampling or VO, validating that the ISM model is robust to the type of input signal and generalizes well across different environments.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Accuracy of uncertainty estimation with and without the ISM model across different depth value ranges. <bold>(a)</bold> Results on the NYU dataset with 200 samples. <bold>(b)</bold> Results on the VOID dataset. The horizontal axis represents the percentage of top depth values in the depth image, e.g., 40 represents the top <inline-formula id="inf128">
<mml:math id="m146">
<mml:mrow>
<mml:mn>40</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> of values in the depth image.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g008.tif">
<alt-text content-type="machine-generated">Bar charts comparing AUSE values with and without ISM at various percentage depths. Panel (a) shows values increasing from 0.08 to 0.18 for both conditions. Panel (b) shows values from 0.15 to 0.23. Red bars indicate without ISM and blue bars with ISM, with error bars on each bar.</alt-text>
</graphic>
</fig>
<p>Additionally, we report the depth completion accuracy in <xref ref-type="table" rid="T3">Table 3</xref> (denoted as <italic>-w/o ISM model</italic>). The results validate that integrating the ISM model into the uncertainty estimation network not only improves uncertainty estimation but also benefits the network training, enabling it to converge to a more accurate depth estimation model.</p>
</sec>
<sec id="s5-2-3">
<label>5.2.3</label>
<title>Does VO uncertainty aid in depth completion?</title>
<p>We are also interested in whether the uncertainty estimated from VO can benefit the depth completion process. Since the VOID dataset does not provide uncertainty estimation for each input point, we adopt the uniform uncertainty estimation method from (<xref ref-type="bibr" rid="B38">Zhang and Ye, 2020</xref>) to compute the initial uncertainty for the input sparse depth. This estimated uncertainty is then fed into the first NCNN layer to train a baseline model. We report the results in <xref ref-type="table" rid="T3">Table 3</xref> (denoted as <italic>w/VO confidence init</italic>). After model convergence, the accuracy of the estimated depth significantly drops, with a large iRMSE indicating a high number of outliers. These observations suggest that directly incorporating the uncertainty from a model-based VO does not align well with the NCNN. We believe that employing a deep VO framework and training in an end-to-end manner may yield better results.</p>
</sec>
</sec>
<sec id="s5-3">
<label>5.3</label>
<title>Evaluation of mapping performance</title>
<p>We adopt RTAB-Map (<xref ref-type="bibr" rid="B13">Labb&#xe9; and Michaud, 2019</xref>) as the mapping module and utilize a voxel grid resolution of 0.01 m to store the map for each sequence. We use either the ground truth pose (in VOID) or the pose from the V-SLAM algorithm (in our own sequence) in the mapping module to fairly assess the impact of depth estimation from different methods on mapping performance. The ground truth is generated using the ground truth depth with offline post-processing. Following (<xref ref-type="bibr" rid="B28">Stathoulopoulos et al., 2023</xref>), we use CloudCompare, an open-source point cloud processing software, to first align each map and then calculate the distance between the two point clouds (Mean Dist. <inline-formula id="inf129">
<mml:math id="m147">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>) and the standard deviation (Var.) as error metrics for mapping.</p>
<sec id="s5-3-1">
<label>5.3.1</label>
<title>VOID</title>
<p>We evaluated the mapping performance of IU-DC and VI-Depth on three distinct sequences from the VOID dataset under two levels of input sparsity: <inline-formula id="inf130">
<mml:math id="m148">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>0.15</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf131">
<mml:math id="m149">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>0.05</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The results are summarized in <xref ref-type="table" rid="T4">Table 4</xref>. Under normal sparsity <inline-formula id="inf132">
<mml:math id="m150">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>0.15</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, IU-DC outperforms VI-Depth by a factor of almost two in the error metrics across all sequences, with accuracy improvements ranging from <inline-formula id="inf133">
<mml:math id="m151">
<mml:mrow>
<mml:mn>27</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf134">
<mml:math id="m152">
<mml:mrow>
<mml:mn>52</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and nearly half the variance. Moreover, when the input sparsity is further reduced to <inline-formula id="inf135">
<mml:math id="m153">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>0.05</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2014;representing extreme scenarios where the robot may operate in low-texture regions&#x2014;IU-DC continues to significantly outperform VI-Depth. These findings indicate that, despite IU-DC being only <inline-formula id="inf136">
<mml:math id="m154">
<mml:mrow>
<mml:mn>2.8</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> the size of VI-Depth, it is better suited for robotic mapping tasks, as it facilitates the generation of more precise spatial maps through its uncertainty-aware approach.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Mapping accuracy in VOID sequences.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Metric</th>
<th colspan="2" align="center">
<italic>desktop</italic>
</th>
<th colspan="2" align="center">
<italic>office</italic>
</th>
<th colspan="2" align="center">
<italic>visionlab</italic>
</th>
</tr>
<tr>
<th align="center">IU-DC</th>
<th align="center">VI-Depth</th>
<th align="center">IU-DC</th>
<th align="center">VI-Depth</th>
<th align="center">IU-DC</th>
<th align="center">VI-Depth</th>
</tr>
</thead>
<tbody valign="top">
<tr style="background-color:#CCCCCC">
<td colspan="7" align="left">
<inline-formula id="inf137">
<mml:math id="m155">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>0.15%</italic>
</td>
</tr>
<tr>
<td align="left">Mean Dist.</td>
<td align="center">
<bold>0.08</bold>
</td>
<td align="center">0.11</td>
<td align="center">
<bold>0.09</bold>
</td>
<td align="center">0.19</td>
<td align="center">
<bold>0.11</bold>
</td>
<td align="center">0.22</td>
</tr>
<tr>
<td align="left">Var.</td>
<td align="center">
<bold>0.03</bold>
</td>
<td align="center">0.05</td>
<td align="center">
<bold>0.06</bold>
</td>
<td align="center">0.14</td>
<td align="center">
<bold>0.06</bold>
</td>
<td align="center">0.17</td>
</tr>
<tr style="background-color:#CCCCCC">
<td colspan="7" align="left">
<inline-formula id="inf138">
<mml:math id="m156">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>0.05%</italic>
</td>
</tr>
<tr>
<td align="left">Mean Dist.</td>
<td align="center">
<bold>0.09</bold>
</td>
<td align="center">0.20</td>
<td align="center">
<bold>0.10</bold>
</td>
<td align="center">0.23</td>
<td align="center">0.18</td>
<td align="center">0.23</td>
</tr>
<tr>
<td align="left">Var.</td>
<td align="center">
<bold>0.04</bold>
</td>
<td align="center">0.17</td>
<td align="center">
<bold>0.09</bold>
</td>
<td align="center">0.21</td>
<td align="center">
<bold>0.15</bold>
</td>
<td align="center">
<bold>0.17</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="Tfn2">
<label>
<sup>a</sup>
</label>
<p>
<inline-formula id="inf139">
<mml:math id="m157">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>0.15</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf140">
<mml:math id="m158">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>0.05</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicate input sparsity. Bold numbers indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5-3-2">
<label>5.3.2</label>
<title>Long trajectory sequence in the study office</title>
<p>Most sequences in the VOID dataset are recorded in constrained areas with short trajectories. To evaluate our method in a more open environment with longer trajectories, which are more common scenarios encountered by mobile robots, we conducted an experiment in a large student office using a handheld Intel RealSense D435i depth camera. We obtain the pose using the open-source V-SLAM algorithm provided by RealSense and use depth images from the D435i as ground truth (same as in the VOID dataset) for both network training and inference. To generate the input for the network, we follow the method described in (<xref ref-type="bibr" rid="B34">Wofk et al., 2023</xref>), running the VINS-Mono feature tracker front-end (<xref ref-type="bibr" rid="B23">Qin et al., 2018</xref>) to obtain sparse feature locations and then sampling ground truth depth at those locations. Both the camera image and depth image have a resolution of <inline-formula id="inf141">
<mml:math id="m159">
<mml:mrow>
<mml:mn>480</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>640</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, with approximately 500 points used in the input depth image.</p>
<p>The visualization of the map generated by sparse VO, Droid-SLAM (dense VO) (<xref ref-type="bibr" rid="B30">Teed and Deng, 2021</xref>), our method, and the ground truth is presented in <xref ref-type="fig" rid="F9">Figure 9</xref>. Our method achieves significant completeness with <inline-formula id="inf142">
<mml:math id="m160">
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> reconstruction volumes compared to the Sparse Map and captures detailed structural information of the environment with high accuracy. Although Droid-SLAM shows improvement in completeness over sparse VO, there is still considerable missing spatial information. To further validate the impact of the uncertainty filtering approach, we conduct a comparative study using the raw depth output from IU-DC in the mapping module. We classify volumes whose distance to the ground truth is within 0.05 m as correct volumes, while the rest are classified as false volumes. From the results in <xref ref-type="table" rid="T5">Table 5</xref>, the map constructed using filtered depth shows higher accuracy and greater consistency with the ground truth. Although using raw depth increases reconstruction volumes, it has a <inline-formula id="inf143">
<mml:math id="m161">
<mml:mrow>
<mml:mn>28</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> error rate, whereas filtered depth only has <inline-formula id="inf144">
<mml:math id="m162">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. A video demonstrating the real-time mapping performance using our proposed approach can be found in the <xref ref-type="sec" rid="s13">Supplementary Materials</xref>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Evaluation of the Mapping Performance. The left part presents the maps generated by sparse VO, Droid-SLAM, and our method, while the right part shows three zoomed-in sections of our map with the associated error distribution. The Dense Map (ours) covers <inline-formula id="inf145">
<mml:math id="m163">
<mml:mrow>
<mml:mn>78</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> volumes of the Ground Truth, whereas the Sparse Map covers only <inline-formula id="inf146">
<mml:math id="m164">
<mml:mrow>
<mml:mn>1.5</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g009.tif">
<alt-text content-type="machine-generated">Three sets of maps are shown. On the left, a sparse map and two dense maps&#x2014;Droid-SLAM and a new approach&#x2014;are presented. The right displays comparative error maps with a color scalar from blue (low) to red (high), showing mapping errors against ground truth for three scenes. Each scene compares the new approach with ground truth, highlighting differences.</alt-text>
</graphic>
</fig>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Mapping accuracy in study office.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Input</th>
<th align="center">Correct vol</th>
<th align="center">False vol</th>
<th align="center">Mean dist</th>
<th align="center">Var</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<italic>raw_depth</italic>
</td>
<td align="center">9.55</td>
<td align="center">3.67</td>
<td align="center">0.11</td>
<td align="center">0.09</td>
</tr>
<tr>
<td align="left">
<italic>filtered_depth</italic>
</td>
<td align="center">7.05</td>
<td align="center">0.87</td>
<td align="center">0.07</td>
<td align="center">0.04</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="Tfn3">
<label>
<sup>a</sup>
</label>
<p>Vol. is in <inline-formula id="inf147">
<mml:math id="m165">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5-3-3">
<label>5.3.3</label>
<title>Application in map alignment</title>
<p>We recorded two trajectories, each mapping half of the office with a small overlapping area, to simulate the case of a two-robot system. First, we align the two maps using the ground truth pose and then introduce random translations and rotations to simulate potential false relative poses in real-world scenarios, which is presented in <xref ref-type="fig" rid="F10">Figure 10A</xref>. We align the two local maps using the transformation matrix calculated by GICP (<xref ref-type="bibr" rid="B26">Segal et al., 2009</xref>), utilizing the same overlapping region of the sparse map generated by VO, as shown in <xref ref-type="fig" rid="F10">Figure 10B</xref>, and the dense map generated by our method, as shown in <xref ref-type="fig" rid="F10">Figure 10C</xref>. The results show that directly using the sparse map results in false alignment due to the map being too sparse, lacking reliable features. In contrast, the completed map recovers most of the environmental structures, providing sufficient features for accurately aligning the two maps. Though the experiment is conducted with two robots, it can be easily extended to a multi- or even swarm-robot system.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Alignment of maps from two robot coordinates. <bold>(a)</bold> Initial relative pose. <bold>(b)</bold> Alignment with maps generated by VO. <bold>(c)</bold> Alignment with maps generated by our method. We use the ground truth map with a voxel resolution downsampled to 0.05 m for visualization.</p>
</caption>
<graphic xlink:href="frobt-12-1644230-g010.tif">
<alt-text content-type="machine-generated">Three point cloud visualizations, labeled (a), (b), and (c). Each visual features a dense cluster of red and blue points forming distinct check-mark shapes with different orientations.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<label>6</label>
<title>Conclusion</title>
<p>In this work, we propose a novel IU-DC to complete the extremely sparse depth data from VO, enhancing spatial perception through dense mapping of the environment. We extend NCNN into an image-guided approach with a specifically designed image feature integration mechanism and an ISM-based uncertainty estimation method to encode both color and spatial features, demonstrating superior performance in both depth and uncertainty estimation. The uncertainty-aware depth output from IU-DC exhibits outstanding performance compared to other VO depth completion methods in the context of robot mapping, achieving <inline-formula id="inf148">
<mml:math id="m166">
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> more reconstructed space than the original sparse map and <inline-formula id="inf149">
<mml:math id="m167">
<mml:mrow>
<mml:mn>78</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> coverage of the ground truth with high accuracy. IU-DC is also computationally efficient and requires limited memory consumption, showcasing its potential deployment on mobile robots.</p>
<p>The major limitation of our work, similar to other VO depth completion methods, is the reliance on VO to generate an accurate initial depth estimation. A promising future direction would be to generate uncertainty estimations for both input and output depth, and further apply optimization techniques to tightly couple these uncertainties with the depth estimation from the network.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s13">Supplementary Material</xref>; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>DY: Writing &#x2013; original draft. XZ: Writing &#x2013; review and editing. HL: Writing &#x2013; review and editing. HW: Writing &#x2013; review and editing. CW: Writing &#x2013; review and editing. KX: Writing &#x2013; review and editing. XD: Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s13">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2025.1644230/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2025.1644230/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Video1.mp4" id="SM1" mimetype="application/mp4" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2605483/overview">Li Li</ext-link>, Wuhan University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2327515/overview">Long Cheng</ext-link>, Chinese Academy of Sciences (CAS), China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3135402/overview">Hanyun Wang</ext-link>, PLA Strategic Support Force Information Engineering University, China</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Agha-Mohammadi</surname>
<given-names>A.-A.</given-names>
</name>
<name>
<surname>Heiden</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hausman</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Sukhatme</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Confidence-rich grid mapping</article-title>. <source>Int. J. Robotics Res.</source> <volume>38</volume> (<issue>12-13</issue>), <fpage>1352</fpage>&#x2013;<lpage>1374</lpage>. <pub-id pub-id-type="doi">10.1177/0278364919839762</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aguiar</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Neves dos Santos</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Sobreira</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Boaventura-Cunha</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sousa</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Localization and mapping on agriculture based on point-feature extraction and semiplanes segmentation from 3d lidar data</article-title>. <source>Front. Robotics AI</source> <volume>9</volume>, <fpage>832165</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2022.832165</pub-id>
<pub-id pub-id-type="pmid">35155589</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Araya-Martinez</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Matthiesen</surname>
<given-names>V. S.</given-names>
</name>
<name>
<surname>B&#xf8;gh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lambrecht</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>de Figueiredo</surname>
<given-names>R. P.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A fast monocular 6d pose estimation method for textureless objects based on perceptual hashing and template matching</article-title>. <source>Front. Robotics AI</source> <volume>11</volume>, <fpage>1424036</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2024.1424036</pub-id>
<pub-id pub-id-type="pmid">39845569</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Agg-net: attention guided gated-convolutional network for depth image completion</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source>, <fpage>8853</fpage>&#x2013;<lpage>8862</lpage>.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eldesokey</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Felsberg</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>F. S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Confidence propagation through cnns for guided sparse depth regression</article-title>. <source>IEEE Trans. pattern analysis Mach. Intell.</source> <volume>42</volume> (<issue>10</issue>), <fpage>2423</fpage>&#x2013;<lpage>2436</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2019.2929170</pub-id>
<pub-id pub-id-type="pmid">31331882</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Eldesokey</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Felsberg</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Holmquist</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Persson</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Uncertainty-aware cnns for depth completion: uncertainty from beginning to end</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>12 014</fpage>&#x2013;<lpage>12 023</lpage>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fei</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Soatto</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Geo-supervised visual depth prediction</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>4</volume> (<issue>2</issue>), <fpage>1661</fpage>&#x2013;<lpage>1668</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2019.2896963</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gustafsson</surname>
<given-names>F. K.</given-names>
</name>
<name>
<surname>Danelljan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Schon</surname>
<given-names>T. B.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Evaluating scalable bayesian deep learning methods for robust computer vision</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops</source>, <fpage>318</fpage>&#x2013;<lpage>319</lpage>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ozay</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Deep depth completion from extremely sparse data: a survey</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>45</volume> (<issue>7</issue>), <fpage>8244</fpage>&#x2013;<lpage>8264</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3229090</pub-id>
<pub-id pub-id-type="pmid">37015558</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ilg</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Cicek</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Galesso</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Makansi</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Uncertainty estimates and multi-hypotheses networks for optical flow</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV)</source>, <fpage>652</fpage>&#x2013;<lpage>667</lpage>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kendall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gal</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>What uncertainties do we need in bayesian deep learning for computer vision?</article-title> <source>Adv. neural Inf. Process. Syst.</source> <volume>30</volume>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Knutsson</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Westin</surname>
<given-names>C.-F.</given-names>
</name>
</person-group> (<year>1993</year>). &#x201c;<article-title>Normalized and differential convolution</article-title>,&#x201d; in <source>Proceedings of IEEE conference on computer vision and pattern recognition</source> (<publisher-name>IEEE</publisher-name>), <fpage>515</fpage>&#x2013;<lpage>523</lpage>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Labb&#xe9;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Michaud</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Rtab-map as an open-source lidar and visual simultaneous localization and mapping library for large-scale and long-term online operation</article-title>. <source>J. field robotics</source> <volume>36</volume> (<issue>2</issue>), <fpage>416</fpage>&#x2013;<lpage>446</lpage>. <pub-id pub-id-type="doi">10.1002/rob.21831</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Labb&#xe9;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Michaud</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Multi-session visual slam for illumination-invariant re-localization in indoor environments</article-title>. <source>Front. Robotics AI</source> <volume>9</volume>, <fpage>801886</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2022.801886</pub-id>
<pub-id pub-id-type="pmid">35783022</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>T. Y.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>B.-W.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Monitored distillation for positive congruent depth completion</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>35</fpage>&#x2013;<lpage>53</lpage>.</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lyu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Mff-net: towards efficient monocular depth completion with multi-modal feature fusion</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>8</volume> (<issue>2</issue>), <fpage>920</fpage>&#x2013;<lpage>927</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2023.3234776</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Loop</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Orts-Escolano</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>P. A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>A closed-form bayesian fusion equation using occupancy probabilities</article-title>,&#x201d; in <source>2016 fourth international conference on 3D vision (3DV)</source> (<publisher-name>IEEE</publisher-name>), <fpage>380</fpage>&#x2013;<lpage>388</lpage>.</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Karaman</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Sparse-to-dense: depth prediction from sparse depth samples and a single image</article-title>,&#x201d; in <source>2018 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>4796</fpage>&#x2013;<lpage>4803</lpage>.</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Malakouti-Khah</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Sadeghzadeh-Nokhodberiz</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Montazeri</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Simultaneous localization and mapping in a multi-robot system in a dynamic environment with unknown initial correspondence</article-title>. <source>Front. Robotics AI</source> <volume>10</volume>, <fpage>1291672</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2023.1291672</pub-id>
<pub-id pub-id-type="pmid">38283801</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mathew</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Magerand</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Trucco</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Manfredi</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Self-supervised monocular depth estimation for high field of view colonoscopy cameras</article-title>. <source>Front. Robotics AI</source> <volume>10</volume>, <fpage>1212525</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2023.1212525</pub-id>
<pub-id pub-id-type="pmid">37559569</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Merrill</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Geneva</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Robust monocular visual-inertial depth completion for embedded systems</article-title>,&#x201d; in <source>2021 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>5713</fpage>&#x2013;<lpage>5719</lpage>.</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Popovi&#x107;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Thomas</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Papatheodorou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Funk</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Vidal-Calleja</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Leutenegger</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Volumetric occupancy mapping with probabilistic depth completion for robotic navigation</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>6</volume> (<issue>3</issue>), <fpage>5072</fpage>&#x2013;<lpage>5079</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2021.3070308</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qin</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Vins-mono: a robust and versatile monocular visual-inertial state estimator</article-title>. <source>IEEE Trans. Robotics</source> <volume>34</volume> (<issue>4</issue>), <fpage>1004</fpage>&#x2013;<lpage>1020</lpage>. <pub-id pub-id-type="doi">10.1109/tro.2018.2853729</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Qu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>C. J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Bayesian deep basis fitting for depth completion with uncertainty</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source>, <fpage>16147</fpage>&#x2013;<lpage>16157</lpage>.</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <source>Medical image computing and computer-assisted intervention&#x2013;MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18</source> (<publisher-name>Springer</publisher-name>), <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Segal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Haehnel</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Thrun</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Generalized-ICP</article-title>,&#x201d; in <source>Robotics: Science and Systems (RSS)</source>. <publisher-loc>Seattle, Washington, USA</publisher-loc>: <publisher-name>University of Washington</publisher-name>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Silberman</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hoiem</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kohli</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fergus</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Indoor segmentation and support inference from rgbd images</article-title>,&#x201d; in <source>Computer Vision&#x2013;ECCV 2012: 12th European Conference on Computer Vision, Florence, Italy, October 7-13, 2012, Proceedings, Part V 12</source> (<publisher-name>Springer</publisher-name>), <fpage>746</fpage>&#x2013;<lpage>760</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Stathoulopoulos</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Koval</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Agha-mohammadi</surname>
<given-names>A.-a.</given-names>
</name>
<name>
<surname>Nikolakopoulos</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Frame: fast and robust autonomous 3d point cloud map-merging for egocentric multi-robot exploration</article-title>,&#x201d; in <conf-name>2023 IEEE International Conference on Robotics and Automation (ICRA)</conf-name>, <fpage>3483</fpage>&#x2013;<lpage>3489</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Popovi&#x107;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Digumarti</surname>
<given-names>S. T.</given-names>
</name>
<name>
<surname>Chebrolu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Fallon</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>3d lidar reconstruction with probabilistic depth completion for robotic navigation</article-title>,&#x201d; in <source>2022 IEEE/RSJ international conference on intelligent robots and systems (IROS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>5339</fpage>&#x2013;<lpage>5346</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Teed</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Droid-slam: deep visual slam for monocular, stereo, and rgb-d cameras</article-title>. <source>Adv. neural Inf. Process. Syst.</source> <volume>34</volume>, <fpage>16558</fpage>&#x2013;<lpage>16569</lpage>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Teixeira</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Oswald</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Pollefeys</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chli</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Aerial single-view depth completion with image-guided uncertainty estimation</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>5</volume> (<issue>2</issue>), <fpage>1055</fpage>&#x2013;<lpage>1062</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2020.2967296</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Uhrig</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Franke</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Geiger</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Sparsity invariant cnns</article-title>,&#x201d; in <source>International conference on 3D vision (3DV)</source>.</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Mixssc: forward-backward mixture for vision-based 3d semantic scene completion</article-title>. <source>IEEE Trans. Circuits Syst. Video Technol.</source> <volume>35</volume>, <fpage>5684</fpage>&#x2013;<lpage>5696</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2025.3527235</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Wofk</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ranftl</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Koltun</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Monocular visual-inertial depth estimation</article-title>,&#x201d; in <source>2023 IEEE international conference on robotics and automation (ICRA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>6095</fpage>&#x2013;<lpage>6101</lpage>.</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wong</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fei</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Tsuei</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Soatto</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Unsupervised depth completion from visual inertial odometry</article-title>. <source>IEEE Robotics Automation Lett.</source> <volume>5</volume> (<issue>2</issue>), <fpage>1899</fpage>&#x2013;<lpage>1906</lpage>. <pub-id pub-id-type="doi">10.1109/lra.2020.2969938</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Woo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.-Y.</given-names>
</name>
<name>
<surname>Kweon</surname>
<given-names>I. S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Cbam: convolutional block attention module</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV)</source>, <fpage>3</fpage>&#x2013;<lpage>19</lpage>.</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>T. S.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Free-form image inpainting with gated convolution</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source>, <fpage>4471</fpage>&#x2013;<lpage>4480</lpage>.</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Dui-vio: depth uncertainty incorporated visual inertial odometry based on an rgb-d camera</article-title>,&#x201d; in <conf-name>IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <fpage>5002</fpage>&#x2013;<lpage>5008</lpage>. <pub-id pub-id-type="doi">10.1109/iros45743.2020.9341592</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>