<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2024.1473937</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>ISFM-SLAM: dynamic visual SLAM with instance segmentation and feature matching</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes"><name><surname>Li</surname> <given-names>Chao</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2804779/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Hu</surname> <given-names>Yang</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Liu</surname> <given-names>Jianqiang</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2803732/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Jin</surname> <given-names>Jianhai</given-names></name><xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author"><name><surname>Sun</surname> <given-names>Jun</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>School of Artificial Intelligence and Computer Science, Jiangnan University</institution>, <addr-line>Wuxi</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>China Ship Scientific Research Center</institution>, <addr-line>Wuxi</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Di Wu, Southwest University, China</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Fangwen Yu, Tsinghua University, China</p>
<p>Lin Chen, Chinese Academy of Sciences (CAS), China</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Chao Li, <email>chaoli@jiangnan.edu.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>11</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>18</volume>
<elocation-id>1473937</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>07</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>11</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Li, Hu, Liu, Jin and Sun.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Li, Hu, Liu, Jin and Sun</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Simultaneous Localization and Mapping (SLAM) is a technology used in intelligent systems such as robots and autonomous vehicles. Visual SLAM has become a more popular type of SLAM due to its acceptable cost and good scalability when applied in robot positioning, navigation and other functions. However, most of the visual SLAM algorithms assume a static environment, so when they are implemented in highly dynamic scenes, problems such as tracking failure and overlapped mapping are prone to occur.</p>
</sec>
<sec>
<title>Methods</title>
<p>To deal with this issue, we propose ISFM-SLAM, a dynamic visual SLAM built upon the classic ORB-SLAM2, incorporating an improved instance segmentation network and enhanced feature matching. Based on YOLACT, the improved instance segmentation network applies the multi-scale residual network Res2Net as its backbone, and utilizes CIoU_Loss in the bounding box loss function, to enhance the detection accuracy of the segmentation network. To improve the matching rate and calculation efficiency of the internal feature points, we fuse ORB key points with an efficient image descriptor to replace traditional ORB feature matching of ORB-SLAM2. Moreover, the motion consistency detection algorithm based on external variance values is proposed and integrated into ISFM-SLAM, to assist the proposed SLAM systems in culling dynamic feature points more effectively.</p>
</sec>
<sec>
<title>Results and discussion</title>
<p>Simulation results on the TUM dataset show that the overall pose estimation accuracy of the ISFM-SLAM is 97% better than the ORB-SLAM2, and is superior to other mainstream and state-of-the-art dynamic SLAM systems. Further real-world experiments validate the feasibility of the proposed SLAM system in practical applications.</p>
</sec>
</abstract>
<kwd-group>
<kwd>simultaneous localization and mapping (SLAM)</kwd>
<kwd>instance segmentation network</kwd>
<kwd>dynamic environment</kwd>
<kwd>motion consistency detection</kwd>
<kwd>feature matching</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="6"/>
<equation-count count="11"/>
<ref-count count="31"/>
<page-count count="15"/>
<word-count count="9128"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Frontiers in Neurorobotics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1"><label>1</label>
<title>Introduction</title>
<p>Simultaneous Localization and Mapping (SLAM) is a technology that enables robots to determine their location and construct a map in real time by collecting data in an unknown environment. The mainstream categories of the SLAM technologies include laser SLAM and visual SLAM. Due to its higher precision than the laser SLAM and acceptable construction cost, visual SLAM has become a research focus in the field of SLAM (<xref ref-type="bibr" rid="ref23">Taketomi et al., 2017</xref>).</p>
<p>Feature point method and direct method are two common methods used in visual SLAM to extract information from the scene and estimate camera motion (<xref ref-type="bibr" rid="ref23">Taketomi et al., 2017</xref>). The feature-based method represents features in the scene by extracting key points and descriptors from pixels. <xref ref-type="bibr" rid="ref6">Davison et al. (2007)</xref> proposed a monocular visual SLAM algorithm, namely MonoSLAM, to achieve online localization and mapping by extracting and tracking feature points. ORB-SLAM2 (<xref ref-type="bibr" rid="ref14">Mur-Artal and Tard&#x00F3;s, 2017</xref>) utilizes feature points and descriptors with rotation and scale invariance to generate camera poses, and employs closed-loop detection to optimize map consistency, thereby eliminating cumulative camera errors. On the contrary, the direct method does not rely on feature extraction or descriptor matching, but utilizes original pixel values for information extraction and motion estimation. An example is the LSD-SLAM (<xref ref-type="bibr" rid="ref7">Engel et al., 2014</xref>), which uses image continuity information and dense optical flow fields to estimate the motion of the camera. However, the SLAM algorithms based on the aforementioned two kinds of methods usually treat the external environment as static, ignoring the impact of dynamic objects on map accuracy. In a dynamic environment, the movement of objects may cause changes in the map, making it difficult for traditional visual SLAMs to accurately estimate the camera motion and scene structure. Hence, new visual SLAM algorithms need to be developed to handle the issues in dynamic environments.</p>
<p>In recent years, scholars have focused on the combination of deep neural networks (DNNs) and visual SLAM to achieve good SLAM effects in dynamic environments. DNNs can provide semantic information for SLAM systems, enhancing the systems&#x2019; perception capabilities and effectively improving the accuracy of both tracking and mapping. For instance, <xref ref-type="bibr" rid="ref27">Yu et al. (2018)</xref> proposed the DS-SLAM, which integrates the semantic segmentation network SegNet (<xref ref-type="bibr" rid="ref1">Badrinarayanan et al., 2015</xref>) and motion consistency detection into the ORB-SLAM2. By eliminating feature points associated with dynamic objects, this approach mitigates the adverse effects of dynamic environments and improves the stability of map construction. The DynaSLAM (<xref ref-type="bibr" rid="ref2">Bescos et al., 2018</xref>) employs Mask Region-based Convolutional Neural Network (R-CNN) and multi-view geometry to filter dynamic feature points by integrating sparse and dense map information. However, both DS-SLAM and DynaSLAM are prone to issues such as insufficient or incorrectly removed feature points, often due to erroneous prior knowledge or challenging lighting conditions, which can ultimately compromise SLAM accuracy. Detect-SLAM (<xref ref-type="bibr" rid="ref31">Zhong et al., 2018</xref>) incorporated a DNN-based object detector into the ORB-SLAM2 system and added three new modules: moving object rejection, object mapping, and SLAM-enhanced detector. This algorithm enhances the accuracy of localization and mapping in a highly dynamic environment, but its performance is not as robust as that of ORB-SLAM2 in a static environment. As demonstrated, the aforementioned visual SLAM systems mitigate interference from dynamic objects by discarding feature points associated with them. 
However, in certain scenarios, such approaches may mistakenly remove feature points from static objects or inadvertently retain feature points from dynamic objects. This often results in a reduction in the number of feature point matches, consequently causing the SLAM systems to lose track. Thus, addressing the issue of incorrect feature point matching in dynamic environments remains a pressing challenge for visual SLAM systems.</p>
<p>Several studies have been conducted in response to the above problem. For example, <xref ref-type="bibr" rid="ref4">Cai and Wu (2022)</xref> proposed a SLAM algorithm based on the YOLACT (<xref ref-type="bibr" rid="ref3">Bolya et al., 2019</xref>) instance segmentation network. This SLAM performed static point recovery based on external constraints after removing dynamic objects, which to some extent alleviated the problem of insufficient feature points. However, YOLACT&#x2019;s bounding box loss primarily emphasizes the coordinates of the four corners rather than the position of the center point, which can lead to a shift in the segmented bounding box. Moreover, the bounding box loss does not fully take into account the shape and size of the target object, potentially resulting in suboptimal segmentation accuracy. Integrated with the SegNet, <xref ref-type="bibr" rid="ref5">Cui and Ma (2019)</xref> proposed the SOF-SLAM to tightly couple visual semantic information and optical flow information, thereby effectively and reasonably removing dynamic feature points. Based on the ORB-SLAM2, <xref ref-type="bibr" rid="ref20">Su et al. (2022)</xref> developed a parallel semantic module based on the lightweight object detection network YOLOv5s. This module utilizes semantic information to optimize the homography matrix, and uses optical flow masks to remove dynamic feature points from the image. <xref ref-type="bibr" rid="ref10">He et al. (2023)</xref> proposed OVD-SLAM, which integrates semantic, depth, and optical flow information to differentiate between foreground and background, thereby identifying dynamic objects. It can be observed that combining semantic segmentation with optical flow for detecting dynamic feature points has become a prominent research focus. However, optical flow estimation algorithms are prone to errors when handling fast motion and occlusions. 
Fast-moving objects can cause instability in optical flow estimation, while occluded objects may lead to distorted optical flow, both of which can negatively impact segmentation accuracy.</p>
<p>To overcome the challenges of suboptimal instance segmentation accuracy and incorrect feature point classification in dynamic visual SLAM systems, this paper proposes ISFM-SLAM, a dynamic visual SLAM system based on ORB-SLAM2, which incorporates an improved instance segmentation network, a novel motion consistency detection approach, and an efficient learned binary image descriptor. The main contributions of this work are as follows:<list list-type="order">
<list-item>
<p>Accurately obtaining prior knowledge of objects in an image is crucial for designing an effective visual SLAM system. To this end, we propose an improved instance segmentation network based on YOLACT and integrate it into the ISFM-SLAM system. Specifically, we replace YOLACT&#x2019;s backbone with Res2Net-50 (<xref ref-type="bibr" rid="ref8">Gao et al., 2019</xref>), which offers a superior receptive field, a more compact scale, and greater ease of deployment. Furthermore, we introduce CIoU_Loss (<xref ref-type="bibr" rid="ref30">Zheng et al., 2020</xref>) to rectify YOLACT&#x2019;s occasional inaccuracies in bounding box estimation. In this way, the accuracy of the instance segmentation network is significantly enhanced, which greatly benefits subsequent processes such as feature point matching.</p>
</list-item>
<list-item>
<p>To address the issue of incorrectly removing or retaining feature points in certain scenarios, this paper introduces a novel motion consistency detection approach based on the Perspective-n-Point (PnP) algorithm (<xref ref-type="bibr" rid="ref11">Lepetit et al., 2009</xref>). By calculating the difference in external parameters between frames, the PnP-based motion consistency detection method can more reliably determine whether feature points belong to the same static object. This enables more accurate removal of dynamic and incorrectly matched feature points, while ensuring the proper retention of static feature points, ultimately leading to a more reliable motion consistency detection outcome.</p>
</list-item>
<list-item>
<p>Moreover, a learned binary image descriptor, BEBLID (<xref ref-type="bibr" rid="ref21">Su&#x00E1;rez et al., 2020</xref>), is combined with ORB key point detection to further enhance the accuracy and efficiency of feature matching. The BEBLID descriptor, trained using a boosted method, significantly improves feature point matching accuracy, and its parallel computing capability ensures high computational efficiency. This allows feature matching based on the BEBLID descriptor to maintain high accuracy even in scenes with numerous dynamic objects, while also better satisfying the real-time performance requirements of SLAM systems in practical applications.</p>
</list-item>
<list-item>
<p>Simulation results demonstrate that the proposed ISFM-SLAM system achieves outstanding overall pose estimation accuracy in both low-dynamic and high-dynamic environments, with a 97% improvement compared to the baseline ORB-SLAM2, and outperforms many other mainstream and state-of-the-art dynamic SLAM algorithms. Furthermore, real-world experimental results validate the high accuracy of ISFM-SLAM in both dynamic feature point removal and feature point matching.</p>
</list-item>
</list></p>
<p>The remainder of this paper is organized as follows. Section 2 reviews the related work pertinent to the studies presented in this paper. Section 3 presents the proposed ISFM-SLAM system, including the details of the improved instance segmentation network, PnP-based motion consistency detection, and BEBLID feature matching. Simulation and real-world experimental results, along with corresponding discussions, are presented in Sections 4 and 5, respectively. Some conclusions and directions for future work are provided in Section 6.</p>
</sec>
<sec id="sec2"><label>2</label>
<title>Related work</title>
<sec id="sec3"><label>2.1</label>
<title>ORB-SLAM2</title>
<p>As a classic visual SLAM system, ORB-SLAM2 is composed of three main parallel threads: tracking, local mapping, and loop closure (<xref ref-type="bibr" rid="ref14">Mur-Artal and Tard&#x00F3;s, 2017</xref>). To locate the camera pose and generate keyframes, the tracking thread extracts feature points from each frame of images and matches them with the local map. The local mapping thread receives the keyframes from the tracking thread, uses the bundle adjustment (BA) algorithm to optimize the camera pose, and eliminates redundant information from the map. The loop closure thread detects the map loop, corrects the accumulated drift, and eliminates accumulated errors. After optimizing the pose graph, the ORB-SLAM2 launches the fourth thread to perform full BA, to calculate the optimal structure and the motion solution. For a detailed explanation of the ORB-SLAM2 system framework and its components, refer to <xref ref-type="bibr" rid="ref14">Mur-Artal and Tard&#x00F3;s (2017)</xref>.</p>
</sec>
<sec id="sec4"><label>2.2</label>
<title>YOLACT instance segmentation network</title>
<p>Instance segmentation is a task in the field of computer vision that aims to identify the pixel-level segmentation of each object in an image and assign a unique identifier to each object. Instance segmentation generates a mask on the image target, but preserves the shape and features of the target. The YOLACT network is a one-stage instance segmentation model proposed by <xref ref-type="bibr" rid="ref3">Bolya et al. (2019)</xref>. Compared with the two-stage models represented by Mask R-CNN (<xref ref-type="bibr" rid="ref9">He et al., 2017</xref>), the YOLACT has the advantages of fewer parameters and faster operation, making it more suitable for application in SLAM systems with high real-time requirements.</p>
<p>In the YOLACT instance segmentation network, a backbone network based on ResNet-101 is used to extract multi-scale feature maps from the input image. These feature maps are then passed through a feature pyramid network for further processing, leading to the prediction of bounding boxes. To evaluate the regression performance of the model for the location of the bounding box, YOLACT uses Smooth L1 as the bounding box regression loss function. After regression, the detected boxes are filtered by non-maximum suppression (<xref ref-type="bibr" rid="ref16">Qiu et al., 2018</xref>) to obtain the instances corresponding to each object, and the mask segmentation results corresponding to each anchor are generated by linear combination. The specific steps of YOLACT instance segmentation are detailed in <xref ref-type="bibr" rid="ref3">Bolya et al. (2019)</xref>.</p>
</sec>
</sec>
<sec sec-type="methods" id="sec5"><label>3</label>
<title>Methodology</title>
<sec id="sec6"><label>3.1</label>
<title>ISFM-SLAM framework based on ORB-SLAM2</title>
<p>The key to enhancing ORB-SLAM2 in dynamic environments is to accurately perform instance segmentation on the image, enabling the reasonable removal of feature points associated with dynamic objects. To this end, we modified the tracking thread in the ORB-SLAM2 system in this paper to construct the ISFM-SLAM framework. Specifically, an improved instance segmentation network is introduced in the tracking thread to divide the image frame into static background and potential dynamic instances. Then, a motion consistency detection method based on PnP is employed to effectively remove dynamic instances while retaining static feature points. Finally, a feature matching algorithm based on the boosted efficient BEBLID descriptor is utilized to perform feature matching and accurately calculate the camera pose. Following the completion of the tracking thread, ISFM-SLAM proceeds with local mapping, loop closure, and global BA, ultimately producing the corresponding poses and a global point cloud map. The overall pipeline of the ISFM-SLAM system is illustrated in <xref ref-type="fig" rid="fig1">Figure 1</xref>.</p>
<fig position="float" id="fig1"><label>Figure 1</label>
<caption>
<p>Overall pipeline of the ISFM-SLAM system.</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g001.tif"/>
</fig>
</sec>
<sec id="sec7"><label>3.2</label>
<title>Improved instance segmentation network based on Res2Net and CIoU_Loss</title>
<p>In order to improve the accuracy of the YOLACT segmentation network, this paper utilizes the Res2Net-50 to replace the backbone of the original YOLACT, so that the multi-scale receptive field of the network can be improved. Res2Net, proposed by <xref ref-type="bibr" rid="ref8">Gao et al. (2019)</xref>, is a multi-scale backbone network for computer vision tasks such as object detection and semantic segmentation. By constructing layered residual connections in the convolutional blocks, Res2Net-50 is capable of representing multi-scale features within a single residual block. This allows the network to better capture image features at different scales, thus improving the accuracy of instance segmentation. Moreover, Res2Net-50 can enhance the network&#x2019;s ability to comprehensively learn image information by expanding the receptive field of each layer, resulting in more precise localization and segmentation of target objects. Another advantage of Res2Net-50 is its ease of integration into existing state-of-the-art CNN models, offering flexibility that allows it to excel across various tasks and improve the overall performance of the YOLACT model. Given the high real-time requirements of the SLAM algorithm in dynamic environments, this paper implements Res2Net-50 with a scale of 4 as the backbone network of the improved instance segmentation network, which can not only ensure sufficient semantic features but also reduce the cost of instance segmentation. The detailed architecture of the adopted Res2Net-50 backbone can be found in <xref ref-type="bibr" rid="ref8">Gao et al. (2019)</xref>.</p>
<p>In addition, the CIoU_Loss function (<xref ref-type="bibr" rid="ref30">Zheng et al., 2020</xref>) is used to replace the Smooth L1 loss function in the original YOLACT to obtain higher accuracy of bounding box regression. The reason is that the Smooth L1 function cannot accurately measure the position of the predicted box due to the lack of calculation of the intersection over union (IoU) and the minimum bounding rectangle. In contrast, CIoU_Loss considers not only the overlap area between the predicted box and the ground truth box, but also the distance between their center points and the aspect ratio, which are geometric factors critical for accurately localizing and segmenting the target object. By introducing these geometric elements, CIoU_Loss can more effectively handle challenging localization scenarios, resulting in better regression performance for the predicted box compared to Smooth L1. Another reason we use CIoU_Loss is its faster convergence during training, which improves the model&#x2019;s training efficiency. The calculation method for CIoU_Loss is shown in <xref ref-type="disp-formula" rid="EQ1">Equation (1)</xref>:<disp-formula id="EQ1"><label>(1)</label>
<mml:math id="M1">
<mml:mo stretchy="true">{</mml:mo>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi mathvariant="italic">CIoU</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:mi>b</mml:mi>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>&#x03C1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mfenced open="(" close=")" separators=",">
<mml:mi>b</mml:mi>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mfenced>
</mml:mrow>
<mml:msup>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>&#x03BD;</mml:mi>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:mi>b</mml:mi>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x2229;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x222A;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mi>&#x03B1;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>&#x03BD;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:mi>&#x03BD;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mi>&#x03BD;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>4</mml:mn>
<mml:msup>
<mml:mi>&#x03C0;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mfrac>
<mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>arctan</mml:mo>
<mml:mfrac>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mo>arctan</mml:mo>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mtext>,</mml:mtext>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M2">
<mml:mi>&#x03C1;</mml:mi>
</mml:math>
</inline-formula> represents the distance between the geometric centers of the prediction box <inline-formula>
<mml:math id="M3">
<mml:mi>b</mml:mi>
</mml:math>
</inline-formula> and the target box <inline-formula>
<mml:math id="M4">
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. <inline-formula>
<mml:math id="M5">
<mml:mfrac>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mfrac>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M6">
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
</mml:mfrac>
</mml:math>
</inline-formula> represent the aspect ratios of the target box and prediction box, respectively. <inline-formula>
<mml:math id="M7">
<mml:mi>c</mml:mi>
</mml:math>
</inline-formula> represents the diagonal length of the smallest circumscribed rectangle of the prediction box and target box. With the implementation of the CIoU_Loss, the convergence speed and the multi-scale object detection robustness of the improved instance segmentation network can be significantly enhanced.</p>
</sec>
<sec id="sec8"><label>3.3</label>
<title>PnP-based motion consistency detection</title>
<p>The instance segmentation network is capable of acquiring prior information for the motion of objects within a video frame. However, relying solely on this prior information to determine whether a feature point should be removed may lead to two issues. One is that the feature points of stationary objects with dynamic prior information, such as those of a stationary person, will be removed. The other is that the feature points of moving objects with non-dynamic prior information, such as those of books that interact with people, will be preserved. Thus, it is necessary to compare the motion state of the same object across consecutive frames to accurately decide whether the associated feature points should be removed. That is, a motion consistency detection method should be employed to assist in the classification of feature points.</p>
<p>To improve the performance of motion consistency detection for moving objects, this paper proposes a novel motion consistency detection method based on the PnP algorithm. On the premise that the camera has observed the 3-dimensional (3D) positions of multiple points, the PnP algorithm accurately determines the position and orientation of feature points in 3D space by solving the geometric relationships between the camera and the feature points in the scene. The detailed procedure of the PnP algorithm can be found in <xref ref-type="bibr" rid="ref11">Lepetit et al. (2009)</xref>. Our method analyzes this geometric relationship to compare changes in feature points across frames, thereby determining whether the feature points belong to the same static object. If the motion trajectories of the feature points are inconsistent, they may be either incorrectly matched or dynamic feature points, and will thus be appropriately removed. In this way, the PnP algorithm is employed to detect the motion consistency of feature points across consecutive frames, enabling a more accurate distinction between true dynamic and static feature points, even in the presence of inaccurate prior information. The specific steps of the PnP-based motion consistency detection method are as follows.</p>
<p>After applying the improved instance segmentation network proposed in Section 3.2 to divide video frames into static background and potential dynamic instances, we use the PnP algorithm to calculate the static baseline extrinsic parameter <inline-formula>
<mml:math id="M8">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi mathvariant="italic">static</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> based on the two consecutive frames <inline-formula>
<mml:math id="M9">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M10">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> under the constant-speed motion model, as shown in <xref ref-type="disp-formula" rid="EQ2">Equation (2)</xref>:<disp-formula id="EQ2"><label>(2)</label>
<mml:math id="M11">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi mathvariant="italic">static</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">static</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">static</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M12">
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">static</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M13">
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">static</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> denote the sets of the static background feature points in <inline-formula>
<mml:math id="M14">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M15">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, respectively.</p>
<p>Using <inline-formula>
<mml:math id="M16">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi mathvariant="italic">static</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> as the baseline, it is possible to determine the true dynamic instances among the potential dynamic instances in the current frame. Specifically, for each potential dynamic instance, its corresponding pose transformation matrix <inline-formula>
<mml:math id="M17">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> can be calculated using <xref ref-type="disp-formula" rid="EQ4">Equation (3)</xref>:<disp-formula id="EQ4"><label>(3)</label>
<mml:math id="M18">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mfenced>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M19">
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M20">
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> are the sets of the feature points of instance <inline-formula>
<mml:math id="M21">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula> in <inline-formula>
<mml:math id="M22">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M23">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, respectively. Then, the 2-norm <inline-formula>
<mml:math id="M24">
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> of the difference matrix between <inline-formula>
<mml:math id="M25">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi mathvariant="italic">static</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M26">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> can be calculated by <xref ref-type="disp-formula" rid="EQ5">Equation (4)</xref>:<disp-formula id="EQ5"><label>(4)</label>
<mml:math id="M27">
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mo>&#x2225;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi mathvariant="italic">static</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mo>&#x2225;</mml:mo>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:math>
</disp-formula></p>
<p>If <inline-formula>
<mml:math id="M28">
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is greater than the preset threshold <inline-formula>
<mml:math id="M29">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, then the potential instance <inline-formula>
<mml:math id="M30">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula> is determined to be a true dynamic instance.</p>
<p>Based on the above procedure, the ORB feature points belonging to the dynamic instance can be discarded more accurately. After that, the remaining static ORB feature points can be utilized to calculate the camera pose, and then the stable and accurate extrinsic parameters can be obtained.</p>
</sec>
<sec id="sec9"><label>3.4</label>
<title>BEBLID feature matching</title>
<p>Since feature matching plays a pivotal role in visual SLAM, the accuracy and efficiency of the feature matching algorithm directly influence the quality of subsequent localization and mapping. ORB-SLAM2 employs binary robust independent elementary features (BRIEF) to obtain descriptors of feature points. However, the expressiveness of BRIEF descriptors is constrained by their derivation from straightforward pixel comparison, diminishing the matching accuracy of the ORB algorithm. In addition, although the improved instance segmentation network and PnP-based motion consistency detection can largely prevent the incorrect removal of feature points, in cases where a frame contains a significant number of dynamic instances, the available points for feature matching may become insufficient due to the elimination of dynamic feature points, thereby affecting the overall performance of the SLAM system. To address this, the ISFM-SLAM framework implements a feature matching algorithm based on the BEBLID descriptor (<xref ref-type="bibr" rid="ref21">Su&#x00E1;rez et al., 2020</xref>). BEBLID employs a learning-based approach, specifically adaptive boosting (AdaBoost) (<xref ref-type="bibr" rid="ref15">Pardoe and Stone, 2010</xref>), for training. AdaBoost combines multiple weak classifiers, iteratively adjusting the weights of samples to focus more on previously misclassified instances, thereby significantly improving classification accuracy. The use of the AdaBoost algorithm to minimize the BEBLID loss function is described in <xref ref-type="disp-formula" rid="EQ6">Equation (5)</xref>:<disp-formula id="EQ6"><label>(5)</label>
<mml:math id="M31">
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi mathvariant="italic">BEBLID</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
<mml:mo>exp</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03B3;</mml:mi>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi mathvariant="normal">K</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M32">
<mml:mi>&#x03B3;</mml:mi>
</mml:math>
</inline-formula> is the learning rate. <inline-formula>
<mml:math id="M33">
<mml:mfenced open="{" close="}" separators=",">
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</inline-formula> is a training set composed of pairs of image patches. <inline-formula>
<mml:math id="M34">
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is the label of the training sample. <inline-formula>
<mml:math id="M35">
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula> denotes that both patches correspond to the same image structure, while <inline-formula>
<mml:math id="M36">
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula> denotes that they correspond to different image structures. <inline-formula>
<mml:math id="M37">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mi>z</mml:mi>
</mml:mfenced>
<mml:mo>&#x2261;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi>f</mml:mi>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> denotes the <inline-formula>
<mml:math id="M38">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>th weak learner with weight <inline-formula>
<mml:math id="M39">
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, which depends on a feature extraction function <inline-formula>
<mml:math id="M40">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mo>&#x00B7;</mml:mo>
</mml:mfenced>
</mml:math>
</inline-formula> and a threshold <inline-formula>
<mml:math id="M41">
<mml:mi>T</mml:mi>
</mml:math>
</inline-formula> as shown in <xref ref-type="disp-formula" rid="EQ7">Equation (6)</xref>:<disp-formula id="EQ7"><label>(6)</label>
<mml:math id="M42">
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi>f</mml:mi>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mo stretchy="true">{</mml:mo>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mo>+</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">if</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
<mml:mo>&#x2A7D;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">if</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
<mml:mo>&#x003E;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula></p>
<p>In particular, the key to improving the computational efficiency of the BEBLID descriptor is the choice of <inline-formula>
<mml:math id="M43">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula>. Here, <inline-formula>
<mml:math id="M44">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula> is defined as the average gray difference between pixels in two different image boxes as shown in <xref ref-type="disp-formula" rid="EQ8">Equation (7)</xref>.<disp-formula id="EQ8"><label>(7)</label>
<mml:math id="M45">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:mspace width="0.25em"/>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mi>s</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:munder>
<mml:mi>I</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>q</mml:mi>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mi>s</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:munder>
<mml:mi>I</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>r</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M46">
<mml:mi>I</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>t</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula> is the gray value at pixel <inline-formula>
<mml:math id="M47">
<mml:mi>t</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M48">
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:mi>p</mml:mi>
<mml:mi>s</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula> is the square box with a side length of <inline-formula>
<mml:math id="M49">
<mml:mi>s</mml:mi>
</mml:math>
</inline-formula> centered at pixel <inline-formula>
<mml:math id="M50">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula>. The descriptor of the response map is shown in <xref ref-type="disp-formula" rid="EQ9">Equation (8)</xref>:<disp-formula id="EQ9"><label>(8)</label>
<mml:math id="M51">
<mml:mi>D</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:msup>
<mml:mi>h</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msqrt>
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:msqrt>
<mml:mo>&#x22C5;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
<mml:mo>&#x22EF;</mml:mo>
<mml:msqrt>
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:msqrt>
<mml:mo>&#x22C5;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M52">
<mml:mi>A</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">diag</mml:mi>
<mml:mfenced open="(" close=")" separators=",,,">
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x22EF;</mml:mo>
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</inline-formula>.</p>
<p>It can be seen that the BEBLID descriptor improves the loss function by using all the weak learners and the integral image, enabling the feature matching algorithm to obtain high-quality binary descriptors. As a result, feature matching based on BEBLID is more accurate than that based on BRIEF, allowing the algorithm to perform precise matching even when the number of available feature points is limited. Furthermore, BEBLID ensures relatively high feature matching accuracy under challenging lighting conditions, such as strong or weak light, thereby further enhancing the proposed SLAM system&#x2019;s ability to handle complex scenes. In addition, BEBLID computes each descriptor in parallel, which can significantly improve the efficiency of feature matching. The extraction workflow of the BEBLID descriptor is demonstrated in <xref ref-type="fig" rid="fig2">Figure 2</xref>.</p>
<fig position="float" id="fig2"><label>Figure 2</label>
<caption>
<p>BEBLID descriptor extraction workflow (<xref ref-type="bibr" rid="ref21">Su&#x00E1;rez et al., 2020</xref>).</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g002.tif"/>
</fig>
</sec>
</sec>
<sec id="sec10"><label>4</label>
<title>Simulation experiments and discussions</title>
<sec id="sec11"><label>4.1</label>
<title>Simulation environment</title>
<p>To verify the effectiveness of ISFM-SLAM and the proposed components, a series of simulation experiments were conducted. To accelerate the training of the deep learning model, the proposed improved instance segmentation network was trained on a server with an Intel Xeon Silver 4214R CPU, 90 gigabytes (GB) of memory, and an RTX 3080 Ti GPU (12&#x2009;GB graphics memory). All the other experiments were conducted on a personal computer (PC) with the following configuration: an AMD Ryzen 7 5800H 3.2&#x2009;GHz CPU, 16&#x2009;GB memory, and an RTX 3060 laptop GPU with 6&#x2009;GB graphics memory. The operating system is Ubuntu 18.04 with CUDA 11.3 and PyTorch 1.11.0. The code for the improved instance segmentation network of the ISFM-SLAM is written in Python 3.6, while the code for the other parts of the ISFM-SLAM is written in C++.</p>
</sec>
<sec id="sec12"><label>4.2</label>
<title>Performance of improved instance segmentation network</title>
<p>The effectiveness of visual SLAMs depends heavily on the performance of instance segmentation networks. Therefore, in this section, we analyzed the segmentation performance on some samples of the original YOLACT and the improved instance segmentation network proposed in this paper, and also compared the statistical results on public datasets by these two networks as well as some canonical and state-of-the-art instance segmentation methods. The improved instance segmentation network was trained on the COCO Minitrain dataset (<xref ref-type="bibr" rid="ref18">Samet et al., 2020</xref>). This dataset is a subset of the Microsoft Common Objects in COntext (MS COCO) dataset (<xref ref-type="bibr" rid="ref13">Lin et al., 2014</xref>), which contains approximately 25,000 images and all the 80 categories of the MS COCO. The Res2Net pre-trained weights were used for training the improved YOLACT network. The batch size was set to 24. The number of iterations was 100,000. Stochastic gradient descent (SGD) was utilized as the optimizer, with an initial momentum of 0.9, a learning rate of 0.001, and a weight decay coefficient of 0.0001.</p>
<p>To facilitate a more intuitive comparison of the instance segmentation effect of the original YOLACT and the improved one, we visualize the segmentation results obtained by the two compared networks on five samples with complex indoor environments from the COCO dataset. The results are presented in <xref ref-type="fig" rid="fig3">Figure 3</xref>, where the left column contains the input images, the middle column contains the segmentation results obtained by the original YOLACT, and the right column contains the segmentation results obtained by the improved instance segmentation network. According to the results in the middle column, the original YOLACT may yield unsatisfactory outcomes in highly complex scenes. For example, an instance may be divided into two parts, as seen with the chair in <xref ref-type="fig" rid="fig3">Figure 3A</xref>. Additionally, some instances may not be detected or segmented, such as the vase in <xref ref-type="fig" rid="fig3">Figure 3B</xref>, the chair in <xref ref-type="fig" rid="fig3">Figure 3C</xref>, and the person in <xref ref-type="fig" rid="fig3">Figure 3D</xref>. Misidentification of instances can also occur, as observed with the debris on the ground and bowls on the cabinet in <xref ref-type="fig" rid="fig3">Figure 3D</xref>, and the sofa in <xref ref-type="fig" rid="fig3">Figure 3E</xref>. Furthermore, some masks may fail to accurately cover the corresponding instances, such as the person in <xref ref-type="fig" rid="fig3">Figure 3E</xref>. The main reason for these issues is the poor feature extraction ability of the instance segmentation network, which leads to a lack of multi-scale perception of the scene and low localization accuracy of the detection boxes. For our proposed network, the multi-scale receptive field has been increased due to the replacement of the backbone, and the precision of the detection box has been improved because of the optimization of the boundary regression box. 
Consequently, the improved network can better utilize environmental semantic information and achieve more accurate segmentation, as shown by the results on the right column in <xref ref-type="fig" rid="fig3">Figure 3</xref>.</p>
<fig position="float" id="fig3"><label>Figure 3</label>
<caption>
<p>Comparison of segmentation results between our improved algorithm and YOLACT. The raw images were obtained from the COCO Minitrain Dataset, and this dataset is licensed under a Creative Commons Attribution 4.0 License (<ext-link xlink:href="https://cocodataset.org/#termsofuse" ext-link-type="uri">https://cocodataset.org/#termsofuse</ext-link>).</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g003.tif"/>
</fig>
<p>To demonstrate the performance of the improved network in various instance segmentation tasks, we compared our proposed network with YOLACT and some other mainstream segmentation networks, including Mask R-CNN (<xref ref-type="bibr" rid="ref9">He et al., 2017</xref>), PolarMask (<xref ref-type="bibr" rid="ref25">Xie et al., 2020</xref>), and FourierNet (<xref ref-type="bibr" rid="ref17">Riaz et al., 2021</xref>), on the COCO validation set. The results are recorded in <xref ref-type="table" rid="tab1">Table 1</xref>. Evaluation metrics based on the Average Precision (AP) were utilized, including mean AP (mAP), AP50, AP75, APS, APM, and APL, where AP represents the area under the precision-recall curve for a given class. The equations for calculating AP and mAP are shown in <xref ref-type="disp-formula" rid="EQ10">Equations (9)</xref> and <xref ref-type="disp-formula" rid="EQ11">(10)</xref>:<disp-formula id="EQ10"><label>(9)</label>
<mml:math id="M53">
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x222B;</mml:mo>
</mml:mstyle>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:munderover>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>r</mml:mi>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi>r</mml:mi>
</mml:math>
</disp-formula><disp-formula id="EQ11"><label>(10)</label>
<mml:math id="M54">
<mml:mi mathvariant="italic">mAP</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x03A3;</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M55">
<mml:mi>P</mml:mi>
</mml:math>
</inline-formula> denotes the precision as a function of the recall <italic>r</italic> for the current class, and <inline-formula>
<mml:math id="M56">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> is the number of sample categories in the dataset. AP50 and AP75 are special cases of calculating AP where the IoU thresholds are set to 0.5 and 0.75, respectively. A prediction is considered correct when the IoU is greater than or equal to a certain threshold (e.g., 0.5 or 0.75). The last three metrics measure the performance of detecting objects of different scales: small, medium, and large. In addition, the frames per second (FPS) is also measured to show the efficiency of the compared segmentation methods.</p>
<table-wrap position="float" id="tab1"><label>Table 1</label>
<caption>
<p>Performance comparison of different instance segmentation methods on COCO validation set.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">FPS</th>
<th align="center" valign="top">mAP</th>
<th align="center" valign="top">AP50</th>
<th align="center" valign="top">AP75</th>
<th align="center" valign="top">APS</th>
<th align="center" valign="top">APM</th>
<th align="center" valign="top">APL</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Mask R-CNN</td>
<td align="center" valign="middle">8.6</td>
<td align="center" valign="middle">37.52</td>
<td align="center" valign="middle">58.86</td>
<td align="center" valign="middle">40.26</td>
<td align="center" valign="middle">16.71</td>
<td align="center" valign="middle">39.82</td>
<td align="center" valign="middle">54.34</td>
</tr>
<tr>
<td align="left" valign="middle">PolarMask</td>
<td align="center" valign="middle">17.2</td>
<td align="center" valign="middle">30.63</td>
<td align="center" valign="middle">50.81</td>
<td align="center" valign="middle">31.89</td>
<td align="center" valign="middle">12.74</td>
<td align="center" valign="middle">33.73</td>
<td align="center" valign="middle">45.29</td>
</tr>
<tr>
<td align="left" valign="middle">FourierNet</td>
<td align="center" valign="middle">26.6</td>
<td align="center" valign="middle">32.97</td>
<td align="center" valign="middle">55.47</td>
<td align="center" valign="middle">33.82</td>
<td align="center" valign="middle">15.52</td>
<td align="center" valign="middle">35.15</td>
<td align="center" valign="middle">46.38</td>
</tr>
<tr>
<td align="left" valign="middle">YOLACT</td>
<td align="center" valign="middle">45</td>
<td align="center" valign="middle">29.82</td>
<td align="center" valign="middle">48.53</td>
<td align="center" valign="middle">31.23</td>
<td align="center" valign="middle">9.98</td>
<td align="center" valign="middle">31.35</td>
<td align="center" valign="middle">47.76</td>
</tr>
<tr>
<td align="left" valign="middle">Ours</td>
<td align="center" valign="middle">42.2</td>
<td align="center" valign="middle">33.61</td>
<td align="center" valign="middle">56.24</td>
<td align="center" valign="middle">36.26</td>
<td align="center" valign="middle">16.47</td>
<td align="center" valign="middle">36.21</td>
<td align="center" valign="middle">49.82</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Compared with the original YOLACT, our enhanced model results in a 2.8-frame reduction in FPS, but a 3.8% improvement in mAP. This verifies that our improved model significantly enhances segmentation precision with only a slight reduction in computational speed compared to the original YOLACT. When our proposed method is compared with the other approaches in <xref ref-type="table" rid="tab1">Table 1</xref>, it can be seen that the efficiency of our method is much better than that of the other competitors, and all the AP-related results are the second-best ones, only slightly worse than those of the Mask R-CNN. It should be noted that since the real-time processing capability is crucial for SLAM problems, the Mask R-CNN network may not be suitable for these applications. Therefore, it can be concluded that the proposed improved instance segmentation network can better meet the accuracy and real-time requirements of a visual SLAM system in a dynamic environment when compared to most of the other instance segmentation methods.</p>
</sec>
<sec id="sec13"><label>4.3</label>
<title>Performance analysis of the ISFM-SLAM</title>
<p>In this section, the absolute trajectory error (ATE) is utilized to evaluate the performance of the proposed SLAM system and the compared ones for each run. The ATE is calculated by subtracting the ground-truth from the estimated value of the camera pose, as shown in <xref ref-type="disp-formula" rid="EQ12">Equation (11)</xref>, so that this metric can provide an intuitive representation of the accuracy of the trajectory.<disp-formula id="EQ12"><label>(11)</label>
<mml:math id="M57">
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi mathvariant="italic">ATE</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mspace width="thickmathspace"/>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msqrt>
<mml:mrow>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2225;</mml:mo>
<mml:msubsup>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mover accent="true">
<mml:mi>T</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>&#x2225;</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:msqrt>
</mml:math>
</disp-formula>where <inline-formula>
<mml:math id="M58">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> is the total number of frames, <inline-formula>
<mml:math id="M59">
<mml:msubsup>
<mml:mover accent="true">
<mml:mi>T</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula> denotes the estimated pose trajectory, <inline-formula>
<mml:math id="M60">
<mml:msubsup>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula> is the ground-truth trajectory, and <inline-formula>
<mml:math id="M61">
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi mathvariant="italic">ATE</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is the absolute trajectory error. After multiple SLAM experiments, the mean, standard deviation (STD), and Root Mean Squared Error (RMSE) of the <inline-formula>
<mml:math id="M62">
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi mathvariant="italic">ATE</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> are adopted to evaluate the performance of SLAM systems from a statistical perspective, where the RMSE is more sensitive to occasional errors than the other metrics and thus can better reflect the robustness of the system.</p>
<p>Firstly, the proposed ISFM-SLAM is quantitatively compared with its baseline, ORB-SLAM2 (<xref ref-type="bibr" rid="ref14">Mur-Artal and Tard&#x00F3;s, 2017</xref>), using four high-dynamic sequences (labeled &#x201C;walking&#x201D;) and four sets of low-dynamic sequences (labeled &#x201C;sitting&#x201D;) from the TUM dataset (<xref ref-type="bibr" rid="ref19">Sturm et al., 2012</xref>). Each experiment was performed three times, and the mean, STD and RMSE results obtained by the two compared SLAM systems are recorded in <xref ref-type="table" rid="tab2">Table 2</xref>.</p>
<table-wrap position="float" id="tab2"><label>Table 2</label>
<caption>
<p>Comparison of the mean, STD and RMSE of ATE obtained by the ORB-SLAM2 and the ISFM-SLAM.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Scene</th>
<th align="center" valign="top" colspan="3">ORB-SLAM2</th>
<th align="center" valign="top" colspan="3">ISFM-SLAM</th>
<th align="center" valign="top" colspan="3">Improving rate (%)</th>
</tr>
<tr>
<th align="center" valign="top">RMSE</th>
<th align="center" valign="top">STD</th>
<th align="center" valign="top">Mean</th>
<th align="center" valign="top">RMSE</th>
<th align="center" valign="top">STD</th>
<th align="center" valign="top">Mean</th>
<th align="center" valign="top">RMSE</th>
<th align="center" valign="top">STD</th>
<th align="center" valign="top">Mean</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">fr3/walking_static</td>
<td align="center" valign="middle">0.3775</td>
<td align="center" valign="middle">0.1657</td>
<td align="center" valign="middle">0.3392</td>
<td align="center" valign="middle">0.0081</td>
<td align="center" valign="middle">0.0034</td>
<td align="center" valign="middle">0.0072</td>
<td align="center" valign="middle">97.9</td>
<td align="center" valign="middle">97.9</td>
<td align="center" valign="middle">97.9</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/walking_xyz</td>
<td align="center" valign="middle">0.6783</td>
<td align="center" valign="middle">0.3761</td>
<td align="center" valign="middle">0.5645</td>
<td align="center" valign="middle">0.0164</td>
<td align="center" valign="middle">0.0089</td>
<td align="center" valign="middle">0.0137</td>
<td align="center" valign="middle">97.6</td>
<td align="center" valign="middle">97.6</td>
<td align="center" valign="middle">97.6</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/walking_rpy</td>
<td align="center" valign="middle">0.7565</td>
<td align="center" valign="middle">0.3360</td>
<td align="center" valign="middle">0.6778</td>
<td align="center" valign="middle">0.0301</td>
<td align="center" valign="middle">0.0161</td>
<td align="center" valign="middle">0.0254</td>
<td align="center" valign="middle">96.0</td>
<td align="center" valign="middle">95.2</td>
<td align="center" valign="middle">96.3</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/walking_half</td>
<td align="center" valign="middle">0.4699</td>
<td align="center" valign="middle">0.2458</td>
<td align="center" valign="middle">0.4004</td>
<td align="center" valign="middle">0.0246</td>
<td align="center" valign="middle">0.0131</td>
<td align="center" valign="middle">0.0208</td>
<td align="center" valign="middle">94.8</td>
<td align="center" valign="middle">94.7</td>
<td align="center" valign="middle">94.8</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_static</td>
<td align="center" valign="middle">0.0094</td>
<td align="center" valign="middle">0.0045</td>
<td align="center" valign="middle">0.0082</td>
<td align="center" valign="middle">0.0064</td>
<td align="center" valign="middle">0.0035</td>
<td align="center" valign="middle">0.0061</td>
<td align="center" valign="middle">25.5</td>
<td align="center" valign="middle">22.2</td>
<td align="center" valign="middle">25.6</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_xyz</td>
<td align="center" valign="middle">0.0089</td>
<td align="center" valign="middle">0.0042</td>
<td align="center" valign="middle">0.0078</td>
<td align="center" valign="middle">0.0103</td>
<td align="center" valign="middle">0.0047</td>
<td align="center" valign="middle">0.0091</td>
<td align="center" valign="middle">&#x2212;15.7</td>
<td align="center" valign="middle">&#x2212;11.9</td>
<td align="center" valign="middle">&#x2212;16.7</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_rpy</td>
<td align="center" valign="middle">0.0197</td>
<td align="center" valign="middle">0.0109</td>
<td align="center" valign="middle">0.0163</td>
<td align="center" valign="middle">0.0162</td>
<td align="center" valign="middle">0.0090</td>
<td align="center" valign="middle">0.0136</td>
<td align="center" valign="middle">17.8</td>
<td align="center" valign="middle">17.4</td>
<td align="center" valign="middle">16.6</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_half</td>
<td align="center" valign="middle">0.0385</td>
<td align="center" valign="middle">0.0194</td>
<td align="center" valign="middle">0.0338</td>
<td align="center" valign="middle">0.0175</td>
<td align="center" valign="middle">0.0089</td>
<td align="center" valign="middle">0.0151</td>
<td align="center" valign="middle">54.5</td>
<td align="center" valign="middle">54.1</td>
<td align="center" valign="middle">55.3</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As illustrated in <xref ref-type="table" rid="tab2">Table 2</xref>, the accuracy and robustness of the proposed SLAM system are significantly better than those of the ORB-SLAM2 in high-dynamic scenes, with an average improvement of 96.7% in mean ATE, 96.9% in STD, and 97.2% in RMSE. However, the proposed algorithm cannot obtain significantly better performance than the ORB-SLAM2 in low-dynamic scenes. Specifically, in the fr3/sitting_xyz scene, inaccurate matching or segmentation occurred in our system, which results in a decrease in accuracy. For fr3/sitting_static and fr3/sitting_rpy, since ORB-SLAM2 has already applied RANSAC to successfully remove some outliers, the advantage of the ISFM-SLAM is not very obvious. Nevertheless, the performance of the ISFM-SLAM is still outstanding in some low-dynamic scenes. For example, in the fr3/sitting_half scene including some moving instances, our proposed algorithm improves by more than 50% compared to ORB-SLAM2.</p>
<p>To further demonstrate the advantage of the ISFM-SLAM over ORB-SLAM2, the camera estimation trajectories obtained by the two competitors were compared with the real trajectories in four scenes including fr3/walking_half, fr3/walking_rpy, fr3/sitting_static, and fr3/sitting_xyz. The results are presented in <xref ref-type="fig" rid="fig4">Figure 4</xref>. From this figure, it is evident that in the high-dynamic environments, the pose trajectory estimated by the ISFM-SLAM is much more closely aligned with the real trajectory than that by the ORB-SLAM2, while in the low-dynamic environments, the two estimated trajectories are both close to the real one.</p>
<fig position="float" id="fig4"><label>Figure 4</label>
<caption>
<p>Comparison of the estimated trajectories by ORB-SLAM2 and ISFM-SLAM with the real trajectory on different sequences.</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g004.tif"/>
</fig>
<p>Finally, the proposed ISFM-SLAM is compared with some other dynamic SLAM systems, including Dyna-SLAM (<xref ref-type="bibr" rid="ref2">Bescos et al., 2018</xref>), DS-SLAM (<xref ref-type="bibr" rid="ref27">Yu et al., 2018</xref>), MR-SLAM (<xref ref-type="bibr" rid="ref22">Sun et al., 2017</xref>), DRSO-SLAM (<xref ref-type="bibr" rid="ref26">Yu et al., 2021</xref>), and OVD-SLAM (<xref ref-type="bibr" rid="ref10">He et al., 2023</xref>) to verify its effectiveness. Among them, Dyna-SLAM, DS-SLAM, and OVD-SLAM are designed based on semantic segmentation approaches, MR-SLAM is implemented based on the optical flow method, and DRSO-SLAM is based on both the semantic segmentation and optical flow schemes. The RMSE results of the ATE obtained by these compared SLAM systems are illustrated in <xref ref-type="table" rid="tab3">Table 3</xref>, with the best results on each scene highlighted in bold. Note that except for the experimental results of the ISFM-SLAM, the results of the other compared algorithms are all from the corresponding references, and the &#x201C;None&#x201D; in <xref ref-type="table" rid="tab3">Table 3</xref> indicates that the corresponding sample was not tested. As shown in <xref ref-type="table" rid="tab3">Table 3</xref>, the proposed ISFM-SLAM is comparable to Dyna-SLAM (<xref ref-type="bibr" rid="ref2">Bescos et al., 2018</xref>) in terms of pose estimation accuracy in high-dynamic scenes, but is superior to the other competitors. In low-dynamic scenes, the ISFM-SLAM still has a significant advantage over the other algorithms, for it achieves the best result in every scene except fr3/sitting_half. Therefore, it can be summarized that our proposed method can effectively address the issue of static assumption failure in visual SLAM in dynamic scenes, thereby significantly improving its positioning accuracy and robustness.</p>
<table-wrap position="float" id="tab3"><label>Table 3</label>
<caption>
<p>Comparison of pose estimation accuracy obtained by different SLAM systems in 8 different scenes.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Scene</th>
<th align="center" valign="top">Dyna-SLAM</th>
<th align="center" valign="top">DS-SLAM</th>
<th align="center" valign="top">MR-SLAM</th>
<th align="center" valign="top">DRSO-SLAM</th>
<th align="center" valign="top">OVD-SLAM</th>
<th align="center" valign="top">ISFM-SLAM</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">fr3/walking_static</td>
<td align="center" valign="middle">
<bold>0.0060</bold>
</td>
<td align="center" valign="middle">0.0081</td>
<td align="center" valign="middle">0.0656</td>
<td align="center" valign="middle">0.01112</td>
<td align="center" valign="top">0.0087</td>
<td align="center" valign="middle">0.0081</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/walking_xyz</td>
<td align="center" valign="middle">
<bold>0.0150</bold>
</td>
<td align="center" valign="middle">0.0247</td>
<td align="center" valign="middle">0.0932</td>
<td align="center" valign="middle">0.01576</td>
<td align="center" valign="top">0.1091</td>
<td align="center" valign="middle">0.0164</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/walking_rpy</td>
<td align="center" valign="middle">0.0350</td>
<td align="center" valign="middle">0.4442</td>
<td align="center" valign="middle">0.1333</td>
<td align="center" valign="middle">0.07515</td>
<td align="center" valign="top">0.0317</td>
<td align="center" valign="middle">
<bold>0.0301</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/walking_half</td>
<td align="center" valign="middle">0.0250</td>
<td align="center" valign="middle">0.0303</td>
<td align="center" valign="middle">0.1252</td>
<td align="center" valign="middle">0.02684</td>
<td align="center" valign="top">0.3512</td>
<td align="center" valign="middle">
<bold>0.0246</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_static</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">
<bold>0.0064</bold>
</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">
<bold>0.0064</bold>
</td>
<td align="center" valign="top">0.0125</td>
<td align="center" valign="middle">
<bold>0.0064</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_xyz</td>
<td align="center" valign="middle">0.0150</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">0.0482</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="top">0.0200</td>
<td align="center" valign="middle">
<bold>0.0103</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_rpy</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="top">0.0929</td>
<td align="center" valign="middle">
<bold>0.0162</bold>
</td>
</tr>
<tr>
<td align="left" valign="middle">fr3/sitting_half</td>
<td align="center" valign="middle">0.0170</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="middle">0.0470</td>
<td align="center" valign="middle">None</td>
<td align="center" valign="top">
<bold>0.0147</bold>
</td>
<td align="center" valign="middle">0.0175</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>In each scene, the best RMSE results of the ATE obtained by all compared SLAM systems are bolded.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec14"><label>4.4</label>
<title>Ablation studies</title>
<sec id="sec15"><label>4.4.1</label>
<title>Effectiveness of the modified components in the improved instance segmentation network</title>
<p>In the improved instance segmentation network proposed in this paper, we made two primary modifications to YOLACT: replacing the backbone with Res2Net-50 and using CIoU_Loss as the loss function. To thoroughly verify the effectiveness of these improvements, the ablation experiments in this subsection not only compare Res2Net-50 and CIoU_Loss with the original backbone and loss function used in YOLACT, but also with several other backbones and loss functions. The dataset used is still the COCO Minitrain dataset, and the experimental settings are also consistent with those described in Section 4.2. The results of the ablation experiments for the backbone and loss function are presented in <xref ref-type="table" rid="tab4">Tables 4</xref>, <xref ref-type="table" rid="tab5">5</xref>, respectively.</p>
<table-wrap position="float" id="tab4"><label>Table 4</label>
<caption>
<p>Ablation study of different backbones of instance segmentation network.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Backbone</th>
<th align="center" valign="top">FPS</th>
<th align="center" valign="top">mAP</th>
<th align="center" valign="top">AP50</th>
<th align="center" valign="top">AP75</th>
<th align="center" valign="top">APS</th>
<th align="center" valign="top">APM</th>
<th align="center" valign="top">APL</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">ResNeXt-50</td>
<td align="center" valign="middle">34.2</td>
<td align="center" valign="middle">28.20</td>
<td align="center" valign="middle">49.61</td>
<td align="center" valign="middle">30.15</td>
<td align="center" valign="middle">11.32</td>
<td align="center" valign="middle">33.94</td>
<td align="center" valign="middle">42.89</td>
</tr>
<tr>
<td align="left" valign="middle">ResNeSt-50</td>
<td align="center" valign="middle">35.3</td>
<td align="center" valign="middle">24.22</td>
<td align="center" valign="middle">44.34</td>
<td align="center" valign="middle">31.18</td>
<td align="center" valign="middle">12.66</td>
<td align="center" valign="middle">27.05</td>
<td align="center" valign="middle">46.39</td>
</tr>
<tr>
<td align="left" valign="middle">ResNet-101</td>
<td align="center" valign="middle">42.0</td>
<td align="center" valign="middle">29.91</td>
<td align="center" valign="middle">48.62</td>
<td align="center" valign="middle">31.32</td>
<td align="center" valign="middle">10.06</td>
<td align="center" valign="middle">31.44</td>
<td align="center" valign="middle">47.85</td>
</tr>
<tr>
<td align="left" valign="middle">Res2Net-50 (Ours)</td>
<td align="center" valign="middle">42.2</td>
<td align="center" valign="middle">33.61</td>
<td align="center" valign="middle">56.24</td>
<td align="center" valign="middle">36.26</td>
<td align="center" valign="middle">16.47</td>
<td align="center" valign="middle">36.21</td>
<td align="center" valign="middle">49.82</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab5"><label>Table 5</label>
<caption>
<p>Ablation study of different loss functions of instance segmentation network.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Loss function</th>
<th align="center" valign="top">FPS</th>
<th align="center" valign="top">mAP</th>
<th align="center" valign="top">AP50</th>
<th align="center" valign="top">AP75</th>
<th align="center" valign="top">APS</th>
<th align="center" valign="top">APM</th>
<th align="center" valign="top">APL</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Dice loss</td>
<td align="center" valign="middle">33.10</td>
<td align="center" valign="middle">28.90</td>
<td align="center" valign="middle">48.01</td>
<td align="center" valign="middle">31.52</td>
<td align="center" valign="middle">9.46</td>
<td align="center" valign="middle">30.87</td>
<td align="center" valign="middle">47.14</td>
</tr>
<tr>
<td align="left" valign="middle">EIOU loss</td>
<td align="center" valign="middle">39.15</td>
<td align="center" valign="middle">29.83</td>
<td align="center" valign="middle">48.70</td>
<td align="center" valign="middle">31.06</td>
<td align="center" valign="middle">10.01</td>
<td align="center" valign="middle">31.29</td>
<td align="center" valign="middle">47.84</td>
</tr>
<tr>
<td align="left" valign="middle">Smooth L1 loss</td>
<td align="center" valign="middle">45.50</td>
<td align="center" valign="middle">28.65</td>
<td align="center" valign="middle">48.10</td>
<td align="center" valign="middle">31.62</td>
<td align="center" valign="middle">10.49</td>
<td align="center" valign="middle">30.77</td>
<td align="center" valign="middle">47.14</td>
</tr>
<tr>
<td align="left" valign="middle">CIoU_Loss (Ours)</td>
<td align="center" valign="middle">42.20</td>
<td align="center" valign="middle">33.61</td>
<td align="center" valign="middle">56.24</td>
<td align="center" valign="middle">36.26</td>
<td align="center" valign="middle">16.47</td>
<td align="center" valign="middle">36.21</td>
<td align="center" valign="middle">49.82</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>First, we replaced the backbone of the improved instance segmentation network with other ResNet-based architectures, including ResNet-101 (<xref ref-type="bibr" rid="ref3">Bolya et al., 2019</xref>), ResNeXt-50 (<xref ref-type="bibr" rid="ref24">Xie et al., 2017</xref>), and ResNeSt-50 (<xref ref-type="bibr" rid="ref28">Zhang Y. H. et al., 2022</xref>; <xref ref-type="bibr" rid="ref29">Zhang H. et al., 2022</xref>). Among these, ResNet-101 is the original backbone used by YOLACT. As shown in <xref ref-type="table" rid="tab4">Table 4</xref>, the mAP of both ResNeXt-50 and ResNeSt-50 did not surpass that of Res2Net-50, or even ResNet-101. This is primarily because, although ResNeXt introduces greater parallel cardinality and ResNeSt employs split convolution strategies to enhance feature learning, they may not be as effective in multi-scale feature representation as Res2Net. This ultimately led to their poorer performance in instance segmentation tasks. For ResNet-101, its deeper architecture not only results in a slightly lower FPS compared to Res2Net-50, but also leads to a reduction in mAP relative to Res2Net-50 primarily due to a certain degree of overfitting. Consequently, the experimental results presented in <xref ref-type="table" rid="tab4">Table 4</xref> demonstrate that employing Res2Net-50 as the backbone enables the improved instance segmentation network to achieve superior performance in terms of both segmentation accuracy and computational efficiency.</p>
<p>Second, we replaced the loss function of the improved instance segmentation network with Dice Loss (<xref ref-type="bibr" rid="ref12">Li et al., 2019</xref>), EIOU Loss (<xref ref-type="bibr" rid="ref28">Zhang Y. H. et al., 2022</xref>; <xref ref-type="bibr" rid="ref29">Zhang H. et al., 2022</xref>), and Smooth L1 Loss (<xref ref-type="bibr" rid="ref3">Bolya et al., 2019</xref>), where the last one is the loss function originally used in YOLACT. According to <xref ref-type="table" rid="tab5">Table 5</xref>, the models employing Dice Loss and EIOU Loss show negligible improvements in segmentation accuracy compared to the model using Smooth L1 Loss. In contrast, the improved instance segmentation network using CIoU_Loss demonstrates a significant enhancement in AP-related metrics, albeit with a slight reduction in computational efficiency. The primary reason for the improvement is that CIoU_Loss considers not only the IoU overlap area but also the distance between center points and the aspect ratio. This enables CIoU_Loss to better handle challenging localization scenarios, resulting in substantially improved regression performance for the predicted bounding boxes compared to Smooth L1.</p>
</sec>
<sec id="sec16"><label>4.4.2</label>
<title>Effectiveness of the PnP-based motion consistency detection method</title>
<p>To verify the effectiveness of the proposed PnP-based motion consistency detection method, in this section, we employed this method combined with the proposed instance segmentation network on one static sample and two samples containing people in motion. The corresponding results of the feature point extraction of these samples are illustrated in <xref ref-type="fig" rid="fig5">Figures 5B</xref>&#x2013;<xref ref-type="fig" rid="fig5">D</xref>, respectively, while <xref ref-type="fig" rid="fig5">Figure 5A</xref> is the feature point extraction result for a static sample when only the instance segmentation network is employed.</p>
<fig position="float" id="fig5"><label>Figure 5</label>
<caption>
<p>The feature point extraction result of the PnP-based motion consistency detection method combined with the improved instance segmentation network. <bold>(A)</bold> Result only by the improved YOLACT for a static sample. <bold>(B)</bold> Result by the motion consistency detection combined with improved YOLACT for a static sample. <bold>(C,D)</bold> Results by the motion consistency detection combined with improved YOLACT for samples containing people in motion. The raw images were obtained from the TUM Dataset, and this dataset is licensed under a Creative Commons 4.0 Attribution License (<ext-link xlink:href="https://cvg.cit.tum.de/data/datasets/rgbd-dataset" ext-link-type="uri">https://cvg.cit.tum.de/data/datasets/rgbd-dataset</ext-link>).</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g005.tif"/>
</fig>
<p>According to <xref ref-type="fig" rid="fig5">Figure 5A</xref>, it is evident that the feature points on the person in a static state are all removed and do not participate in the pose calculation. When the motion consistency detection algorithm and instance segmentation network are integrated and implemented, <xref ref-type="fig" rid="fig5">Figure 5B</xref> demonstrates that feature points on a stationary person can be successfully recovered. When the person in the sample is in a state of motion, as shown in <xref ref-type="fig" rid="fig5">Figures 5C</xref>,<xref ref-type="fig" rid="fig5">D</xref>, the feature points on the person can be removed (the blue points in <xref ref-type="fig" rid="fig5">Figures 5C</xref>,<xref ref-type="fig" rid="fig5">D</xref>). Therefore, it can be summarized that the motion consistency detection algorithm based on PnP can effectively remove the dynamic ORB feature points, and thus improve the accuracy of the camera pose.</p>
</sec>
<sec id="sec17"><label>4.4.3</label>
<title>Effectiveness of the BEBLID feature matching</title>
<p>To verify the effect of the BEBLID descriptor on improving the feature matching accuracy of the proposed system, we compared the feature matching rate and computation time of the adopted BEBLID descriptor and the BRIEF descriptor in this section. These experiments were conducted on the adjacent image frames in two sets of low-dynamic sequences including fr3/sitting_static and fr3/sitting_rpy, as well as in two sets of high-dynamic sequences including fr3/walking_half and fr3/walking_xyz, of the TUM dataset. A total of 500 feature points was extracted for each image frame, and then homography matrices are employed to determine the number of matching points based on the Hamming distance. The RANSAC threshold was set to 3. The feature matching rate is defined as the percentage of the number of matching points.</p>
<p><xref ref-type="table" rid="tab6">Table 6</xref> presents a comparison of the matching rates and the computational time by the matching algorithms based on the BEBLID and BRIEF descriptors across the selected four sequences. The average matching rates of the algorithm based on BEBLID descriptors are observed to be 6.6 and 6.9% higher than those of the algorithm based on BRIEF descriptors in the low-dynamic and high-dynamic sequences, respectively. Furthermore, BEBLID employs parallel computing to calculate each feature point descriptor, resulting in an average increase in calculation efficiency of 14.1 and 15.1% in the low-dynamic and high-dynamic sequences, respectively. Moreover, the specific feature matching results corresponding to each comparison in <xref ref-type="table" rid="tab6">Table 6</xref> are illustrated in <xref ref-type="fig" rid="fig6">Figure 6</xref> to further demonstrate the effectiveness of the adopted descriptor. From <xref ref-type="fig" rid="fig6">Figure 6</xref>, it can be seen that on the same image frame, the matching algorithm based on the BEBLID descriptor has more correctly matched feature points than that based on the BRIEF descriptor, especially on some instances in the corners of the image.</p>
<table-wrap position="float" id="tab6"><label>Table 6</label>
<caption>
<p>Comparison of matching rates and computational time by the matching algorithms based on the BRIEF and BEBLID descriptors on four sequences.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Dataset</th>
<th align="center" valign="top">Descriptor</th>
<th align="center" valign="top">Matching number</th>
<th align="center" valign="top">Point number</th>
<th align="center" valign="top">Matching rate</th>
<th align="center" valign="top">Time /s</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle" rowspan="2">Fr3/sitting_static</td>
<td align="center" valign="middle">BRIEF</td>
<td align="center" valign="middle">389</td>
<td align="center" valign="middle">348</td>
<td align="center" valign="middle">89.4%</td>
<td align="center" valign="middle">0.0373</td>
</tr>
<tr>
<td align="center" valign="middle">BEBLID</td>
<td align="center" valign="middle">386</td>
<td align="center" valign="middle">360</td>
<td align="center" valign="middle">93.2%</td>
<td align="center" valign="middle">0.0328</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="2">Fr3/sitting_rpy</td>
<td align="center" valign="middle">BRIEF</td>
<td align="center" valign="middle">345</td>
<td align="center" valign="middle">254</td>
<td align="center" valign="middle">73.6%</td>
<td align="center" valign="middle">0.0341</td>
</tr>
<tr>
<td align="center" valign="middle">BEBLID</td>
<td align="center" valign="middle">342</td>
<td align="center" valign="middle">284</td>
<td align="center" valign="middle">83.1%</td>
<td align="center" valign="middle">0.0286</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="2">Fr3/walking_half</td>
<td align="center" valign="middle">BRIEF</td>
<td align="center" valign="middle">348</td>
<td align="center" valign="middle">235</td>
<td align="center" valign="middle">67.5%</td>
<td align="center" valign="middle">0.0346</td>
</tr>
<tr>
<td align="center" valign="middle">BEBLID</td>
<td align="center" valign="middle">337</td>
<td align="center" valign="middle">267</td>
<td align="center" valign="middle">79.2%</td>
<td align="center" valign="middle">0.0291</td>
</tr>
<tr>
<td align="left" valign="middle" rowspan="2">Fr3/walking_xyz</td>
<td align="center" valign="middle">BRIEF</td>
<td align="center" valign="middle">338</td>
<td align="center" valign="middle">256</td>
<td align="center" valign="middle">75.7%</td>
<td align="center" valign="middle">0.0332</td>
</tr>
<tr>
<td align="center" valign="middle">BEBLID</td>
<td align="center" valign="middle">338</td>
<td align="center" valign="middle">263</td>
<td align="center" valign="middle">77.8%</td>
<td align="center" valign="middle">0.0284</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig6"><label>Figure 6</label>
<caption>
<p>Comparison of specific matching results by the matching algorithms based on the BRIEF and BEBLID descriptors. The raw images were obtained from the TUM Dataset, and this dataset is licensed under a Creative Commons 4.0 Attribution License (<ext-link xlink:href="https://cvg.cit.tum.de/data/datasets/rgbd-dataset" ext-link-type="uri">https://cvg.cit.tum.de/data/datasets/rgbd-dataset</ext-link>).</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g006.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec id="sec18"><label>5</label>
<title>Real-world experiment and discussions</title>
<sec id="sec19"><label>5.1</label>
<title>Experimental setup</title>
<p>To evaluate the effectiveness of ISFM-SLAM in solving real-world tasks and its advantages over ORB-SLAM2, we deployed both systems on a three-wheeled mobile robot for real-world experiments. As depicted in <xref ref-type="fig" rid="fig7">Figure 7</xref>, the mobile robot is equipped with an Astrapro RGB-D camera, capturing images at a frame rate of 30 FPS with a resolution of 640 <inline-formula>
<mml:math id="M63">
<mml:mo>&#x00D7;</mml:mo>
</mml:math>
</inline-formula> 480. ISFM-SLAM and ORB-SLAM2 were implemented on the NVIDIA Jetson Orin Nano Developer Kit of the robot, and both SLAM systems were initiated through the Robot Operating System (ROS). The parameters of the SGD optimizer for the instance segmentation network were adjusted for the experiment, with an initial momentum set to 0.9, a learning rate of 0.0001, and a weight decay coefficient of 0.00015.</p>
<fig position="float" id="fig7"><label>Figure 7</label>
<caption>
<p>The three-wheeled mobile robot for the real-world experiment.</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g007.tif"/>
</fig>
</sec>
<sec sec-type="results" id="sec20"><label>5.2</label>
<title>Results and discussions</title>
<p>To better emphasize the impact of our proposed improvements, we conducted an experiment where the robot remained stationary to capture moving people, testing the sensitivity of ISFM-SLAM and ORB-SLAM2 to dynamic objects, rather than merely scanning a static laboratory scene. As we did not have the necessary equipment to record ground-truth trajectories, the analysis focuses on how dynamic objects influence our SLAM system compared to its competitor. The experimental results are presented in <xref ref-type="fig" rid="fig8">Figure 8</xref>.</p>
<fig position="float" id="fig8"><label>Figure 8</label>
<caption>
<p>Experimental results in a real environment. <bold>(A,B)</bold> Final preserved feature points by ORB-SLAM2; <bold>(C,D)</bold> Final preserved feature points by ISFM-SLAM.</p>
</caption>
<graphic xlink:href="fnbot-18-1473937-g008.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig8">Figures 8A</xref>,<xref ref-type="fig" rid="fig8">B</xref> show that after running its tracking thread in a real laboratory scene, ORB-SLAM2 detected numerous feature points. However, ORB-SLAM2 fails to effectively exclude the influence of dynamic objects, such as the moving person in the images. In contrast, <xref ref-type="fig" rid="fig8">Figures 8C</xref>,<xref ref-type="fig" rid="fig8">D</xref> display the final retained feature points of ISFM-SLAM, clearly showing the absence of feature points on the moving person. This demonstrates that the PnP-based motion consistency detection can accurately distinguish between the motion and stationary states of objects, effectively removing feature points associated with moving objects. Simultaneously, our improved instance segmentation network accurately segments the moving person, preventing ISFM-SLAM from mistakenly removing feature points outside the segmentation boundary. Moreover, the retained feature points can be efficiently matched using the BEBLID feature matching approach, enabling ISFM-SLAM to achieve superior feature matching results.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec21"><label>6</label>
<title>Conclusion</title>
<p>This paper proposed a visual SLAM system named ISFM-SLAM for dynamic scenes based on the ORB-SLAM2 framework. To enhance the multi-sensory capabilities and prediction accuracy of the instance segmentation network, an improved YOLACT model was introduced into the ISFM-SLAM system, with the Res2Net model as its backbone and the CIoU_Loss as its loss function. Then, a PnP-based motion consistency detection approach was proposed and combined with the improved instance segmentation network, enabling the ISFM-SLAM system to effectively filter dynamic feature points. Moreover, the original BRIEF descriptor in ORB-SLAM2 was replaced by the BEBLID descriptor to achieve efficient matching of ORB feature points. The simulation results demonstrate the effectiveness of the aforementioned improvements and the advantages of ISFM-SLAM over ORB-SLAM2 and other dynamic SLAM systems. Furthermore, real-world experiments conducted on a mobile robot confirm that ISFM-SLAM can effectively mitigate the impact of dynamic objects during mapping, proving its feasibility in practical applications. In the future, we will develop a lightweight version of the instance segmentation network proposed in this paper to improve its real-time performance, and modify the BEBLID descriptor so that the SLAM system can be implemented in more complex dynamic scenes.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec22">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: The TUM Dataset (Download Link): <ext-link xlink:href="https://cvg.cit.tum.de/rgbd/dataset/" ext-link-type="uri">https://cvg.cit.tum.de/rgbd/dataset/</ext-link>. The COCO Minitrain dataset (Download Link): <ext-link xlink:href="http://images.cocodataset.org/zips/train2017.zip" ext-link-type="uri">http://images.cocodataset.org/zips/train2017.zip</ext-link> and <ext-link xlink:href="http://images.cocodataset.org/zips/val2017.zip" ext-link-type="uri">http://images.cocodataset.org/zips/val2017.zip</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="sec23">
<title>Author contributions</title>
<p>CL: Conceptualization, Funding acquisition, Investigation, Methodology, Project administration, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. YH: Conceptualization, Data curation, Investigation, Methodology, Software, Validation, Writing &#x2013; original draft. JL: Data curation, Investigation, Software, Validation, Writing &#x2013; original draft. JJ: Software, Validation, Supervision, Writing &#x2013; review &#x0026; editing. JS: Funding acquisition, Investigation, Methodology, Supervision, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec24">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported in part by the Natural Science Foundation of Jiangsu Province (BK20221068), and in part by the National Natural Science Foundation of China (Nos. 62272202, 61672263).</p>
</sec>
<sec sec-type="COI-statement" id="sec25">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="sec26">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Badrinarayanan</surname> <given-names>V.</given-names></name> <name><surname>Kendall</surname> <given-names>A.</given-names></name> <name><surname>Cipolla</surname> <given-names>R.</given-names></name></person-group> (<year>2015</year>). <article-title>SegNet: a deep convolutional encoder-decoder architecture for image segmentation</article-title>. <source>Arxiv</source>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2016.2644615</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bescos</surname> <given-names>B.</given-names></name> <name><surname>F&#x00E1;cil</surname> <given-names>J. M.</given-names></name> <name><surname>Civera</surname> <given-names>J.</given-names></name> <name><surname>Neira</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>DynaSLAM: tracking, mapping, and inpainting in dynamic scenes</article-title>. <source>IEEE Robot. Automat. Lett.</source> <volume>3</volume>, <fpage>4076</fpage>&#x2013;<lpage>4083</lpage>. doi: <pub-id pub-id-type="doi">10.1109/LRA.2018.2860039</pub-id></citation></ref>
<ref id="ref3"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Bolya</surname> <given-names>D.</given-names></name> <name><surname>Zhou</surname> <given-names>C.</given-names></name> <name><surname>Xiao</surname> <given-names>F.</given-names></name> <name><surname>Lee</surname> <given-names>Y. J.</given-names></name></person-group> (<year>2019</year>). <article-title>Yolact: real-time instance segmentation</article-title>. In <conf-name>Proceedings of the IEEE/CVF International conference on computer vision</conf-name> (pp. <fpage>9157</fpage>&#x2013;<lpage>9166</lpage>).</citation></ref>
<ref id="ref4"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>) <article-title>A robust SLAM for highly dynamic environments</article-title>. In <conf-name>2022 IEEE 17th conference on industrial electronics and applications (ICIEA)</conf-name> (pp. <fpage>780</fpage>&#x2013;<lpage>785</lpage>).</citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cui</surname> <given-names>L.</given-names></name> <name><surname>Ma</surname> <given-names>C.</given-names></name></person-group> (<year>2019</year>). <article-title>SOF-SLAM: a semantic visual SLAM for dynamic environments</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>166528</fpage>&#x2013;<lpage>166539</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2952161</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Davison</surname> <given-names>A. J.</given-names></name> <name><surname>Reid</surname> <given-names>I. D.</given-names></name> <name><surname>Molton</surname> <given-names>N. D.</given-names></name> <name><surname>Stasse</surname> <given-names>O.</given-names></name></person-group> (<year>2007</year>). <article-title>MonoSLAM: Real-time single camera SLAM</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>29</volume>, <fpage>1052</fpage>&#x2013;<lpage>1067</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2007.1049</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Engel</surname> <given-names>J.</given-names></name> <name><surname>Sch&#x00F6;ps</surname> <given-names>T.</given-names></name> <name><surname>Cremers</surname> <given-names>D.</given-names></name></person-group> (<year>2014</year>). &#x201C;<article-title>LSD-SLAM: Large-scale direct monocular SLAM</article-title>&#x201D; in <source>Computer Vision &#x2013; ECCV 2014. ECCV 2014. Lecture Notes in Computer Science</source>, Eds. <person-group person-group-type="editor"><name><surname>Fleet</surname> <given-names>D.</given-names></name> <name><surname>Pajdla</surname> <given-names>T.</given-names></name> <name><surname>Schiele</surname> <given-names>B.</given-names></name> <name><surname>Tuytelaars</surname> <given-names>T.</given-names></name></person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>834</fpage>&#x2013;<lpage>849</lpage>.</citation></ref>
<ref id="ref8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>S. H.</given-names></name> <name><surname>Cheng</surname> <given-names>M. M.</given-names></name> <name><surname>Zhao</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X. Y.</given-names></name> <name><surname>Yang</surname> <given-names>M. H.</given-names></name> <name><surname>Torr</surname> <given-names>P.</given-names></name></person-group> (<year>2019</year>). <article-title>Res2net: a new multi-scale backbone architecture</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>43</volume>, <fpage>652</fpage>&#x2013;<lpage>662</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2019.2938758</pub-id></citation></ref>
<ref id="ref9"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Gkioxari</surname> <given-names>G.</given-names></name> <name><surname>Doll&#x00E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>Mask r-cnn</article-title>. In <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name> (pp. <fpage>2961</fpage>&#x2013;<lpage>2969</lpage>).</citation></ref>
<ref id="ref10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>OVD-SLAM: an online visual SLAM for dynamic environments</article-title>. <source>IEEE Sensors J.</source> <volume>23</volume>, <fpage>13210</fpage>&#x2013;<lpage>13219</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JSEN.2023.3270534</pub-id></citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lepetit</surname> <given-names>V.</given-names></name> <name><surname>Moreno-Noguer</surname> <given-names>F.</given-names></name> <name><surname>Fua</surname> <given-names>P.</given-names></name></person-group> (<year>2009</year>). <article-title>EPnP: an accurate O(n) solution to the PnP problem</article-title>. <source>Int. J. Comput. Vis.</source> <volume>81</volume>, <fpage>155</fpage>&#x2013;<lpage>166</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-008-0152-6</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Sun</surname> <given-names>X.</given-names></name> <name><surname>Meng</surname> <given-names>Y.</given-names></name> <name><surname>Liang</surname> <given-names>J.</given-names></name> <name><surname>Wu</surname> <given-names>F.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>Dice loss for data-imbalanced NLP tasks</article-title>. <source>Arxiv</source>. <fpage>1911.02855</fpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1911.02855</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>T. Y.</given-names></name> <name><surname>Maire</surname> <given-names>M.</given-names></name> <name><surname>Belongie</surname> <given-names>S.</given-names></name> <name><surname>Hays</surname> <given-names>J.</given-names></name> <name><surname>Perona</surname> <given-names>P.</given-names></name> <name><surname>Ramanan</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Microsoft coco: common objects in context</article-title>. In <conf-name>Computer vision&#x2013;ECCV 2014: 13th European conference</conf-name>, <publisher-loc>Zurich, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (pp. <fpage>740</fpage>&#x2013;<lpage>755</lpage>).</citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mur-Artal</surname> <given-names>R.</given-names></name> <name><surname>Tard&#x00F3;s</surname> <given-names>J. D.</given-names></name></person-group> (<year>2017</year>). <article-title>Orb-slam2: an open-source slam system for monocular, stereo, and rgb-d cameras</article-title>. <source>IEEE Trans. Robot.</source> <volume>33</volume>, <fpage>1255</fpage>&#x2013;<lpage>1262</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TRO.2017.2705103</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Pardoe</surname> <given-names>D.</given-names></name> <name><surname>Stone</surname> <given-names>P.</given-names></name></person-group> (<year>2010</year>). <article-title>Boosting for regression transfer</article-title>. In <conf-name>Proceedings of the 27th international conference on machine learning</conf-name>, <publisher-loc>Haifa, Israel</publisher-loc> (pp. <fpage>863</fpage>&#x2013;<lpage>870</lpage>).</citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qiu</surname> <given-names>S.</given-names></name> <name><surname>Wen</surname> <given-names>G.</given-names></name> <name><surname>Deng</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Fan</surname> <given-names>Y.</given-names></name></person-group> (<year>2018</year>). <article-title>Accurate non-maximum suppression for object detection in high-resolution remote sensing images</article-title>. <source>Remote Sens. Lett.</source> <volume>9</volume>, <fpage>237</fpage>&#x2013;<lpage>246</lpage>. doi: <pub-id pub-id-type="doi">10.1080/2150704X.2017.1415473</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Riaz</surname> <given-names>H. U. M.</given-names></name> <name><surname>Benbarka</surname> <given-names>N.</given-names></name> <name><surname>Zell</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>) <article-title>Fouriernet: compact mask representation for instance segmentation using differentiable shape decoders</article-title>. In <conf-name>2020 25th international conference on pattern recognition (ICPR)</conf-name> (pp. <fpage>7833</fpage>&#x2013;<lpage>7840</lpage>). <publisher-name>IEEE</publisher-name>.</citation></ref>
<ref id="ref18"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Samet</surname> <given-names>N.</given-names></name> <name><surname>Hicsonmez</surname> <given-names>S.</given-names></name> <name><surname>Akbas</surname> <given-names>E.</given-names></name></person-group> (<year>2020</year>). <article-title>Houghnet: integrating near and long-range evidence for bottom-up object detection</article-title>. In <conf-name>Computer vision&#x2013;ECCV 2020: 16th European conference, Glasgow, UK, august 23&#x2013;28, 2020, proceedings, part XXV 16</conf-name> (pp. <fpage>406</fpage>&#x2013;<lpage>423</lpage>). <publisher-name>Springer International Publishing</publisher-name>.</citation></ref>
<ref id="ref19"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Sturm</surname> <given-names>J.</given-names></name> <name><surname>Engelhard</surname> <given-names>N.</given-names></name> <name><surname>Endres</surname> <given-names>F.</given-names></name> <name><surname>Burgard</surname> <given-names>W.</given-names></name> <name><surname>Cremers</surname> <given-names>D.</given-names></name></person-group> (<year>2012</year>) <article-title>A benchmark for the evaluation of RGB-D SLAM systems</article-title>. In <conf-name>2012 IEEE/RSJ international conference on intelligent robots and systems</conf-name> (pp. <fpage>573</fpage>&#x2013;<lpage>580</lpage>). <publisher-name>IEEE</publisher-name>.</citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>P.</given-names></name> <name><surname>Luo</surname> <given-names>S.</given-names></name> <name><surname>Huang</surname> <given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>Real-time dynamic SLAM algorithm based on deep learning</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>87754</fpage>&#x2013;<lpage>87766</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3199350</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Su&#x00E1;rez</surname> <given-names>I.</given-names></name> <name><surname>Sfeir</surname> <given-names>G.</given-names></name> <name><surname>Buenaposada</surname> <given-names>J. M.</given-names></name> <name><surname>Baumela</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>BEBLID: boosted efficient binary local image descriptor</article-title>. <source>Pattern Recogn. Lett.</source> <volume>133</volume>, <fpage>366</fpage>&#x2013;<lpage>372</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patrec.2020.04.005</pub-id></citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Meng</surname> <given-names>M. Q. H.</given-names></name></person-group> (<year>2017</year>). <article-title>Improving RGB-D SLAM in dynamic environments: a motion removal approach</article-title>. <source>Robot. Auton. Syst.</source> <volume>89</volume>, <fpage>110</fpage>&#x2013;<lpage>122</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.robot.2016.11.012</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Taketomi</surname> <given-names>T.</given-names></name> <name><surname>Uchiyama</surname> <given-names>H.</given-names></name> <name><surname>Ikeda</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>Visual SLAM algorithms: a survey from 2010 to 2016</article-title>. <source>IPSJ Trans. Comput. Vis. Appl.</source> <volume>9</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s41074-017-0027-2</pub-id></citation></ref>
<ref id="ref24"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>S.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Doll&#x00E1;r</surname> <given-names>P.</given-names></name> <name><surname>Tu</surname> <given-names>Z.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name></person-group> (<year>2017</year>). <article-title>Aggregated residual transformations for deep neural networks</article-title>. In <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name> (pp. <fpage>1492</fpage>&#x2013;<lpage>1500</lpage>).</citation></ref>
<ref id="ref25"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>E.</given-names></name> <name><surname>Sun</surname> <given-names>P.</given-names></name> <name><surname>Song</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Liang</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Polarmask: single shot instance segmentation with polar representation</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name> (pp. <fpage>12193</fpage>&#x2013;<lpage>12202</lpage>).</citation></ref>
<ref id="ref26"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>N.</given-names></name> <name><surname>Gan</surname> <given-names>M.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>K.</given-names></name></person-group> (<year>2021</year>) <article-title>Drso-slam: a dynamic rgb-d slam algorithm for indoor dynamic scenes</article-title>. In <conf-name>2021 33rd Chinese control and decision conference (CCDC)</conf-name> (pp. <fpage>1052</fpage>&#x2013;<lpage>1058</lpage>). <publisher-name>IEEE</publisher-name>.</citation></ref>
<ref id="ref27"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>X. J.</given-names></name> <name><surname>Xie</surname> <given-names>F.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Wei</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2018</year>) <article-title>DS-SLAM: a semantic visual SLAM towards dynamic environments</article-title>. In <conf-name>2018 IEEE/RSJ international conference on intelligent robots and systems (IROS)</conf-name> (pp. <fpage>1168</fpage>&#x2013;<lpage>1174</lpage>). <publisher-name>IEEE</publisher-name>.</citation></ref>
<ref id="ref28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y. F.</given-names></name> <name><surname>Ren</surname> <given-names>W.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Jia</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Tan</surname> <given-names>T.</given-names></name></person-group> (<year>2022</year>). <article-title>Focal and efficient IOU loss for accurate bounding box regression</article-title>. <source>Neurocomputing</source> <volume>506</volume>, <fpage>146</fpage>&#x2013;<lpage>157</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2022.07.042</pub-id></citation></ref>
<ref id="ref29"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Wu</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>ResNeSt: Split-attention networks</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name> (pp. <fpage>2736</fpage>&#x2013;<lpage>2746</lpage>).</citation></ref>
<ref id="ref30"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Ye</surname> <given-names>R.</given-names></name> <name><surname>Ren</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Distance-IoU loss: faster and better learning for bounding box regression</article-title>. In <conf-name>Proceedings of the AAAI conference on artificial intelligence</conf-name> (pp. <fpage>12993</fpage>&#x2013;<lpage>13000</lpage>).</citation></ref>
<ref id="ref31"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhong</surname> <given-names>F.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name></person-group> (<year>2018</year>) <article-title>Detect-SLAM: making object detection and SLAM mutually beneficial</article-title>. In <conf-name>2018 IEEE winter conference on applications of computer vision (WACV)</conf-name> (pp. <fpage>1001</fpage>&#x2013;<lpage>1010</lpage>). <publisher-name>IEEE</publisher-name>.</citation></ref>
</ref-list>
</back>
</article>