<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1469588</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2024.1469588</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Robotics and AI</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Enhanced outdoor visual localization using Py-Net voting segmentation approach</article-title>
<alt-title alt-title-type="left-running-head">Wang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frobt.2024.1469588">10.3389/frobt.2024.1469588</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name>
<surname>Wang</surname>
<given-names>Jing</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="author-notes" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Guo</surname>
<given-names>Cheng</given-names>
</name>
<xref ref-type="author-notes" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2797947/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Shaoyi</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Yibo</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fan</surname>
<given-names>Xuhui</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff>
<institution>College of Communication and Information Engineering</institution>, <institution>Xi&#x2019;an University of Science and Technology</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/623516/overview">Yinlong Liu</ext-link>, University of Macau, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1843027/overview">Zhe Min</ext-link>, Shandong University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1516631/overview">Xinyi Li</ext-link>, Technical University of Munich, Germany</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jing Wang, <email>ajing203@xust.edu.cn</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>10</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1469588</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>07</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>09</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Wang, Guo, Hu, Wang and Fan.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Wang, Guo, Hu, Wang and Fan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Camera relocalization determines the position and orientation of a camera in a 3D space. Although methods based on scene coordinate regression yield highly accurate results in indoor scenes, they exhibit poor performance in outdoor scenarios due to their large scale and increased complexity. A visual localization method, Py-Net, is therefore proposed herein. Py-Net is based on voting segmentation and comprises a main encoder containing Py-layer and two branch decoders. The Py-layer comprises pyramid convolution and 1 &#xd7; 1 convolution kernels for feature extraction across multiple levels, with fewer parameters to enhance the model&#x2019;s ability to extract scene information. Coordinate attention was added at the end of the encoder for feature correction, which improved the model robustness to interference. To prevent the feature loss caused by repetitive structures and low-texture images in the scene, deep over-parameterized convolution modules were incorporated into the seg and vote decoders. Landmark segmentation and voting maps were used to establish the relation between images and landmarks in 3D space, reducing anomalies and achieving high precision with a small number of landmarks. The experimental results show that, in multiple outdoor scenes, Py-Net achieves lower distance and angle errors compared to existing methods. Additionally, compared to VS-Net, which also uses a voting segmentation structure, Py-Net reduces the number of parameters by 31.85% and decreases the model size from 236 MB to 170 MB.</p>
</abstract>
<kwd-group>
<kwd>camera relocalization</kwd>
<kwd>coordinate attention</kwd>
<kwd>pyramidal convolution</kwd>
<kwd>landmark segmentation map</kwd>
<kwd>landmark voting map</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Robot Vision and Artificial Perception</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Camera relocalization is a fundamental problem in computer vision tasks. It aims to infer a camera&#x2019;s translation vector and rotation angle in the world coordinate system from RGB images, determining the camera&#x2019;s precise position and orientation in a scene. Camera relocalization is the core of simultaneous localization and mapping as well as a key module in technologies such as virtual reality, augmented reality, and autonomous driving (<xref ref-type="bibr" rid="B11">Chen et al., 2021</xref>). Traditional camera relocalization methods often employ structure-from-motion (SFM) techniques to achieve high-precision camera localization by preserving the geometric information of a scene using three-dimensional (3D) point clouds (<xref ref-type="bibr" rid="B24">Shavit and Ferens, 2019</xref>). However, these methods use feature point matching that causes relocalization failure when dealing with complex scenes. Absolute pose estimation using deep learning (<xref ref-type="bibr" rid="B15">Kendall A. et al., 2015</xref>) overcomes the limitations of large memory occupancy and hardware of traditional methods. Scene coordinate regression (SCR)-based (<xref ref-type="bibr" rid="B18">Li et al., 2018</xref>) camera relocalization is a deep learning method and regresses two-dimensional (2D) image pixels to obtain a relation between the 2D pixels and 3D scene coordinates. Further, it uses a random sampling consistency (RANSAC) algorithm to select the best poses, considerably improving the relocalization accuracy. Additionally, image retrieval&#x2013;based camera relocalization methods require matching the query image with an image database to find the most similar image (<xref ref-type="bibr" rid="B3">Balntas et al., 2018</xref>; <xref ref-type="bibr" rid="B2">Arandjelovic et al., 2016</xref>) for calculating the relative position of a camera (<xref ref-type="bibr" rid="B17">Laskar et al., 2017</xref>). 
These methods yield good relocalization results for previously unseen scenes but have low localization speed.</p>
<p>SCR-based camera relocalization methods perform well in indoor scenes. Networks with multiple viewpoint constraints converge more easily than single-viewpoint models. Cai et al. (<xref ref-type="bibr" rid="B9">Cai and Zhan, 2019</xref>) enhanced SCR networks using geometric constraints and self-supervision, allowing the networks to learn reliable 2D to 3D relations and improving the training efficiency. Hierarchical scene coordinate networks offer better performance. Li et al. (<xref ref-type="bibr" rid="B19">Li et al., 2020</xref>) proposed HSCNet, which accurately predicts pixel scene coordinates from a single RGB image through multiple output layers, with the final layer predicting the 3D coordinates. Yang et al. (<xref ref-type="bibr" rid="B29">Yang et al., 2019</xref>) proposed SANet, which decouples model parameters from the scene using hierarchical coding, enabling the localization of unknown scenes and estimation of camera poses.</p>
<p>Outdoor scene relocalization faces many challenges, such as differences between datasets and real environments and varying scene properties, which affect relocalization accuracy. In the presence of duplicate structures in a scene, an uncertainty is generated in the positional solution. Duong et al. (<xref ref-type="bibr" rid="B13">Duong et al., 2020</xref>) proposed an efficient multioutput scene coordinate (EMOSC) method that combines machine learning and geometric methods. It is a multioutput depth forest regression method based on sparse feature detection, which greatly reduces the algorithm running time and improves the prediction accuracy. Wald et al. (<xref ref-type="bibr" rid="B27">Wald et al., 2020</xref>) introduced RIO-10 and a new metric: dense correspondence reprojection error. Dong et al. (<xref ref-type="bibr" rid="B12">Dong et al., 2021</xref>) developed an outlier-aware neural tree for high-precision camera relocalization in dynamic indoor settings, featuring decision trees, neural routes, and dynamic point filtering. SCR-based methods balance accuracy and computation time. Turkoglu et al. (<xref ref-type="bibr" rid="B26">Turkoglu et al., 2021</xref>) combined graph neural networks with image retrieval using relative positional loss for training. Bui et al. (<xref ref-type="bibr" rid="B8">Bui et al., 2022</xref>) proposed a simpler SCR algorithm using perceptrons and sparse descriptors, resulting in a smaller model. PixLoc (<xref ref-type="bibr" rid="B22">Sarlin P. et al., 2021</xref>) is a scene-independent algorithm requiring only a query image, 3D model, and reference image with poses for camera relocalization. It uses metric learning for generalization across different scenes. KFNet (<xref ref-type="bibr" rid="B30">Zhou et al., 2020</xref>) combines a recursive network with Kalman filtering, extending SCR to the time domain for 2D to 3D correspondence. A system reported in a previous study (<xref ref-type="bibr" rid="B6">Brachmann E. 
and Rother C., 2021</xref>) estimated camera translation and orientation from RGB-D or RGB images. It was trained using a 3D environment model and required only RGB images and ground truth for training. Despite the strengths of SCR methods, their accuracy is limited by feature extraction, necessitating improvements in this regard.</p>
<p>Herein, the current research status of camera relocalization is reviewed. Existing deep learning&#x2013;based camera relocalization methods can be categorized into three types: direct regression methods, image retrieval&#x2013;based methods, and SCR methods. The specific principles of these methods are described as follows.</p>
<p>Direct regression methods: In these methods, the camera pose is directly estimated by inputting an image into a convolutional neural network and performing supervised learning. Although this approach is simple and requires only one neural network, its accuracy is generally low.</p>
<p>Image retrieval&#x2013;based methods: These methods initially involve the feature encoding of input images. Then, they can directly find an image in the database that is the most similar to the query image and estimate the camera pose by matching their features. Alternatively, they can estimate their relative poses and more accurately estimate the camera pose. These methods demonstrate good generalization and adapt well to large-scale scenes. However, finding the most similar image from the image database is time-consuming. Additionally, differences between database and query images often make the retrieval of the most similar image challenging. Similar to traditional camera relocalization methods that establish sparse point clouds, optimizing the image-retrieval step helps narrow down the search space, facilitating faster estimation of the camera pose. The algorithm efficiency can be enhanced to some extent using global descriptors as the retrieval criterion.</p>
<p>SCR: Unlike traditional camera relocalization methods that rely on feature matching to establish the 2D&#x2013;3D relation, SCR is more direct. By training a neural network, inputting an image, and obtaining the 3D positions of image pixels using the network, i.e., scene coordinates, the camera pose is calculated using the PnP-RANSAC algorithm based on the correspondence between 2D pixel points and 3D spatial coordinates. These methods considerably simplify the establishment of the 2D&#x2013;3D relation. Unlike traditional camera relocalization and image retrieval&#x2013;based methods, SCR does not directly store scene information in a database or 3D model. Instead, it implicitly expresses scene information using a neural network. A convolutional neural network is first trained to map 2D pixels to 3D spatial coordinates. Then, the spatial coordinates are input to the PnP-RANSAC algorithm for pose estimation. However, camera relocalization methods based on SCR may not perform as robustly in large-scale scenes as in small-scale scenes. In outdoor environments, the accuracy of camera relocalization may also be slightly influenced.</p>
<p>During camera relocalization in outdoor scenes, features extracted by an SCR network contain a large number of invalid scene coordinates, which can slightly affect the relocalization accuracy. To address this issue, herein, a voting segmentation network is adopted as the baseline model. However, using ResNet101 in the encoder of the voting segmentation network increases the model size and computational complexity; therefore, its immunity to external interference requires enhancement. To mitigate the impact of external noise on the encoder, pyramid convolution and 1 &#xd7; 1 convolution kernels are used for constructing the main encoder. This design reduces the size of the network model and effectively enhances the feature extraction capability. Coordinate attention is also introduced to improve the resistance of the model to interference. Additionally, a more efficient feature extraction backbone network is developed using pyramid convolution to address object occlusion in outdoor scenes. Due to low texture and repetitive structures present in outdoor environments, the camera perceives the same scene from different poses, which in turn decreases the accuracy of camera relocalization. Furthermore, the spatial scene information of local features gradually diminishes as features propagate forward in the network. Thus, the output scene coordinates of a network do not provide usable scene information for estimating camera poses. To address these two issues, deep over-parameterized convolution modules are introduced into seg and vote decoders to improve the quality of scene image features while maintaining the original computational complexity.</p>
<p>This paper makes the following significant contributions.<list list-type="simple">
<list-item>
<p>(1) We propose a new network architecture, Py-layer, which is an encoding unit composed of stacked pyramid convolutions and 1 &#xd7; 1 convolutions, with the addition of coordinate attention for feature correction. This simple and efficient network structure can extract multi-scale scene information, capture important features within the scene, and balance performance and efficiency.</p>
</list-item>
<list-item>
<p>(2) We use Py-layer to develop a camera relocalization solution based on a voting segmentation architecture. Deep over-parameterized convolution modules are integrated into the segmentation and voting decoders to address feature loss caused by repetitive structures and low-texture images in the scene.</p>
</list-item>
<list-item>
<p>(3) We conducted extensive experiments on the Cambridge Landmarks dataset to verify the effectiveness of our method in large-scale outdoor scenes. The experimental results show that, compared to VS-Net, which also uses a voting segmentation structure, our model improves average distance and angular accuracy by 29.41% and 33.33%, respectively, while reducing the number of parameters by 31.85%.</p>
</list-item>
</list>
</p>
<p>The remainder of this paper is organized as follows: <xref ref-type="sec" rid="s2">Section 2</xref> overviews deep learning&#x2013;based camera relocalization methods. <xref ref-type="sec" rid="s3">Section 3</xref> provides the details of the proposed Py-Net network and its components. <xref ref-type="sec" rid="s4">Section 4</xref> presents the experimental results, and <xref ref-type="sec" rid="s5">Section 5</xref> summarizes the study with concluding remarks.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<p>Herein, the proposed methodology as well as the hardware and software platforms and datasets required for experiments are discussed.</p>
<sec id="s2-1">
<title>2.1 Network architecture</title>
<p>The Py-Net architecture is a simple yet powerful encoder&#x2013;decoder network (<xref ref-type="fig" rid="F1">Figure 1</xref>) comprising three main components: the image encoder, seg decoder, and vote decoder. This design enables the generation of accurate landmark segmentation maps and voting maps for 2D-to-3D correspondence modeling. During training, Py-Net uses a landmark segmentation method based on patch labeling to generate segmentation coordinates for all pixels that correspond to patches surrounding the landmarks. Additionally, each pixel within the landmark patch predicts a 2D directional vector pointing toward the landmark, thereby enabling reliable and precise coordinate voting.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Py-Net network architecture. Py-Net architecture illustrates an encoder-decoder network, integrating an image encoder, segmentation decoder, and voting decoder, enabling accurate 2D-to-3D correspondence modeling.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g001.tif"/>
</fig>
<p>The main encoder comprises stacked Py-layers of different sizes. The Py-layer is a coding layer composed of 1 &#xd7; 1 and pyramid convolutions. 1 &#xd7; 1 convolution kernels perform dimensionality reduction and expansion, whereas pyramid convolutions use convolutions of multiple sizes to process the input image. It contains multiple levels of feature extraction layers, each with convolutions of different sizes and depths. Thus, the ability of the network to extract scene information is enhanced. Coordinate attention is added at the end of the Py-layer for feature correction. The output of the encoder is then fed into atrous spatial pyramid pooling, where features are sampled in parallel using dilated convolutions with different sampling rates. Finally, seg decoder and vote decoder branches produce landmark segmentation and voting maps, respectively.</p>
<p>In both decoder branches, a depth over-parameterized convolution (DOConv) module was used as the convolution module. This design over-parameterizes the decoder, increasing the number of learnable parameters while maintaining the original computational complexity and thus enhancing the quality of scene image features.</p>
</sec>
<sec id="s2-2">
<title>2.2 Pyramidal convolution layer</title>
<p>Compared with standard convolution, pyramidal convolution offers a more robust capability to process input images. It uses multiple convolution sizes containing multiple levels of feature extraction layers, each with convolutions of different sizes and depths; thus, it captures rich details of the scene. Standard convolution contains only one convolution size, and the convolutional depth is equal to the depth of the feature map. In contrast, the convolution size increases and depth decreases in pyramidal convolution with increasing feature extraction levels. In outdoor scenes, the occlusion of building parts may occur, making it difficult for a single type of convolution to effectively capture details in the scene image. However, pyramidal convolution can use convolution kernels with different receptive fields to capture fine features, thereby improving the camera relocalization accuracy.</p>
<p>
<xref ref-type="fig" rid="F2">Figure 2</xref> shows that the number of residual blocks in the backbone network increases via pyramidal convolution, enabling the network to process images using multiple convolution sizes and multiple feature extraction levels. Thus, the ability of the network to extract scene information is enhanced. When optimizing the encoder of the voting segmentation network, the encoding function is mainly completed by 3 &#xd7; 3 convolution kernels because 1 &#xd7; 1 convolution kernels in each module serve to reduce and increase the dimensionality. Therefore, 3 &#xd7; 3 convolutions are first improved in each module of the backbone encoder. Using pyramidal convolution, the network model can process input images using multiple sizes of convolution containing multiple levels of feature extraction layers, each with convolutions of different sizes and depths. Thus, the ability of the network model to extract scene information is slightly enhanced without introducing additional parameters and computational overhead.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Py-layer network architecture. Here, pyramidal convolution increases the number of residual blocks. This allows the network to handle images with multiple convolution sizes and feature extraction levels, enhancing scene information extraction without increasing computational complexity.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g002.tif"/>
</fig>
<p>For the input feature map <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, the n-layer pyramidal convolution has n convolutional kernel sizes <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> and n convolutional kernel depths <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>; the output feature map <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>o</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is stitched together from the outputs of n convolutional kernels, where the size of <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is given by <xref ref-type="disp-formula" rid="e1">Equation 1</xref>:<disp-formula id="e1">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Here <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2026;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the output feature dimension corresponding to n convolutional kernels.</p>
<p>The number of parameters of the pyramidal convolution is given by <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:<disp-formula id="e2">
<mml:math display="block" id="m8">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2219;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mfenced close=")" open="(" separators="|">
<mml:mfrac>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mfrac>
</mml:mfenced>
</mml:mfrac>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2219;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mfenced close=")" open="(" separators="|">
<mml:mfrac>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>3</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mfrac>
</mml:mfenced>
</mml:mfrac>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2219;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mfenced close=")" open="(" separators="|">
<mml:mfrac>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mfrac>
</mml:mfenced>
</mml:mfrac>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mn>02</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mn>01</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Therefore, if the number of output feature maps is equal for each layer of the pyramidal convolution, then its parameters and computational costs will be evenly distributed across each level of the pyramid.</p>
<p>As shown in <xref ref-type="fig" rid="F3">Figure 3</xref>, standard convolution can only have a single size convolution kernel, with the same kernel depth as the input features. Moreover, the number of computational parameters generated by a single convolution kernel is given by <xref ref-type="disp-formula" rid="e3">Equation 3</xref>:<disp-formula id="e3">
<mml:math id="m9">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>K</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2219;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mn>01</mml:mn>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Comparative Analysis of pyramidal convolution and standard convolution in multi-level feature extraction and parameter count.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g003.tif"/>
</fig>
<p>When multiple standard convolution kernels of different sizes are used to process input features, the resulting computational cost and parameter count are greater than those of pyramidal convolution. This is because the input channels of these kernels must match the input features.</p>
</sec>
<sec id="s2-3">
<title>2.3 Coordinate attention</title>
<p>Attention mechanisms are widely used in computer vision to enable network models to focus on relevant information. Some operations in convolutional neural networks, such as convolution, pooling, and fully connected layers, only consider nonself-desired cues. In contrast, an attention mechanism is purposeful and can explicitly model cues that align with its own intentions. As convolutions are conducted within local windows in the main encoder, their representation capability for global features is relatively weak. To address the issue of weak anti-interference capability in outdoor scenes with SCR networks, an attention mechanism is embedded in the encoder. This enables the network model to fully leverage global and local information. Consequently, the model can prioritize the areas of interest in an image, increase the corresponding weights of those areas, and ultimately highlight useful features while suppressing or ignoring irrelevant ones. These factors further enhance the localization accuracy.</p>
<p>Coordinate attention is a lightweight attention mechanism that considers channels and spaces in parallel (<xref ref-type="fig" rid="F4">Figure 4</xref>). With the input of a feature X, X &#x2208; <inline-formula id="inf7">
<mml:math id="m10">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, the coordinate attention mechanism first pools along the height and width of the feature using two pooling kernels of size H &#xd7; 1 and 1 &#xd7; W, respectively. The output of height h and channel c can be expressed as follows (<xref ref-type="disp-formula" rid="e4">Equation 4</xref>):<disp-formula id="e4">
<mml:math id="m11">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Visualization and Analysis of the coordinate attention mechanism.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g004.tif"/>
</fig>
<p>The output of width w and the <italic>c</italic>th channel can be expressed as <xref ref-type="disp-formula" rid="e5">Equation 5</xref>:<disp-formula id="e5">
<mml:math id="m12">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>where <inline-formula id="inf8">
<mml:math id="m13">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> &#x2208; <inline-formula id="inf9">
<mml:math id="m14">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf10">
<mml:math id="m15">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> &#x2208; <inline-formula id="inf11">
<mml:math id="m16">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>After the two pooling operations, the two embedding features <inline-formula id="inf12">
<mml:math id="m17">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf13">
<mml:math id="m18">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> can be stitched together. Then, the stitched features are reduced by 1 &#xd7; 1 convolution and input to the sigmoid function (<xref ref-type="disp-formula" rid="e6">Equation 6</xref>):<disp-formula id="e6">
<mml:math id="m19">
<mml:mrow>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">&#x3b4;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>z</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>z</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf14">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="normal">f</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf15">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the convolution, and &#x3b4; denotes the sigmoid activation function. Then, the feature f is split in the spatial dimension using two 1 &#xd7; 1 convolutions to obtain two feature maps. After transforming these maps, the attention vector is obtained as follows (<xref ref-type="disp-formula" rid="e7">Equations 7</xref>&#x2013;<xref ref-type="disp-formula" rid="e8">8</xref>):<disp-formula id="e7">
<mml:math id="m22">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m23">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf16">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf17">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denote the 1 &#xd7; 1 convolutional transformation and <inline-formula id="inf18">
<mml:math id="m26">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf19">
<mml:math id="m27">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the split features <inline-formula id="inf20">
<mml:math id="m28">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mfrac>
<mml:mi>C</mml:mi>
<mml:mi>r</mml:mi>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf21">
<mml:math id="m29">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mfrac>
<mml:mi>C</mml:mi>
<mml:mi>r</mml:mi>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. Eventually, the input features can be corrected using the two attention vectors (<xref ref-type="disp-formula" rid="inf22">Equation 9</xref>):</p>
<p>
<disp-formula id="inf22">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula> </p>
<p>CA has higher computational efficiency and less computational overhead in the spatial dimension than other attention modules.</p>
</sec>
<sec id="s2-4">
<title>2.4 Depthwise over-parameterized convolution</title>
<p>In standard convolution, all kernels in a convolutional layer are convolved with the input image. Standard convolution emphasizes the spatial relation of pixels and considers it as channels but at the expense of relatively high computational complexity. In contrast, depthwise convolution assigns each kernel to a specific input channel. Each channel of the input image is convolved with a dedicated kernel. Depthwise convolution focuses more on the features represented in the depth of an image and has relatively lower computational complexity.</p>
<p>Depthwise over-parameterized convolution (<xref ref-type="bibr" rid="B10">Cao et al., 2022</xref>) is an extension of standard convolution, in which an additional depthwise convolution is incorporated. This additional convolutional structure increases the number of trainable model parameters and enhances the learning capacity of the model. The quality of scene features extracted by this model is also consequently improved.</p>
<p>The feature quality in the SCR network deteriorates due to the influence of repetitive structures and low-texture objects in scene images. To address this issue, standard convolutions must be substituted with depthwise over-parameterized convolutions. This replacement will enhance the quality of features extracted by the network and improve the overall nonlinear capability of the network, making it more robust. <xref ref-type="fig" rid="F5">Figure 5</xref> shows the structure of depthwise over-parameterized convolution.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Depthwise over-parameterized convolution. By increasing the number of trainable model parameters, the learning capability of the model is enhanced. As a result, the quality of the extracted scene features is also improved.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g005.tif"/>
</fig>
<p>Depthwise over-parameterized convolution comprises standard and depthwise convolutions. First, the convolutional kernel <inline-formula id="inf200">
<mml:math id="m200">
<mml:mrow>
<mml:mi mathvariant="double-struck">D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> from the standard convolution and the depthwise convolution kernel <inline-formula id="inf201">
<mml:math id="m201">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are used to compute a new convolutional kernel <inline-formula id="inf202">
<mml:math id="m202">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2b9;. Then, <inline-formula id="inf203">
<mml:math id="m203">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2b9; is convolved with the feature <inline-formula id="inf218">
<mml:math id="m218">
<mml:mrow>
<mml:mi mathvariant="double-struck">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the same manner as the standard convolutional kernel (<xref ref-type="disp-formula" rid="e9">Equation 10</xref>):<disp-formula id="e9">
<mml:math id="m31">
<mml:mrow>
<mml:mi mathvariant="double-struck">O</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="double-struck">D</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mo mathvariant="double-struck">&#x25cb;</mml:mo>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mi mathvariant="double-struck">p</mml:mi>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>As shown in <xref ref-type="fig" rid="F6">Figure 6</xref>, <inline-formula id="inf204">
<mml:math id="m204">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the convolutional kernels of standard and depthwise convolutions and <inline-formula id="inf206">
<mml:math id="m206">
<mml:mrow>
<mml:mi mathvariant="double-struck">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents the feature. <inline-formula id="inf207">
<mml:math id="m207">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in standard convolution is a 3D tensor, <inline-formula id="inf23">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="double-struck">w</mml:mi>
<mml:mo mathvariant="double-struck">&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf208">
<mml:math id="m208">
<mml:mrow>
<mml:mi mathvariant="double-struck">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a 2D tensor, <inline-formula id="inf24">
<mml:math id="m33">
<mml:mrow>
<mml:mi mathvariant="double-struck">p</mml:mi>
<mml:mo mathvariant="double-struck">&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf25">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf26">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denote the output and input channel dimensions of the feature and <inline-formula id="inf27">
<mml:math id="m36">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the spatial dimensions of the feature. Each convolutional kernel on the C<sub>out</sub> dimension of <inline-formula id="inf209"> <mml:math id="m209"> <mml:mrow> <mml:mi mathvariant="double-struck">W</mml:mi> </mml:mrow> </mml:math> </inline-formula> performs a dot product operation with <inline-formula id="inf210"> <mml:math id="m210"> <mml:mrow> <mml:mi mathvariant="double-struck">P</mml:mi> </mml:mrow> </mml:math> </inline-formula>, yielding the output <inline-formula id="inf211"> <mml:math id="m211"> <mml:mrow> <mml:mi mathvariant="double-struck">O</mml:mi> </mml:mrow> </mml:math> </inline-formula> (<xref ref-type="disp-formula" rid="e10">Equation 11</xref>):<disp-formula id="e10">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="double-struck">O</mml:mi>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
</mml:mstyle>
<mml:msub>
<mml:mi mathvariant="double-struck">w</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi mathvariant="double-struck">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Computation methods of depthwise convolution and standard convolution. <bold>(A)</bold> Standard convolution, <bold>(B)</bold> DOConv.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g006.tif"/>
</fig>
<p>The dimensions of <inline-formula id="inf219">
<mml:math id="m219">
<mml:mrow>
<mml:mi mathvariant="double-struck">O</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> correspond to the output channel dimension of the convolutional kernel.</p>
<p>In depthwise convolution, each input channel of <inline-formula id="inf230">
<mml:math id="m230">
<mml:mrow>
<mml:mi mathvariant="double-struck">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> undergoes a dot product operation with <inline-formula id="inf28">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> channels of the convolutional kernel. The dimension of each input channel of <inline-formula id="inf215"> <mml:math id="m215"> <mml:mrow> <mml:mi mathvariant="double-struck">P</mml:mi> </mml:mrow> </mml:math> </inline-formula> is transformed from <inline-formula id="inf29">
<mml:math id="m39">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf30">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf31">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the depth multiplier. The final output <inline-formula id="inf216"> <mml:math id="m216"> <mml:mrow> <mml:mi mathvariant="double-struck">O</mml:mi> </mml:mrow> </mml:math> </inline-formula> is obtained accordingly (<xref ref-type="disp-formula" rid="e11">Equation 12</xref>):<disp-formula id="e11">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="double-struck">O</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mstyle>
<mml:msub>
<mml:mi mathvariant="double-struck">w</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi mathvariant="double-struck">p</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>where the depthwise convolutional kernel <inline-formula id="inf217">
<mml:math id="m217">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a 3D tensor, <inline-formula id="inf32">
<mml:math id="m43">
<mml:mrow>
<mml:mi mathvariant="double-struck">w</mml:mi>
<mml:mo mathvariant="double-struck">&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>DOConv and standard convolution have the same receptive field. For a feature input <inline-formula id="inf212">
<mml:math id="m212">
<mml:mrow>
<mml:mi mathvariant="double-struck">P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the output feature dimensions obtained via DOConv and output feature dimensions processed via standard convolution are identical. The linear transformation of standard convolution can be parameterized using <inline-formula id="inf33">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> training weights. The linear transformation of DOConv can be parameterized using the training weights of two convolutional kernels. Only when <inline-formula id="inf34">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2265;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the newly combined convolutional kernel <inline-formula id="inf213">
<mml:math id="m213">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x2b9; can exhibit the same linear transformation as the standard convolution kernel <inline-formula id="inf214">
<mml:math id="m214">
<mml:mrow>
<mml:mi mathvariant="double-struck">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> when formed via combination. DOConv introduces over-parameterization to the network, thereby increasing the number of learnable parameters while maintaining the original computational complexity and enhancing the feature quality of scene images.</p>
</sec>
<sec id="s2-5">
<title>2.5 Loss function</title>
<p>The scene is first reconstructed in 3D using Py-Net. The resulting 3D surface is divided into 3D image patches using the center point of each patch as a 3D landmark, <inline-formula id="inf35">
<mml:math id="m46">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Segmentation maps S <inline-formula id="inf36">
<mml:math id="m47">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">Z</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and voting maps d <inline-formula id="inf37">
<mml:math id="m48">
<mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are obtained by projecting 3D image patches and 3D landmarks onto 2D images. To generate a segmentation map, values are assigned to each pixel point <inline-formula id="inf38">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> by determining its projected 3D image patch coordinate on the 2D image. A value of 0 is assigned to the pixel point coordinate <inline-formula id="inf39">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> if the associated area is not covered by the projected 3D surface, indicating that it does not influence the positioning. The landmark voting map is generated by first projecting the landmark <inline-formula id="inf40">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">q</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> on a 3D surface to a 2D plane to obtain 2D coordinates (<xref ref-type="disp-formula" rid="e12">Equation 13</xref>):<disp-formula id="e12">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">q</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">C</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>where <bold>K</bold> is the camera intrinsic matrix and <bold>C</bold> is the camera pose parameter. Each pixel belonging to an image patch containing the coordinate <inline-formula id="inf41">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">q</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is responsible for projecting a 2D direction vector <inline-formula id="inf42">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> pointing to <inline-formula id="inf43">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. By unitizing this coordinate with <inline-formula id="inf44">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the unit vector is obtained (<xref ref-type="disp-formula" rid="e13">Equation 14</xref>):<disp-formula id="e13">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>This vector is used to represent the orientation of the landmarks on the 2D plane. Supervised training of the voting segmentation network is possible by using the obtained segmentation map and voting map as training truths. This in turn establishes the relationship from 2D to 3D, thus enabling camera relocation.</p>
<p>The Vote Segmentation Network uses a prototype-based triplet loss function and a negative sample mining strategy to supervise the training of the network. Training the network in this way requires maintaining and updating a set of learnable class prototype embeddings P, where each class prototype embedding represents a class and <inline-formula id="inf45">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the class prototype embedding of the <italic>j</italic>th class. Therefore, the pixel embeddings belonging to the <italic>j</italic>th category should be as close as possible to the class prototype embedding of the <italic>j</italic>th category and as far away as possible from the class prototype embeddings of other categories.</p>
<p>Pixel embeddings can be obtained by voting on the segmentation branches of the segmentation network and the class of prototype embeddings P. Pixel embeddings can form a pixel-by-pixel embedding graph E. The prototype-based triplet loss function first L2-normalizes each pixel embedding in the embedding graph E, and then minimizes the error between each pixel embedding <inline-formula id="inf46">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the prototype embedding <inline-formula id="inf47">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e14">Equation 15</xref>):<disp-formula id="e14">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mrow>
<mml:mi>max</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">E</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:msup>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
</mml:msup>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">E</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:msup>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
</mml:msup>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>where<disp-formula id="equ1">
<mml:math id="m62">
<mml:mrow>
<mml:mtext>sim</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>a</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2219;</mml:mo>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>is used to measure the cosine similarity between the pixel embedding and prototype-like embedding. <inline-formula id="inf48">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:msup>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
</mml:msup>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the true value of prototype-like embedding corresponding to pixel i and <inline-formula id="inf49">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:msup>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
</mml:msup>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the true value of prototype-like embedding unrelated to pixel <inline-formula id="inf50">
<mml:math id="m65">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf51">
<mml:math id="m66">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denotes the boundary of the loss function.</p>
<p>The voting decoder of the voting segmentation network is then used to determine the landmark&#x2019;s projected position in the 2D image. Given an input image, the voting decoder outputs a voting map d. For each pixel i in the input, it generates a two-dimensional direction vector <inline-formula id="inf52">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which points to the two-dimensional position of the landmark. The voting decoder is trained in a supervised manner with the voting loss function (<xref ref-type="disp-formula" rid="e15">Equation 16</xref>):<disp-formula id="e15">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2260;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>where <bold>1</bold> denotes the indicator function and <inline-formula id="inf54">
<mml:math id="m70">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf55">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denote the direction vectors predicted for pixel i and their direction vector truth values, respectively. The overall loss function of VS-Net is represented as in <xref ref-type="disp-formula" rid="e16">Equation 17</xref>:<disp-formula id="e16">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>where &#x3bb; denotes the weight of voting loss.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>This section evaluates the performance of our method, Py-Net, on the Cambridge Landmarks dataset. We compared our results with those of existing camera relocalization methods, and the experiments demonstrated that the proposed method achieved state-of-the-art accuracy. Finally, we conducted ablation studies to investigate the contributions of individual components.</p>
<sec id="s3-1">
<title>3.1 Dataset</title>
<p>The improvement in the proposed model was validated using the Cambridge Landmarks (<xref ref-type="bibr" rid="B16">Kendall Alex et al., 2015</xref>) dataset containing five outdoor landmark scenes: Great Court, Kings College, Old Hospital, Shop Facade, and St. Mary&#x2019;s Church. The dataset was more complex than indoor scenes; therefore, it better demonstrated the robustness of the model, as exterior environments undergo drastic environmental changes. For instance, the outdoor camera moves faster than the indoor camera, which may result in blurry images.</p>
</sec>
<sec id="s3-2">
<title>3.2 Evaluation metrics</title>
<p>As the camera relocalization error in outdoor environments is more significant than that in indoor settings, using a 1-cm distance error and 1&#xb0; angle error as well as a 2-cm distance error and 2&#xb0; angle error as the evaluation metrics is not ideal. Therefore, the median distance error within 5 cm and median angle error of 5&#xb0; were used as the evaluation metrics instead. We also report the percentage of high-precision localization points with a distance error within 5 cm and an angle error within 5&#xb0;.</p>
</sec>
<sec id="s3-3">
<title>3.3 Results and analysis</title>
<p>The performance of Py-Net was compared with existing visual localization systems on the Cambridge Landmarks dataset. Detailed results are shown in <xref ref-type="table" rid="T1">Table 1</xref>, where the primary indicators for evaluating the accuracy of camera relocalization are compared: translation error (m) and rotation error (&#xb0;). Experimental results indicate that Py-Net considerably outperforms the existing methods. Compared with DSAC, the translation error decreased by 40% using the SCR method. This indicates that the voting segmentation network can considerably reduce the number of outliers in the scene and improve the relocalization accuracy. Py-Net outperformed VS-Net, which also employs a voting segmentation architecture, in the majority of scenes, particularly in the Great Court scene, where the translation and rotation errors decreased by 14% and 50%, respectively. This suggests that, by using the Py-layer to enhance the backbone encoder, the network can learn multiscale features. Thus, the ability of the backbone encoder to extract scene information is enhanced, resulting in better performance.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Visual localization accuracy of state-of-the-art methods. We report the median translation error (m), rotation error (degrees), and localization precision, where the translation and rotation errors are within 5 cm and 5&#xb0;, respectively. NA indicates no available values. Best results are in bold.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="2" align="center"/>
<th align="center">Great court</th>
<th align="center">Kings college</th>
<th align="center">Old hospital</th>
<th align="center">Shop facade</th>
<th align="center">St. Mary&#x2019;s church</th>
<th align="center">Avg</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="7" align="center">Direct Regression</td>
<td align="center">PoseNet (<xref ref-type="bibr" rid="B16">Kendall et al., 2015b</xref>)</td>
<td align="center">NA</td>
<td align="center">1.92 m,5.40 <inline-formula id="inf56">
<mml:math id="m73">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.31 m,5.38 <inline-formula id="inf57">
<mml:math id="m74">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.46 m,8.08 <inline-formula id="inf58">
<mml:math id="m75">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.65 m,8.48 <inline-formula id="inf59">
<mml:math id="m76">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.09 m,6.84 <inline-formula id="inf60">
<mml:math id="m77">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Dense PoseNet (<xref ref-type="bibr" rid="B25">Shen and Chen, 2019</xref>)</td>
<td align="center">NA</td>
<td align="center">1.66 m,4.86 <inline-formula id="inf61">
<mml:math id="m78">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.57 m,5.14 <inline-formula id="inf62">
<mml:math id="m79">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.41 m,7.18 <inline-formula id="inf63">
<mml:math id="m80">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.45 m,7.96 <inline-formula id="inf64">
<mml:math id="m81">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.02 m,6.29 <inline-formula id="inf65">
<mml:math id="m82">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">LSTM-Pose (<xref ref-type="bibr" rid="B21">Luo et al., 2018</xref>)</td>
<td align="center">NA</td>
<td align="center">0.99 m,3.65 <inline-formula id="inf66">
<mml:math id="m83">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.51 m,4.29 <inline-formula id="inf67">
<mml:math id="m84">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.18 m,7.44 <inline-formula id="inf68">
<mml:math id="m85">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.52 m,6.68 <inline-formula id="inf69">
<mml:math id="m86">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.30 m,5.52 <inline-formula id="inf70">
<mml:math id="m87">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">SVS-Pose (<xref ref-type="bibr" rid="B1">Abozeid et al., 2022</xref>)</td>
<td align="center">NA</td>
<td align="center">1.06 m,2.81 <inline-formula id="inf71">
<mml:math id="m88">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.50 m,4.03 <inline-formula id="inf72">
<mml:math id="m89">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.63 m,5.73 <inline-formula id="inf73">
<mml:math id="m90">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">2.11 m,8.11 <inline-formula id="inf74">
<mml:math id="m91">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">1.32 m,5.17 <inline-formula id="inf75">
<mml:math id="m92">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td rowspan="3" align="center">NeRF-loc (<xref ref-type="bibr" rid="B20">Liu et al., 2023</xref>)<break/>Pixloc (<xref ref-type="bibr" rid="B23">Sarlin et al., 2021b</xref>)<break/>ST-Pixloc (<xref ref-type="bibr" rid="B28">Wang et al., 2024</xref>)</td>
<td align="center">0.25 m,<bold>0.1</bold> <inline-formula id="inf76">
<mml:math id="m93">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.07</bold> <bold>m</bold>,<bold>0.2</bold> <inline-formula id="inf77">
<mml:math id="m94">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.18 m,0.4 <inline-formula id="inf78">
<mml:math id="m95">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.11 m,<bold>0.2</bold> <inline-formula id="inf79">
<mml:math id="m96">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.04</bold> <bold>m</bold>,<bold>0.2</bold> <inline-formula id="inf80">
<mml:math id="m97">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.13 m,0.22 <inline-formula id="inf81">
<mml:math id="m98">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">0.42 m,0.18 <inline-formula id="inf82">
<mml:math id="m99">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.16 m,0.26 <inline-formula id="inf83">
<mml:math id="m100">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.49 m,0.79 <inline-formula id="inf84">
<mml:math id="m101">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.06 m,0.23 <inline-formula id="inf85">
<mml:math id="m102">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.14 m,0.36 <inline-formula id="inf86">
<mml:math id="m103">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.25 m,0.36 <inline-formula id="inf87">
<mml:math id="m104">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">0.24 m,0.13 <inline-formula id="inf88">
<mml:math id="m105">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.15 m,0.23 <inline-formula id="inf89">
<mml:math id="m106">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.42 m,0.69 <inline-formula id="inf90">
<mml:math id="m107">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.05 m,0.26 <inline-formula id="inf91">
<mml:math id="m108">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.13 m,0.31 <inline-formula id="inf92">
<mml:math id="m109">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.19 m,0.32 <inline-formula id="inf93">
<mml:math id="m110">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td rowspan="5" align="center">SCR</td>
<td align="center">DSAC (<xref ref-type="bibr" rid="B5">Brachmann et al., 2017</xref>)</td>
<td align="center">2.80 m,1.5 <inline-formula id="inf94">
<mml:math id="m111">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.30 m,0.5 <inline-formula id="inf95">
<mml:math id="m112">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.33 m,0.6 <inline-formula id="inf96">
<mml:math id="m113">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.09 m,0.40 <inline-formula id="inf97">
<mml:math id="m114">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.55 m,1.6 <inline-formula id="inf98">
<mml:math id="m115">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.81 m,0.92 <inline-formula id="inf99">
<mml:math id="m116">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">DSAC&#x2b;&#x2b; (<xref ref-type="bibr" rid="B7">Brachmann and Rother, 2021b</xref>)</td>
<td align="center">0.4 m,0.2 <inline-formula id="inf100">
<mml:math id="m117">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.18 m,0.3 <inline-formula id="inf101">
<mml:math id="m118">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.20 m,<bold>0.3</bold> <inline-formula id="inf102">
<mml:math id="m119">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.06 m,0.3 <inline-formula id="inf103">
<mml:math id="m120">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.13 m,0.4 <inline-formula id="inf104">
<mml:math id="m121">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.19 m,0.3 <inline-formula id="inf105">
<mml:math id="m122">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">ACE (<xref ref-type="bibr" rid="B4">Brachmann et al., 2023</xref>)</td>
<td align="center">0.42 m,0.2 <inline-formula id="inf106">
<mml:math id="m123">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.28 m,0.4 <inline-formula id="inf107">
<mml:math id="m124">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.31 m,0.6 <inline-formula id="inf108">
<mml:math id="m125">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.05</bold> <bold>m</bold>,0.3 <inline-formula id="inf109">
<mml:math id="m126">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.19 m,0.6 <inline-formula id="inf110">
<mml:math id="m127">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.25 m,0.42 <inline-formula id="inf111">
<mml:math id="m128">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">VS-Net (<xref ref-type="bibr" rid="B14">Huang et al., 2021</xref>)</td>
<td align="center">0.22 m,0.2 <inline-formula id="inf112">
<mml:math id="m129">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.16 m,0.3 <inline-formula id="inf113">
<mml:math id="m130">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.16</bold> <bold>m</bold>,0.4 <inline-formula id="inf114">
<mml:math id="m131">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.06 m,<bold>0.2</bold> <inline-formula id="inf115">
<mml:math id="m132">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.08 m,0.4 <inline-formula id="inf116">
<mml:math id="m133">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.14 m,0.3 <inline-formula id="inf117">
<mml:math id="m134">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Py-Net (<bold>ours</bold>)</td>
<td align="center">
<bold>0.19</bold> <bold>m</bold>,<bold>0.1</bold> <inline-formula id="inf118">
<mml:math id="m135">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.14 m,<bold>0.2</bold> <inline-formula id="inf119">
<mml:math id="m136">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.16</bold> <bold>m</bold>,<bold>0.3</bold> <inline-formula id="inf120">
<mml:math id="m137">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.05</bold> <bold>m</bold>,<bold>0.2</bold> <inline-formula id="inf121">
<mml:math id="m138">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">0.07 m,<bold>0.2</bold> <inline-formula id="inf122">
<mml:math id="m139">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<bold>0.12</bold> <bold>m</bold>, <bold>0.2</bold> <inline-formula id="inf123">
<mml:math id="m140">
<mml:mrow>
<mml:mo>&#xb0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F7">Figure 7</xref> compares the scene coordinate prediction between VS-Net and Py-Net. In outdoor scenes, the enlarged scene scale results in a substantial number of invalid scene coordinate points. This affects the prediction accuracy and limits the scene information available for camera relocation.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Comparison of scene coordinate predictions. <bold>(A)</bold> Great Court scene; <bold>(B)</bold> Kings College scene; <bold>(C)</bold> Shop Facade scene. The network predicts the 2D-3D correspondences of the image, visualizing them as a scene coordinate map by rendering different coordinates in different colors. The richness of the scene information in the scene coordinate map significantly affects the accuracy of the PnP algorithm.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g007.tif"/>
</fig>
<p>However, as shown in <xref ref-type="fig" rid="F7">Figure 7A</xref>, Py-Net provides higher amounts of usable information within the predicted scene coordinate maps. In the Great Court scene, Py-Net generates a scene coordinate map with an increased level of scene information (<xref ref-type="fig" rid="F7">Figure 7A</xref>). In the Kings College scene, the scene coordinate map predicted by Py-Net contains noticeably increased usable information (<xref ref-type="fig" rid="F7">Figure 7B</xref>). The Shop Facade scene contains a substantial amount of usable scene information, with fewer background pixels representing the sky (<xref ref-type="fig" rid="F7">Figure 7C</xref>). This indicates that incorporating depthwise separable convolutional modules into the scene coordinate decoder effectively enriches the amount of scene information, thereby enhancing the information representation capability of the network.</p>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> shows the positioning trajectories of the improved Py-Net on the Cambridge Landmarks dataset. The figure compares the distance errors between VS-Net and Py-Net. Panels (A), (C), and (E) show the positioning trajectories of VS-Net, and panels (B), (D), and (F) show the positioning trajectories of the improved Py-Net. When zooming in on the positioning errors in the Great Court scene, a certain degree of reduction in distance errors can be observed. In the Kings College scene, Py-Net reduces the distance errors of marked points. As the test samples of the Shop Facade scene were limited, only a few positioning points with reduced errors are shown in the figure.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Comparison of localization trajectories on the Cambridge Landmarks dataset: VS-Net <bold>(A, C, E)</bold> vs. improved Py-Net <bold>(B, D, F)</bold>.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g008.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F9">Figure 9</xref> shows that in multiple scenes, Py-Net outperforms existing methods in the proportion of high-precision points with a distance error of less than 5 cm and an angle error of less than 5&#xb0;. The increase in the proportion of high-precision localization points indicates that the number of invalid localization points in the scene has decreased, meaning that outliers have been filtered out.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Percentage of localization points within 5 cm and 5&#xb0; on the Cambridge Landmarks dataset.</p>
</caption>
<graphic xlink:href="frobt-11-1469588-g009.tif"/>
</fig>
<p>The Py-layer and depthwise separable convolution were introduced to enhance the performances of the main encoder and decoder, respectively. As a result, the size of Py-Net decreased considerably compared with that of VS-Net (<xref ref-type="table" rid="T2">Table 2</xref>). Specifically, the model size decreased from 236 to 170 MB, and the parameter count was only 68.15% of the original. This improvement considerably enhanced the applicability of the model on devices with limited storage space.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison of model sizes (MB).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th align="left">VS-Net</th>
<th align="left">Py-Net</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Model Size</td>
<td align="left">236</td>
<td align="left">170</td>
</tr>
<tr>
<td align="left">Parameters</td>
<td align="left">61,864,994</td>
<td align="left">42,163,554</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Ablation experiments were conducted to demonstrate the effectiveness of the improvements proposed herein. By introducing the Py-layer and coordinated attention mechanism, Py-Net can effectively extract outdoor scene information and filter out invalid localization points in the scene. Thus, the model performance is improved across various scenes. In the Great Court scene, the distance error decreased by 16%, whereas in the Shop Facade and St. Mary&#x2019;s Church scenes, the angle error decreased by 33%. As shown in <xref ref-type="table" rid="T3">Table 3</xref>, after incorporating the depthwise separable convolution module, the angle error of the network considerably decreased across multiple scenes. This indicates that Py-Net can retain more scene information, thus achieving more accurate estimation results.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Ablation experiment. &#x221a; indicates the component is used; &#xd7; indicates the component is not used.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Components</th>
<th colspan="3" align="left">&#x201c;&#x221a;&#x201d; considering component; &#x201c;&#xd7;&#x201d; excluding component</th>
</tr>
<tr>
<th align="left">Py-layer &#x2b; CA</th>
<th align="left">&#xd7;</th>
<th align="left">&#xd7;</th>
<th align="left">&#x221a;</th>
</tr>
<tr>
<th align="left">DoConv</th>
<th align="left">&#xd7;</th>
<th align="left">&#x221a;</th>
<th align="left">&#x221a;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Great Court</td>
<td align="left">0.22 m, 0.2&#xb0;</td>
<td align="left">0.22 m, 0.1&#xb0;</td>
<td align="left">0.19 m, 0.1&#xb0;</td>
</tr>
<tr>
<td align="left">Kings College</td>
<td align="left">0.16 m, 0.3&#xb0;</td>
<td align="left">0.16 m, 0.2&#xb0;</td>
<td align="left">0.15 m, 0.2&#xb0;</td>
</tr>
<tr>
<td align="left">Old Hospital</td>
<td align="left">0.16 m, 0.4&#xb0;</td>
<td align="left">0.16 m, 0.3&#xb0;</td>
<td align="left">0.16 m, 0.3&#xb0;</td>
</tr>
<tr>
<td align="left">Shop Facade</td>
<td align="left">0.06 m, 0.2&#xb0;</td>
<td align="left">0.06 m, 0.3&#xb0;</td>
<td align="left">0.06 m, 0.2&#xb0;</td>
</tr>
<tr>
<td align="left">St. Mary&#x2019;s Church</td>
<td align="left">0.08 m, 0.4&#xb0;</td>
<td align="left">0.08 m, 0.3&#xb0;</td>
<td align="left">0.07 m, 0.2&#xb0;</td>
</tr>
<tr>
<td align="left">Average</td>
<td align="left">0.14 m, 0.3&#xb0;</td>
<td align="left">0.14 m, 0.2&#xb0;</td>
<td align="left">0.13 m, 0.2&#xb0;</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="conclusion" id="s4">
<title>4 Conclusion</title>
<p>This section summarizes the methods employed and the key findings of the study. The methods involved enhancing the performance of VS-Net for camera relocalization in outdoor scenes by optimizing its backbone network and improving feature extraction capability. The study resulted in a 14% increase in average translation accuracy on the Cambridge Landmark dataset, accompanied by a 30% reduction in model size.</p>
<p>Regarding potential real-time applicability, the optimized VS-Net model shows promise for real-time camera relocalization applications, particularly in outdoor environments. Future research directions may include further exploration of feature extraction techniques, investigating the model&#x2019;s robustness in various environmental conditions, and integrating additional sensor modalities for improved performance and versatility.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/supplementary material.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>JW: Methodology, Writing&#x2013;original draft, Writing&#x2013;review and editing, Conceptualization, Project administration, Supervision. CG: Methodology, Writing&#x2013;original draft, Writing&#x2013;review and editing, Investigation. SH: Data curation, Software, Writing&#x2013;review and editing. YW: Data curation, Software, Writing&#x2013;review and editing. XF: Project administration, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s7">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported in part by the National Natural Science Foundation of China under Grant No. 62301414 (Research on unconventional array beam synthesis method with limited excitation amplitude).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abozeid</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Farouk</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Mashali</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Depth-DensePose: an efficient densely connected deep learning model for camera-based localization</article-title>. <source>Int. J. Electr. and Comput. Eng.</source> <volume>12</volume>, <fpage>2792</fpage>. <pub-id pub-id-type="doi">10.11591/ijece.v12i3.pp2792-2801</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Arandjelovic</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gronat</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Torii</surname>
<given-names>A. N.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>CNN architecture for weakly supervised place recognition</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source> (<publisher-loc>Las Vegas, USA</publisher-loc>), <fpage>26</fpage>&#x2013;<lpage>30</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Balntas</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Prisacariu</surname>
<given-names>V. R.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Continuous metric learning relocalisation using neural nets</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision</source> (<publisher-loc>Munich, Germany</publisher-loc>), <fpage>10</fpage>&#x2013;<lpage>14</lpage>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brachmann</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Cavallari</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Prisacariu</surname>
<given-names>V. A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Accelerated coordinate encoding: learning to relocalize in minutes using rgb and poses</article-title>. <source>CVPR</source>
<pub-id pub-id-type="doi">10.1109/cvpr52729.2023.00488</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Brachmann</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Krull</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nowozin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shotton</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gumhold</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>) &#x201c;<article-title>Dsac-differentiable ransac for camera localization</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>.</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brachmann</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Rother</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>Visual camera Re-localization from RGB and RGB-D images using DSAC</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>44</volume>, <fpage>5847</fpage>&#x2013;<lpage>5865</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2021.3070754</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brachmann</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Rother</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Visual camera re-localization from RGB and RGB-D images using DSAC</article-title>. <source>IEEE Trans. pattern analysis Mach. Intell.</source> <volume>44</volume> (<issue>9</issue>), <fpage>5847</fpage>&#x2013;<lpage>5865</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2021.3070754</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bui</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Fast and lightweight scene regressor for camera relocalization</article-title>. <source>arXiv</source>, <fpage>01830</fpage>. <comment>arXiv:2212</comment>.</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cai</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Camera relocalization by exploiting multi-view constraints for scene coordinates regression</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision workshops</source> (<publisher-loc>Seoul, Korea</publisher-loc>), <fpage>27</fpage>&#x2013;<lpage>18</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lischinski</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Cohen-Or</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Do-conv: depthwise over-parameterized convolutional layer</article-title>. <source>IEEE Trans. Image Processing</source> <volume>31</volume>, <fpage>3726</fpage>&#x2013;<lpage>3736</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2022.3175432</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pei</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Survey of monocular camera-based visual relocalization</article-title>. <source>Robot</source> <volume>43</volume>, <fpage>373</fpage>&#x2013;<lpage>384</lpage>.</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Robust neural routing through space partitions for camera relocalization in dynamic indoor environments</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>8540</fpage>&#x2013;<lpage>8550</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00844</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duong</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Soladie</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kacete</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Richard</surname>
<given-names>P. Y.</given-names>
</name>
<name>
<surname>Royan</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Efficient multi-output scene coordinate prediction for fast and accurate camera relocalization from a single RGB image</article-title>. <source>Comput. Vis. Image Underst.</source> <volume>190</volume>, <fpage>102850</fpage>. <pub-id pub-id-type="doi">10.1016/j.cviu.2019.102850</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>VS-net: voting with segmentation for visual localization</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6097</fpage>&#x2013;<lpage>6107</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00604</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kendall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Grimes</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cipolla</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2015a</year>). &#x201c;<article-title>Posenet: a convolutional network for real-time 6-DOF camera relocalization</article-title>,&#x201d; in <source>Proceedings of the IEEE international conference on computer vision</source>, <fpage>11</fpage>&#x2013;<lpage>17</lpage>. <comment>Santiago, Chile</comment>.</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kendall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Grimes</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cipolla</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2015b</year>) &#x201c;<article-title>Posenet: a convolutional network for real-time 6-dof camera relocalization</article-title>,&#x201d; in <source>Proceedings of the IEEE international conference on computer vision</source>.</citation>
</ref>
<ref id="B17">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Laskar</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Melekhov</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kalia</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Camera relocalization by computing pairwise relative poses using convolutional neural network</article-title>,&#x201d; in <source>Proceedings of the IEEE international conference on computer vision workshops</source>, <fpage>22</fpage>&#x2013;<lpage>29</lpage>. <comment>Venice, Italy</comment>.</citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Scene coordinate regression with angle-based reprojection loss for camera relocalization</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV) workshops</source>.</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Hierarchical scene coordinate classification and regression for visual localization</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<publisher-loc>Seattle, USA</publisher-loc>), <fpage>14</fpage>&#x2013;<lpage>19</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Nie</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Nerf-loc: visual localization with conditional neural radiance field</article-title>,&#x201d; in <source>ICRA</source>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>) &#x201c;<article-title>Lstm pose machines</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>.</citation>
</ref>
<ref id="B22">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sarlin</surname>
<given-names>P. E.</given-names>
</name>
<name>
<surname>Unagar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Larsson</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Germain</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Toft</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Larsson</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Back to the Feature: Learning Robust Camera Localization from Pixels to Pose</article-title>,&#x201d; in <source>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Nashville</publisher-loc>), <fpage>19</fpage>&#x2013;<lpage>25</lpage>.</citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sarlin</surname>
<given-names>P.-E.</given-names>
</name>
<etal/>
</person-group> (<year>2021b</year>) &#x201c;<article-title>Back to the feature: learning robust camera localization from pixels to pose</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shavit</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ferens</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Introduction to camera pose estimation with deep learning</article-title>. <source>arXiv</source>, <fpage>05272</fpage>. <comment>arXiv:1907</comment>.</citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>Li</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>) &#x201c;<article-title>Supervised high-dimension endecoder net: 3D end to end prediction network for mark-less human pose estimation from single depth map</article-title>,&#x201d; in <source>2019 5th international conference on control, automation and robotics (ICCAR)</source>. <publisher-name>IEEE</publisher-name>.</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Turkoglu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brachmann</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Schindler</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Visual camera Re-localization using graph neural networks and relative pose supervision</article-title>,&#x201d; in <source>Proceedings of the international conference on 3D vision</source> (<publisher-name>Online</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>3</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wald</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sattler</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Golodetz</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Beyond controlled environments: 3D camera Re-localization in changing indoor scenes</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision</source> (<publisher-loc>Glasgow, UK</publisher-loc>), <fpage>23</fpage>&#x2013;<lpage>28</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>ST-PixLoc: a scene-agnostic network for enhanced camera localization</article-title>. <source>IEEE Access</source> <volume>12</volume>, <fpage>105294</fpage>&#x2013;<lpage>105308</lpage>. <pub-id pub-id-type="doi">10.1109/access.2024.3435851</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>SANet: scene agnostic network for camera localization</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source> (<publisher-loc>Seoul, Korea</publisher-loc>).</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>KFNet: learning temporal camera relocalization using kalman filtering</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source> (<publisher-loc>Seattle, USA</publisher-loc>), <fpage>14</fpage>&#x2013;<lpage>19</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>