<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Earth Sci.</journal-id>
<journal-title>Frontiers in Earth Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Earth Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-6463</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1108403</article-id>
<article-id pub-id-type="doi">10.3389/feart.2023.1108403</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Earth Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A-SATMVSNet: An attention-aware multi-view stereo matching network based on satellite imagery</article-title>
<alt-title alt-title-type="left-running-head">Lin et&#xa0;al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/feart.2023.1108403">10.3389/feart.2023.1108403</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Lin</surname>
<given-names>Li</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1847370/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Yuanben</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Zongji</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2102975/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Lili</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Xiongfei</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2245516/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Qianqian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1940141/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Optics and Photonics</institution>, <institution>Beijing Institute of Technology</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of Network Information System Technology (NIST)</institution>, <institution>Aerospace Information Research Institute</institution>, <institution>Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1449331/overview">Athos Agapiou</ext-link>, Cyprus University of Technology, Cyprus</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2081857/overview">Ming Hao</ext-link>, China University of Mining and Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2221205/overview">Rostam Affendi Hamzah</ext-link>, Technical University of Malaysia Malacca, Malaysia</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2008053/overview">Nurulfajar Abd Manap</ext-link>, Technical University of Malaysia Malacca, Malaysia</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Qianqian Wang, <email>qqwang@bit.edu.cn</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Environmental Informatics and Remote Sensing, a section of the journal Frontiers in Earth Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>04</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>11</volume>
<elocation-id>1108403</elocation-id>
<history>
<date date-type="received">
<day>26</day>
<month>11</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>03</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Lin, Zhang, Wang, Zhang, Liu and Wang.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Lin, Zhang, Wang, Zhang, Liu and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Introduction:</bold> The stereo matching technology of satellite imagery is an important way to reconstruct real world. Most stereo matching technologies for satellite imagery are based on depth learning. However, the existing depth learning based methods have the problems of holes and matching errors in stereo matching tasks.</p>
<p>
<bold>Methods:</bold> In order to improve the effect of satellite image stereo matching results, we propose a satellite image stereo matching network based on attention mechanism (A-SATMVSNet). To solve the problem of insufficient extraction of surface features, a new feature extraction module based on triple dilated convolution with attention module is proposed, which solves the problem of matching holes caused by insufficient extraction of surface features. At the same time, compared with the traditional weighted average method, we design a novel cost-volume method that integrates attention mechanism to reduce the impact of matching errors to improve the accuracy of matching.</p>
<p>
<bold>Results and discussion:</bold> Experiments on public multi-view stereo matching dataset based on satellite imagery demonstrate that the proposed method significantly improves the accuracy and outperforms various previous methods. Our source code is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/MVSer/A-SATMVSNet">https://github.com/MVSer/A-SATMVSNet</ext-link>.</p>
</abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>satellite imagery</kwd>
<kwd>multi-view stereo matching</kwd>
<kwd>convolutional neural network</kwd>
<kwd>attention module</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>3D reconstruction is a key process to restore the geometry of real world. The 3D reconstruction technology of real scenes has been developed extensively, and they all have unique advantages and disadvantages in specific scenes. According to the way of obtaining input data, 3D reconstruction technology can be divided into active reconstruction and passive reconstruction. Active reconstruction technology uses hardwares to directly obtain the geometric information of the shot scene, to obtain the depth information of the target scene. Passive reconstruction technology uses cameras to obtain some images to reconstruct the target scene. Compared with active 3D reconstruction, the camera equipment used in passive reconstruction technology has the advantages of low energy consumption and no direct contact with the real scene. In addition, with the popularity of digital cameras and smart phone lenses, the cost of camera hardware has decreased significantly over the past decade. This means that most people can have a camera and can contribute data to visual databases around the world. Organizing and using these rich and diverse photo data, and reconstructing high-precision, real three-dimensional models, has a very wide range of application scenarios. Thus, how to reconstruct the real world <italic>via</italic> multi-stereo matching is a big challenge that has brought much attention in recent years.</p>
<p>There are many classic methods in multi-view stereo matching based on handcrafted algorithms. These methods can be divided into 4 categories according to the format of output 3D model: voxel-based method, triangular mesh surface based method, point cloud based method and depth map based method (<xref ref-type="bibr" rid="B31">Seitz&#xa0;et&#xa0;al., 2006</xref>).</p>
<p>The voxel based algorithm divides the 3D space into discrete voxel grids, and judges whether voxels belong to the target scene surface by calculating the multi-view photometric consistency metric, so as to realize the reconstruction of the 3D scene (<xref ref-type="bibr" rid="B32">Seitz and Dyer, 1999</xref>; <xref ref-type="bibr" rid="B20">Kutulakos and Seitz, 2000</xref>). <xref ref-type="bibr" rid="B5">Collins (1996)</xref> divided several equidistant planes parallel to the camera plane in the camera space of the reference picture, and divided each plane into a grid. This method back projects the feature points of the multi-view camera into 3D space, and determines whether the plane grid belongs to the scene surface by calculating the number of back projection rays in the plane grid area. Although the representation is slightly different from the voxel grid, this method basically establishes the rudiment of voxel division, but is limited by the computer hardware level of that year, it can only divide the three-dimensional space by dividing the plane first, and then dividing the plane into grids. However, since the voxel partition itself is a kind of spatial discretization, the 3D model reconstructed by voxel based multi-view stereo vision algorithm inevitably has discrete errors. In addition, voxels often occupy a higher storage space, and are stored in limited memory or video memory.</p>
<p>Multi-view stereo vision algorithm based on triangular mesh surface first initializes a shape close to the real object, which is usually obtained by calculating the visible hull (<xref ref-type="bibr" rid="B21">Laurentini, 1994</xref>). <xref ref-type="bibr" rid="B7">Esteban and Schmitt (2004)</xref> proposed to optimize the mesh shape by using texture photometric consistency constraints and contour constraints. <xref ref-type="bibr" rid="B13">Hiep&#xa0;et&#xa0;al. (2009)</xref> first gridded the initial point cloud, and then repaired the grid according to detail restoration, adaptive resolution and photometric consistency. <xref ref-type="bibr" rid="B42">Zaharescu&#xa0;et&#xa0;al. (2010)</xref> proposed a mesh evolution framework based on the self intersection elimination algorithm, which can solve the problem that surface intersection cannot be fused in the iterative optimization process of triangular meshes. In this framework, triangular meshes are optimized based on multi-view photometric consistency constraints. Triangular meshes are easy to store and render, and especially suitable for visual computing. However, algorithms based on triangular meshes often require a better initial shape.</p>
<p>Multi-view stereo vision algorithm based on point cloud directly outputs point cloud 3D model. In order to densify the output point cloud, these methods often use propagation strategies to propagate good matching relationships or 3D points to the neighborhood space. <xref ref-type="bibr" rid="B8">Furukawa and Ponce (2009)</xref> used Harris Corner (<xref ref-type="bibr" rid="B11">Harris and Stephens, 1988</xref>) and Difference of Gaussian (<xref ref-type="bibr" rid="B28">Pitas, 2000</xref>) for feature matching to obtain initial sparse point cloud, and constructed and optimized block model on each point based on multi-view photometric consistency. Because point clouds are reconstructed directly in 3D space, and the distribution of point clouds is not as regular as pixels and voxels, it is difficult for point cloud based 3D reconstruction algorithms to use GPU parallelism to accelerate, which indirectly limits the performance and performance of such methods.</p>
<p>Multi-view stereo vision algorithm based on depth map estimates depth map or even normal vector map for each input image according to multi-view information, and then converts it into point cloud or triangular grid model through depth map fusion strategy. In the multi-view stereo vision algorithm based on depth map, the stereo matching process usually adopts the idea of patch match. Block matching was first proposed by <xref ref-type="bibr" rid="B1">Barnes&#xa0;et&#xa0;al. (2009)</xref>. Its core idea is to first randomly initialize the matching relationship between the pixels of two photos, and then repeatedly spread the matching relationship with high matching degree to the neighborhood for optimization. <xref ref-type="bibr" rid="B3">Bleyer&#xa0;et&#xa0;al. (2011)</xref> first applied the idea of block matching to the field of binocular stereo vision matching, which regards rectangular pixel blocks in block matching as projections of square blocks in 3D space, and can be deformed according to projection transformation rules, and the matching search domain is the projection transformation under polar geometry. <xref ref-type="bibr" rid="B9">Galliani&#xa0;et&#xa0;al. (2015)</xref> improved the neighborhood propagation mode of block matching, enabling it to be parallelized on the GPU, greatly increasing the operation efficiency of the algorithm.</p>
<p>In recent years, deep convolutional neural networks (DCNNs) have been applied to multi-view stereo matching. In comparison with handcrafted algorithms, deep learning-based methods learn features automatically and can obtain low error rates. Early learning based multi-view stereo vision algorithms are all based on voxels. The SurfaceNet proposed in <xref ref-type="bibr" rid="B17">Ji&#xa0;et&#xa0;al. (2017)</xref> learned the weighted average probability of each voxel on the scene surface according to multiple groups of photos. Voxels with a probability greater than a certain threshold were identified as on the scene surface to reconstruct the three-dimensional model of the target scene. However, SurfaceNet is a voxel-based deep learning method, which consumes a lot of memory, thus its reconstruction model can only express limited scenes.</p>
<p>Similar to handcrafted based methods, deep learning based multi-view stereo matching <italic>via</italic> depth map is the best and most popular research direction in this field. DeepMVS (<xref ref-type="bibr" rid="B15">Huang&#xa0;et&#xa0;al., 2018</xref>) is the first network to obtain the depth map based on deep learning. DeepMVS divides a number of front parallel planes in front of the reference camera, and then transforms the projection of multi-view photos onto these planes. The depth neural network is used to extract and aggregate the multi-view information, and regularize the cost volume to estimate the probability of the reference picture pixels in each plane, so as to achieve depth map estimation. The MVSNet (<xref ref-type="bibr" rid="B39">Yao&#xa0;et&#xa0;al., 2018</xref>) first uses the U-Net network to extract the feature map of multi-view photos and project and transform it to the front parallel planes of multiple reference cameras, and then estimates the depth map by constructing a 3D cost volume on the front parallel planes and using 3D convolution neural network to regularize it. To improve the effectiveness of MVSNet, the R-MVSNet was proposed (<xref ref-type="bibr" rid="B40">Yao&#xa0;et&#xa0;al., 2019</xref>). R-MVSNet constructed 2D cost map and used sequential regularization instead of 3D cost volume and 3D convolutional neural network regularization, which improved the scalability of the network to a certain extent. However, the quality of its network output depth map is slightly worse than MVSNet, and the final reconstruction point cloud is better than MVSNet only by using the variational depth map repair algorithm to repair the depth map. 
The PVA-MVSNet (<xref ref-type="bibr" rid="B41">Yi&#xa0;et&#xa0;al., 2020</xref>) fills the area of high resolution stereo matching errors by building a pyramid structure to aggregate low resolution reliable depth estimates, and improves the reconstruction quality through adaptive perspective aggregation. The MVSNet-based multi-stereo matching methods are the mainstream in current study. To improve the accuracy and completion, a cascade network with a multiple cost volume aggregation module is proposed <xref ref-type="bibr" rid="B24">Li&#xa0;et&#xa0;al., 2022b</xref>. <xref ref-type="bibr" rid="B43">Zhang&#xa0;et&#xa0;al. (2023)</xref> explicitly infered and integrated the pixel-wise occlusion information in the MVSNet <italic>via</italic> the matching uncertainty estimation. Attention module and Transformer are the hot spots in current study, thus a few algorithms based on attention and transformer were proposed <xref ref-type="bibr" rid="B25">Liao&#xa0;et&#xa0;al. (2022)</xref>; <xref ref-type="bibr" rid="B38">Weilharter and Fraundorfer, 2022</xref>; <xref ref-type="bibr" rid="B22">Li&#xa0;et&#xa0;al., 2022a</xref>; <xref ref-type="bibr" rid="B36">Wan&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B37">Wang&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B18">Jia&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B6">Ding&#xa0;et&#xa0;al., 2022</xref>.</p>
<p>However, the large-scale reconstructed scene <italic>via</italic> the current attention based MVSNet is inaccurate and incomplete. To further improve the effect on multi-view stereo matching, in this study, we propose a novel attention-aware multi-view stereo network based on satellite imagery, namely, A-SATMVSNet. To solve the problem of insufficient extraction of surface features, a new feature extraction module based on triple dilated convolution with attention module is proposed, which solves the problem of matching holes caused by insufficient extraction of surface features. At the same time, compared with the traditional weighted average method, we design a novel cost-volume method that integrates attention mechanism to reduce the impact of matching errors to improve the accuracy of matching. Experiments on public multi-view stereo matching dataset based on satellite imagery demonstrate that the proposed method significantly improves the highest accuracy and outperforms various previous methods.</p>
<p>We explicitly state our original contributions as follows:<list list-type="simple">
<list-item>
<p>1. We propose a new feature extraction module based on triple dilated convolution with attention model to solve the problem of insufficient extraction of surface feature;</p>
</list-item>
<list-item>
<p>2. Compared with traditional weighted average method, we propose a novel const-volume method that integrates attention mechanism to reduce the impact of matching errors to improve the accuracy in matching stage;</p>
</list-item>
<list-item>
<p>3. We achieve a new state-of-the-art on public multi-view stereo matching dataset based on satellite imagery.</p>
</list-item>
</list>
</p>
<p>The remainder of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section&#xa0;2</xref> presents the components of our proposed A-SATMVSNet. <xref ref-type="sec" rid="s3">Section&#xa0;3</xref> reports the extensive experimental results and evaluates the performance of the proposed method. <xref ref-type="sec" rid="s4">Section&#xa0;4</xref> presents the discussion. Finally, <xref ref-type="sec" rid="s5">Section&#xa0;5</xref> provides the conclusions and hints at plausible future research.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Problem definition</title>
<p>In the satellite MVS task, our goal is to use an end-to-end coarse-to-fine framework to predict the height map <italic>H</italic> by leveraging the matching relationship between <italic>N</italic> &#x2212; 1 adjacent views and the corresponding camera parameters. First of all, we extract the image features <inline-formula id="inf1">
<mml:math id="m1">
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> from the reference images <italic>I</italic>
<sub>0</sub> and source images <inline-formula id="inf2">
<mml:math id="m2">
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>. Then the cost volume <italic>V</italic> is constructed by the differentiable RPC warping based on hypothetical height planes <italic>D</italic>. Next, a regularization process is executed on cost volume by a 3D Unet. After regularization, the regularized cost volume <italic>V</italic>
<sub>
<italic>re</italic>
</sub> regresses a probability volume <italic>P</italic> by the softmax operation. Lastly, the final height map <italic>H</italic> is calculated by the hypothetical height planes <italic>D</italic> and probability volume <italic>P</italic>. At the inference stage, the trained model on the satellite MVS task must infer the depth maps of all views of all scenes. Finally, a depth map fusion method is used to obtain point clouds.</p>
</sec>
<sec id="s2-2">
<title>2.2 Overview of the proposed framework</title>
<p>Our proposed A-SATMVSNet is an trainable framework, which consists of two import parts: feature extraction and cost volume construction. As shown in <xref ref-type="fig" rid="F1">Figure&#xa0;1</xref>, the N input images <inline-formula id="inf3">
<mml:math id="m3">
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> are sent to multi-scale feature extraction module. After feature extraction, the multi-scale feature maps <inline-formula id="inf4">
<mml:math id="m4">
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mfenced open="" close="}">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> are fed into the cost volume <italic>C</italic> construction part in three stages. The cost volume <italic>C</italic> is constructed by the differentiable rpc warping (<xref ref-type="sec" rid="s2-5">Section&#xa0;2.5</xref>). Then, the obtain cost volume <italic>C</italic> are regularized to generate probability volumes <italic>P</italic> by the softmax operation. Finally, the height maps can be obtained through regression.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The framework of the proposed A-SATMVSNet.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g001.tif"/>
</fig>
</sec>
<sec id="s2-3">
<title>2.3 Attention-aware multi-scale feature extraction module</title>
<p>In this section, we mainly describe the proposed attention-aware multi-scale feature extraction module. There are many popular feature extraction modules such as UNet-based (<xref ref-type="bibr" rid="B29">Ronneberger&#xa0;et&#xa0;al., 2015</xref>; <xref ref-type="bibr" rid="B16">Isensee&#xa0;et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B23">Li&#xa0;et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B27">Oktay&#xa0;et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B14">Huang&#xa0;et&#xa0;al., 2020</xref>), feature pyramid network-based (<xref ref-type="bibr" rid="B26">Lin&#xa0;et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B19">Kim&#xa0;et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B30">Seferbekov&#xa0;et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B44">Zhao&#xa0;et&#xa0;al., 2021</xref>), resnet-based (<xref ref-type="bibr" rid="B12">He&#xa0;et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B35">Targ&#xa0;et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B33">Szegedy&#xa0;et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B2">Bharati&#xa0;et&#xa0;al., 2021</xref>), <italic>etc.</italic> All the above feature extraction modules perform well in multi-view stereo matching tasks. In our study, we propose a new feature extraction module based on (<xref ref-type="bibr" rid="B4">Cheng&#xa0;et&#xa0;al., 2020</xref>) where it is combined with an attention module. The basic module consists of an encoder and a decoder with skip connection. The module outputs a three-scale feature pyramid whose size is {1/16, 1/4, 1} of the input satellite image size, and the number of feature channels is 32, 16, and 8 respectively. In the encoder part, an attention module is designed. The attention module and feature extraction layer in encoder and decoder part are shown in <xref ref-type="fig" rid="F2">Figure&#xa0;2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The architecture of the proposed feature extraction module. <bold>(A)</bold> Represents the architecture of the detailed feature extraction layer in encoder network with attention module. <bold>(B)</bold> Represents the architecture of the attention module. <bold>(C)</bold> Is the decoder network.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g002.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F2">Figure&#xa0;2A</xref> shows the architecture of the detailed feature extraction layer in encoder network with attention module. First, a convolution layer with 3 &#xd7; 3 kernel size is used to extract features. After that, the feature map is sent to three different dilated convolution layers with dilation rate of 2, 3 and 4 respectively. Then, all the three output feature maps are sent to a 3 &#xd7; 3 convolution layer with an attention module. Finally, the three output feature maps are concatenated to generate a new feature map, as the final feature map. The formulation of our triple dilate convolution is defined as follow:<disp-formula id="e1">
<mml:math id="m5">
<mml:mtable class="align" columnalign="left">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mfenced open="" close="]">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(1)</label>
</disp-formula>where &#x2297; represents the multiply operation, &#x2299; denotes the element-wise product, <italic>w</italic>
<sub>
<italic>i</italic>
</sub> represents the <italic>ith</italic> weights of dilate convolution.</p>
<p>
<xref ref-type="fig" rid="F2">Figure&#xa0;2B</xref> shows the architecture of the attention module. The input feature map is defined as <italic>F</italic>
<sub>
<italic>in</italic>
</sub>. Two convolution layers with a kernel of 3 &#xd7; 3 are employed to generate further features <inline-formula id="inf50">
<mml:math id="m58">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>_</mml:mtext>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>. Then, a sigmoid function is used to obtain attention weights defined as <italic>F</italic>
<sub>
<italic>w</italic>
</sub>. The final output feature is defined as <italic>F</italic>
<sub>
<italic>out</italic>
</sub>, which is calculated as:<disp-formula id="e2">
<mml:math id="m6">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>_</mml:mtext>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>
<xref ref-type="fig" rid="F2">Figure&#xa0;2C</xref> is the architecture of the decoder network, which consists of a deconvolution layer with a kernel size of 3 &#xd7; 3, stride size of 2 and a convolution layer with a stride size of 1.</p>
</sec>
<sec id="s2-4">
<title>2.4 Rational polynomial camera model (RPC)</title>
<p>The rational polynomial camera model (RPC) is extensively used in satellite imagery processing, which connects the image points and corresponding world coordinate points with cubic rational polynomial coefficients (<xref ref-type="bibr" rid="B10">Gao&#xa0;et&#xa0;al., 2021</xref>). We define the world coordinates as <inline-formula id="inf5">
<mml:math id="m7">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> which represents the latitude, longitude and height. The corresponding normalized image coordinates are defined as <inline-formula id="inf6">
<mml:math id="m8">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>. <italic>P</italic>
<sup>
<italic>fwd</italic>
</sup> and <italic>P</italic>
<sup>
<italic>inv</italic>
</sup> are both cubic polynomials. The transformation between world coordinates and image coordinates is shown as below:<disp-formula id="e3">
<mml:math id="m9">
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">inv</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">inv</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">inv</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">inv</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>s</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">fwd</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">fwd</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">fwd</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">fwd</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>Z</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ijk</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>In the multi-view stereo matching task <italic>via</italic> satellite imagery, the RPC model is a widely used geometric model, which can provide accuracy comparable to that of the rigorous sensor model (RSM) (<xref ref-type="bibr" rid="B34">Tao and Hu, 2001</xref>).</p>
</sec>
<sec id="s2-5">
<title>2.5 Differentiable RPC warping</title>
<p>Currently, most state-of-the-art MVS methods warp the source views to a reference view to obtain per-view matching feature volumes by a homography matrix and a set of fronto-parallel depth hypotheses planes <italic>D</italic>. The definition of the differentiable homography based on the pin-hole camera model is as below:<disp-formula id="e4">
<mml:math id="m10">
<mml:msubsup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>d</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mspace width="0.1em"/>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mspace width="0.1em"/>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>where <italic>T</italic> and <italic>K</italic> denote camera extrinsic and intrinsic respectively. Compared with pin-hole camera model, the cubic rational polynomial camera (RPC) model is widely used in satellite domain, which has the advantage than all camera models, <italic>e.g</italic>., projective, affine and the linear pushbroom. A matrix alone cannot formulate the warping of the RPC model due to its complexity. In this regard, SatMVS proposes a rigorous and efficient RPC warping module that is fundamentally a high-order tensor transformation, which is fundamental to the structure of SatMVS. Using a set of hypothetical height planes in the world coordinate system, the RPC warping module projects images from different views to the reference view, instead of the fronto-parallel planes of a reference view, because the RPC model does not include explicit physical parameters for defining the front of a camera.</p>
<p>Firstly, SatMVS transforms the ternary cubic polynomial by using cubic polynomials to a quaternion cubic homogeneous polynomial <inline-formula id="inf7">
<mml:math id="m11">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, where <inline-formula id="inf8">
<mml:math id="m12">
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mn>1,2,3,4</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>. And <italic>X</italic> is expressed as a tensor, which consists of four variables <italic>x</italic>
<sub>1</sub>, <italic>x</italic>
<sub>2</sub>, <italic>x</italic>
<sub>3</sub>, <italic>x</italic>
<sub>4</sub>, <italic>i.e</italic>., <inline-formula id="inf9">
<mml:math id="m13">
<mml:mi>X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. Besides, <italic>T</italic> is also expressed as the polynomial coefficients, whose shape is 4 &#xd7; 4 &#xd7; 4. After the tensor contraction operation, the numerator and denominator of the RPC model can be defined as below:<disp-formula id="e5">
<mml:math id="m14">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ijk</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>By extension, the formulation of the RPC model with a set of points is defined as below:<disp-formula id="e6">
<mml:math id="m15">
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ijk</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf10">
<mml:math id="m16">
<mml:msup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> represents the <italic>mth</italic> point in the <italic>bth</italic> batch and <inline-formula id="inf11">
<mml:math id="m17">
<mml:msup>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> represents the coefficient tensor in the <italic>bth</italic> batch. Through element-wise division, the RPC warping of all the points in a batch can be calculated in one shot.</p>
</sec>
<sec id="s2-6">
<title>2.6 Feature volume adaptive aggregation</title>
<p>Previous methods usually aggregate the feature volumes to a cost volume by leveraging the cost metric (<xref ref-type="bibr" rid="B13">Hiep&#xa0;et&#xa0;al., 2009</xref>). The common practice is to use the variance-based cost metric (CM) to average <italic>N</italic> &#x2212; 1 feature volumes. CM considers that the confidence values of the corresponding pixels between the corresponding feature volumes of each view are equally important. The formulation of the variance-based cost metric is defined as below:<disp-formula id="e7">
<mml:math id="m18">
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>M</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>where <inline-formula id="inf12">
<mml:math id="m19">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> represents the average volume among all feature volumes. However, equal importance is obviously not reasonable, because the satellite images taken by the Ziyuan-3 (ZY-3) satellite have varying shooting camera angles, which may affect the confidences in the feature volumes due to the matching errors caused by different conditions such as occlusion and non-Lambertian surfaces. If we utilize Eq.&#xa0;<xref ref-type="disp-formula" rid="e7">7</xref> to calculate the cost volume, it will affect the final height map estimation.</p>
<p>Therefore, as illustrated in <xref ref-type="fig" rid="F3">Figure&#xa0;3</xref>, we design an adaptive feature volume aggregation module to calculate an aggregation weighting volume for each feature volume to achieve unequal confidence aggregation. Our module is defined as Eq.&#xa0;<xref ref-type="disp-formula" rid="e8">8</xref>:</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Illustration of the proposed adaptive feature volume aggregation module.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g003.tif"/>
</fig>
<p>In this way, pixels that may cause the matching errors are suppressed, <italic>i.e</italic>., the confidences corresponding to pixels are allocated the lower weight, while those with critical feature information are given higher weight. We also formulate our adaptive feature volume aggregation module as follows:<disp-formula id="e8">
<mml:math id="m20">
<mml:msup>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3c9;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2299;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:math>
<label>(8)</label>
</disp-formula>where &#x2299; denotes Hadamard multiplication and <italic>&#x3c9;</italic>(&#x2026;) is the pixel-wise attention map adaptively yielded according to per-view cost volumes.</p>
</sec>
<sec id="s2-7">
<title>2.7 Cost volume regularization</title>
<p>Cost volume regularization (regression to obtain height map) can be seen as a segmentation problem and is handled using the UNet commonly used for semantic segmentation tasks. Therefore, similar to the UNet-shape network used by the previous methods for cost volume regularization, we adopt a similar multi-stage 3D UNet to aggregate neighboring information from a large receptive field, which is composed of three stages (downsample, bottleneck, upsample). First, in the downsampling stage, we leverage the ordinary convolution to obtain the intermediate volume <inline-formula id="inf13">
<mml:math id="m21">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> by three times downsampling. Then we use a bottleneck to learn the high level depth features. We obtain the final regularized cost volume by multiple deconvolutions and skip connections. And the skip connections are used to transfer the corresponding scale intermediate volume <inline-formula id="inf14">
<mml:math id="m22">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. The details for the network are shown in <xref ref-type="table" rid="T1">Table&#xa0;1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>The detailed architecture of the 3D UNet. Each convolutional layer represents a block of convolution, batch normalization (BN) and ReLU; &#x201c;sp&#x201d; means skip connection; &#x201c;H&#x201d; and &#x201c;W&#x201d; denote the height and width of the reference image, respectively.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="4" align="center">Cost volume size: <inline-formula id="inf15">
<mml:math id="m23">
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Name</td>
<td colspan="2" align="center">Layer Description</td>
<td align="center">Output Size</td>
</tr>
<tr>
<td colspan="4" align="center">Downsample Layers</td>
</tr>
<tr>
<td align="center">&#xa0;conv0&#x5f;1</td>
<td align="center">3D-Conv</td>
<td align="center">3 &#xd7; 3 &#xd7; 3,stride&#x3d;1</td>
<td align="center">
<inline-formula id="inf16">
<mml:math id="m24">
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="center">1 &#xd7; 1 &#xd7; 1,stride&#x3d;1</td>
<td align="left"/>
</tr>
<tr>
<td align="center">&#xa0;conv1&#x5f;0/conv2&#x5f;0/conv3&#x5f;0</td>
<td align="center">3D-Conv</td>
<td align="center">3 &#xd7; 3 &#xd7; 3,stride&#x3d;2</td>
<td align="center">
<inline-formula id="inf17">
<mml:math id="m25">
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf18">
<mml:math id="m26">
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf19">
<mml:math id="m27">
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="center">1 &#xd7; 1 &#xd7; 1,stride&#x3d;1</td>
<td align="left"/>
</tr>
<tr>
<td align="center">&#xa0;conv1&#x5f;1/conv2&#x5f;1/conv3&#x5f;1</td>
<td align="center">3D-Conv</td>
<td align="center">3 &#xd7; 3 &#xd7; 3,stride&#x3d;1</td>
<td align="center">
<inline-formula id="inf20">
<mml:math id="m28">
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf21">
<mml:math id="m29">
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf22">
<mml:math id="m30">
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="center">1 &#xd7; 1 &#xd7; 1,stride&#x3d;1</td>
<td align="left"/>
</tr>
<tr>
<td colspan="4" align="center">Bottleneck</td>
</tr>
<tr>
<td align="center">&#xa0;bc0&#x5f;1</td>
<td colspan="2" align="center">3 &#xd7; 3 &#xd7; 3&#xa0;3D CNN, stride&#x3d;2</td>
<td align="center">
<inline-formula id="inf23">
<mml:math id="m31">
<mml:mn>128</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">&#xa0;bc0&#x5f;2</td>
<td colspan="2" align="center">1 &#xd7; 1 &#xd7; 1&#xa0;3D CNN, stride&#x3d;1</td>
<td align="center">
<inline-formula id="inf24">
<mml:math id="m32">
<mml:mn>128</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">&#xa0;bc0&#x5f;3</td>
<td colspan="2" align="center">3 &#xd7; 3 &#xd7; 3&#xa0;transpose&#xa0;3D CNN, stride&#x3d;2</td>
<td align="center">
<inline-formula id="inf25">
<mml:math id="m33">
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td colspan="4" align="center">Upsample Layers</td>
</tr>
<tr>
<td align="center">&#xa0;conv4&#x5f;0/conv5&#x5f;0/conv6&#x5f;0</td>
<td align="center">transpose&#xa0;3D-Conv</td>
<td align="center">3 &#xd7; 3 &#xd7; 3,stride&#x3d;2</td>
<td align="center">
<inline-formula id="inf26">
<mml:math id="m34">
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf27">
<mml:math id="m35">
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf28">
<mml:math id="m36">
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="center">1 &#xd7; 1 &#xd7; 1,stride&#x3d;1</td>
<td align="left"/>
</tr>
<tr>
<td align="center">&#xa0;sp</td>
<td colspan="2" align="center">conv2&#x5f;1 feature add conv4&#x5f;0 feature</td>
<td align="center">
<inline-formula id="inf29">
<mml:math id="m37">
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf30">
<mml:math id="m38">
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>/<inline-formula id="inf31">
<mml:math id="m39">
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">&#xa0;conv7</td>
<td colspan="2" align="center">3 &#xd7; 3 &#xd7; 3 3D-Conv, stride&#x3d;1, no BN and ReLU</td>
<td align="center">
<inline-formula id="inf32">
<mml:math id="m40">
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-8">
<title>2.8 Implementation detail</title>
<sec id="s2-8-1">
<title>2.8.1 Training</title>
<p>Our A-SATMVSNet is implemented using PyTorch, which is trained on TLC SatMVS training dataset for evaluation on TLC SatMVS testing dataset. The preprocessing strategies and selection of input views follow common strategies in a representative previous work (<xref ref-type="bibr" rid="B10">Gao&#xa0;et&#xa0;al., 2021</xref>). We train and validate our model on the TLC SatMVS training and validation sets, respectively. In the training process, we set the input image resolution to 768 &#xd7; 384, and the number of training views to <italic>N</italic> &#x3d; 3. The hypothetical height planes of our framework are set as {64, 32, 8}. We optimize our model for 16 epochs with the Adam optimizer; meanwhile, the learning rate is set to 0.001. Besides, we set the batch size as 4 and train our model on 4 NVIDIA GTX 2080 GPU devices. We adopt the metrics (MAE, RMSE, &#x3c;2.5<italic>m</italic>, &#x3c;7.5<italic>m</italic>, Comp and Runtime) provided by the SatMVS to evaluate the quality of the height maps obtained by our model.</p>
</sec>
<sec id="s2-8-2">
<title>2.8.2 Testing</title>
<p>We test on TLC SatMVS testing dataset with our best result. We set the adjacent image number <italic>N</italic> &#x3d; 3, the image resolution as 768 &#xd7; 384, and the hypothetical height planes for testing as {64, 32, 8}.</p>
</sec>
<sec id="s2-8-3">
<title>2.8.3 Evaluation metrics</title>
<p>We adopt four common metrics to evaluate the quantitative results of the final height maps.<list list-type="simple">
<list-item>
<p>1. <bold>MAE:</bold> the mean absolute error, <italic>i.e.,</italic> the mean of the <italic>L</italic>
<sub>1</sub> distance over all pixels between the GT height map and predicted height map.</p>
</list-item>
<list-item>
<p>2. <bold>RMSE:</bold> the root-mean-square-error, <italic>i.e.,</italic> the standard deviation of the residuals between the GT height and predicted height map.</p>
</list-item>
<list-item>
<p>3. <bold>
<italic>&#x3c;</italic>
</bold>
<bold>2.5</bold>&#xa0;<bold>m</bold>, <bold>
<italic>&#x3c;</italic>
</bold>
<bold>7.5</bold>&#xa0;<bold>m:</bold> percentage of all pixels with <italic>L</italic>
<sub>1</sub> distance errors below the 2.5<italic>m</italic> and 7.5<italic>m</italic> thresholds.</p>
</list-item>
<list-item>
<p>4. <bold>Comp:</bold> percentage of all pixels with valid height values in the final height map.</p>
</list-item>
</list>
</p>
</sec>
</sec>
<sec id="s2-9">
<title>2.9 Experimental dataset</title>
<p>In this paper, we adopt the TLC SatMVS dataset proposed by SatMVS. The TLC SatMVS dataset consists of the triple-view images, and the height maps, which are generated by projecting the GT DSMs with the corresponding RPC parameters of TLC cameras, which are mounted on the Ziyuan-3 (ZY-3) satellite, as shown in <xref ref-type="fig" rid="F4">Figure&#xa0;4</xref>. And the GT DSMs are obtained from high-accuracy LiDAR observations and ground control point (GCP)-supported photogrammetric software. The dataset consists of 5,011 image patches with resolution 768 &#xd7; 384.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Samples of TLC SatMVS dataset. <bold>(A)</bold> Represents the sample of input images in the dataset. <bold>(B)</bold> Represents the Ground Truth depth maps corresponding to the input images.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g004.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 Evaluation on the TLC SatMVS dataset</title>
<p>
<xref ref-type="fig" rid="F5">Figure&#xa0;5</xref> shows the visualization results of the proposed method on TLC SatMVS Dataset. We restore the depth map from three images. It can be seen that in mountainous areas with large topographic relief, this method has a certain effect on depth estimation of multi-view remote sensing images, which verifies its effectiveness.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Visualization results of the proposed methods on TLC SatMVS Dataset.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g005.tif"/>
</fig>
<p>
<xref ref-type="table" rid="T2">Table&#xa0;2</xref> shows the quantitative results on TLC SatMVS Dataset. We compare with traditional and deep learning based MVS methods for satellite images to demonstrate the effectiveness of our model. We have the following observations: 1) We can observe that our method achieves the best among current state-of-the-arts methods in the metrics (MAE:1.597, RMSE:2.036, &#x3c;2.5<italic>m</italic>:82.68, Comp:84.32). 2) For traditional MVS method which adopts the pin-hole camera model, <italic>e.g</italic>., adapted COLMAP, our model outperforms it in all metrics (MAE, RMSE, <inline-formula id="inf33">
<mml:math id="m41">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>2.5</mml:mn>
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula>, <inline-formula id="inf34">
<mml:math id="m42">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>7.5</mml:mn>
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula>, Comp, Runtime). 3) Compared with RED-Net, CasMVSNet and UCS-Net, which adopt the pin-hole camera model, although our model has lower scores in <inline-formula id="inf35">
<mml:math id="m43">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>7.5</mml:mn>
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> and Runtime, our model achieves SOTA results in other metrics. 4) Furthermore, we also compare the proposed method with the state-of-the-art models in satellite MVS domain, <italic>e.g</italic>., SatMVS(REDNet), SatMVS(CasMVSNet), SatMVS(UCS-Net). We can observe that our model achieves SOTA results in MAE, RMSE, <inline-formula id="inf36">
<mml:math id="m44">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>2.5</mml:mn>
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula>, Comp. Besides, we are very close to the current SOTA in terms of <inline-formula id="inf37">
<mml:math id="m45">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>7.5</mml:mn>
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula>, which exhibits that our method has comparable performance. 5) We can observe that our A-SATMVSNet has a competitive inference time in <xref ref-type="table" rid="T2">Table&#xa0;2</xref>. Specifically, our method adds only a slight overhead in inference Runtime, while substantially outperforming other satellite-domain methods in most metrics, <italic>e.g</italic>., MAE, RMSE, &#x3c;2.5<italic>m</italic> and Comp.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Quantitative results of the different MVS methods on the TLC SatMVS dataset. The proposed SatMVS with RPC warping implements three different learning-based MVS methods for height inference. Numbers in bold indicate the best results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">MAE(m) <italic>&#x2193;</italic>
</th>
<th align="center">RMSE(m) <italic>&#x2193;</italic>
</th>
<th align="center">
<inline-formula id="inf38">
<mml:math id="m46">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>2.5&#xa0;m (%) <italic>&#x2191;</italic>
</th>
<th align="center">
<inline-formula id="inf39">
<mml:math id="m47">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>7.5&#xa0;m (%) <italic>&#x2191;</italic>
</th>
<th align="center">Comp (%) <italic>&#x2191;</italic>
</th>
<th align="center">Runtime (min:s) <italic>&#x2193;</italic>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">adapted COLMAP</td>
<td align="center">2.227</td>
<td align="center">5.291</td>
<td align="center">73.35</td>
<td align="center">96.00</td>
<td align="center">79.10</td>
<td align="center">77&#xa0;min2&#xa0;s</td>
</tr>
<tr>
<td align="center">RED-Net</td>
<td align="center">2.171</td>
<td align="center">4.514</td>
<td align="center">74.13</td>
<td align="center">95.91</td>
<td align="center">81.82</td>
<td align="center">9&#xa0;min:15&#xa0;s</td>
</tr>
<tr>
<td align="center">CasMVSNet</td>
<td align="center">2.031</td>
<td align="center">4.351</td>
<td align="center">77.39</td>
<td align="center">96.53</td>
<td align="center">82.33</td>
<td align="center">4&#xa0;min:02&#xa0;s</td>
</tr>
<tr>
<td align="center">UCS-Net</td>
<td align="center">2.039</td>
<td align="center">4.084</td>
<td align="center">76.40</td>
<td align="center">96.66</td>
<td align="center">82.08</td>
<td align="center">
<bold>3</bold>&#xa0;<bold>min:47</bold>&#xa0;<bold>s</bold>
</td>
</tr>
<tr>
<td align="center">SatMVS(RED-Net)</td>
<td align="center">1.945</td>
<td align="center">4.070</td>
<td align="center">77.93</td>
<td align="center">96.59</td>
<td align="center">82.29</td>
<td align="center">13&#xa0;min:52&#xa0;s</td>
</tr>
<tr>
<td align="center">SatMVS(CasMVSNet)</td>
<td align="center">2.020</td>
<td align="center">3.841</td>
<td align="center">76.79</td>
<td align="center">
<bold>96.73</bold>
</td>
<td align="center">81.54</td>
<td align="center">12&#xa0;min:20&#xa0;s</td>
</tr>
<tr>
<td align="center">SatMVS(UCS-Net)</td>
<td align="center">2.026</td>
<td align="center">3.921</td>
<td align="center">77.01</td>
<td align="center">96.54</td>
<td align="center">82.21</td>
<td align="center">13&#xa0;min:17&#xa0;s</td>
</tr>
<tr>
<td align="center">A-SATMVSNet</td>
<td align="center">
<bold>1.597</bold>
</td>
<td align="center">
<bold>2.036</bold>
</td>
<td align="center">
<bold>82.68</bold>
</td>
<td align="center">96.48</td>
<td align="center">
<bold>84.32</bold>
</td>
<td align="center">14&#xa0;min:53&#xa0;s</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="fig" rid="F6">Figure&#xa0;6</xref> shows the visualization results of different compared methods on TLC SatMVS dataset. It can be seen that the details of the SatMVS(CasMVSNet) method at the corners are seriously missing. However, the results of the method in this paper are more realistic in detail, and are closer to the ground truth.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Visualization results of different compared methods. The differences are highlighted <italic>via</italic> red boxes in the results.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g006.tif"/>
</fig>
<p>The graph representations of MAE, RMSE, <inline-formula id="inf40">
<mml:math id="m48">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>2.5&#xa0;m, and <inline-formula id="inf41">
<mml:math id="m49">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>7.5&#xa0;m are visualized in <xref ref-type="fig" rid="F7">Figure&#xa0;7</xref>, and it is observed that all four metrics exhibit convergence as the epoch increases. This indicates that the model&#x2019;s performance gradually improves with additional training epochs. Overall, the visualization of these metrics serves as a useful tool for monitoring and evaluating the performance of the model during training.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Visualization results of the graph representations of MAE, RMSE, <inline-formula id="inf42">
<mml:math id="m50">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>2.5&#xa0;m, and <inline-formula id="inf43">
<mml:math id="m51">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>7.5&#xa0;m.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g007.tif"/>
</fig>
<p>Additionally, we also exhibit the visualization results of our proposed method for four different types of areas: a) darker areas, b) discontinuous areas, c) weakly textured areas, and d) areas with strong height variations, as shown in <xref ref-type="fig" rid="F8">Figure&#xa0;8</xref>. Our observations are as follows: In a), despite the overall darkness of the scene, our model effectively estimates the height map of the red-boxed area and accurately describes the undulations of the terrain. In b), the red-boxed area is clearly discontinuous with the surrounding area, but our method still produces accurate height estimations without any false noise heights. In c), even though the texture of terrain in the red-boxed area is not very distinct, our method effectively estimates the height map for each pothole. In d), the image contains significant sharp height shifts and some colour noise, but this does not affect the effectiveness of our model in estimating the height map.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Visualization results of four different types of areas. <bold>(A)</bold> Darker areas. <bold>(B)</bold> Discontinuous areas. <bold>(C)</bold> Weakly textured areas. <bold>(D)</bold> Areas with strong height variations.</p>
</caption>
<graphic xlink:href="feart-11-1108403-g008.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Evaluation on the large-size satellite images</title>
<p>We train and validate the large-size satellite images (5,120 &#xd7; 5,120) with RED-Net, SatMVS(RED-Net) and our A-SATMVSNet on an NVIDIA RTX 3090. The results are shown in <xref ref-type="table" rid="T3">Table&#xa0;3</xref>. We can observe that although using the large-size satellite images as input data, which affects the performance of our model, our method still maintains a relatively excellent performance compared with other methods. This confirms the effectiveness and advantage of our proposed model.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Quantitative results of the SatMVS(RED-Net) and the RED-Net (with fitted pinhole model) on the TLC SatMVS dataset with different sizes. Numbers in bold indicate the best results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">MAE(m) <italic>&#x2193;</italic>
</th>
<th align="center">RMSE(m) <italic>&#x2193;</italic>
</th>
<th align="center">
<inline-formula id="inf44">
<mml:math id="m52">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>2.5&#xa0;m (%) <italic>&#x2191;</italic>
</th>
<th align="center">
<inline-formula id="inf45">
<mml:math id="m53">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>7.5&#xa0;m (%) <italic>&#x2191;</italic>
</th>
<th align="center">Comp (%) <italic>&#x2191;</italic>
</th>
<th align="center">Runtime (min:s) <italic>&#x2193;</italic>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">RED-Net</td>
<td align="center">2.517</td>
<td align="center">4.873</td>
<td align="center">66.42</td>
<td align="center">95.53</td>
<td align="center">81.44</td>
<td align="center">
<bold>4</bold>&#xa0;<bold>min:17</bold>&#xa0;<bold>s</bold>
</td>
</tr>
<tr>
<td align="center">SatMVS(RED-Net)</td>
<td align="center">1.946</td>
<td align="center">4.224</td>
<td align="center">77.88</td>
<td align="center">
<bold>96.54</bold>
</td>
<td align="center">82.35</td>
<td align="center">5&#xa0;min:52&#xa0;s</td>
</tr>
<tr>
<td align="center">A-SATMVSNet</td>
<td align="center">
<bold>1.603</bold>
</td>
<td align="center">
<bold>2.279</bold>
</td>
<td align="center">
<bold>80.24</bold>
</td>
<td align="center">96.46</td>
<td align="center">
<bold>83.24</bold>
</td>
<td align="center">6&#xa0;min:22&#xa0;s</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>In this section, we provide ablation experiments to quantitatively and qualitatively evaluate the effectiveness of each of our proposed modules. The following proposed ablation results are processed on TLC SatMVS dataset using similar parameters to those in Section&#xa0;2.5. We adopt the decreasing comparison to validate the effectiveness of each of our proposed modules. The quantitative results are shown in <xref ref-type="table" rid="T4">Table&#xa0;4</xref>. Comparing Row1 with Row3 in <xref ref-type="table" rid="T4">Table&#xa0;4</xref> indicates that our proposed Feature Extraction Network significantly improves the performance, <italic>i.e</italic>., MAE from 1.892 to 1.603 (SOTA). And we also observe that our Adaptive Volume Aggregation module can effectively improve the performance, <italic>i.e</italic>., RMSE from 2.253 to 2.279. It is clear that each individual module can significantly enhance model effects, and the two modules are complementary in A-SATMVSNet to achieve the best performance. Furthermore, we conducted sensitivity experiments on the depth Hypothesis Numbers and resolution of images, as presented in <xref ref-type="table" rid="T5">Table&#xa0;5</xref>. Specifically, we compared the reconstruction quality of <bold>D</bold>
<italic>num</italic>&#x3d;[48, 16, 8], <bold>D</bold>
<italic>num</italic>&#x3d;[64, 16, 8], and <bold>D</bold>
<italic>num</italic>&#x3d;[64, 32, 8], while keeping <italic>N</italic> fixed at 3 and image resolution at 384 &#xd7; 768. The results in <xref ref-type="table" rid="T5">Table&#xa0;5</xref> demonstrate that finer depth division <bold>D</bold>
<italic>num</italic> can enhance the reconstruction quality across all metrics.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Ablation results. &#x2018;FE&#x2019; represents feature extraction module; &#x2018;VA&#x2019; represents adaptive volume aggregation. Numbers in bold indicate the best results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">MAE(m)</th>
<th align="center">RMSE(m)</th>
<th align="center">
<inline-formula id="inf46">
<mml:math id="m54">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>2.5&#xa0;m (%)</th>
<th align="center">
<inline-formula id="inf47">
<mml:math id="m55">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>7.5&#xa0;m (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">FE</td>
<td align="center">1.892</td>
<td align="center">2.413</td>
<td align="center">78.91</td>
<td align="center">95.79</td>
</tr>
<tr>
<td align="center">VA</td>
<td align="center">1.684</td>
<td align="center">2.253</td>
<td align="center">80.28</td>
<td align="center">96.02</td>
</tr>
<tr>
<td align="center">Overall</td>
<td align="center">
<bold>1.603</bold>
</td>
<td align="center">
<bold>2.279</bold>
</td>
<td align="center">
<bold>80.24</bold>
</td>
<td align="center">
<bold>96.46</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Ablation study on number of depth hypothesis planes <bold>D</bold>
<sub>
<italic>num</italic>
</sub> and resolutions of input images <italic>W</italic> and <italic>H</italic> on TLC SatMVS Dataset. Numbers in bold indicate the best results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">
<italic>N</italic>
</th>
<th align="center">
<bold>D</bold>
<sub>
<italic>num</italic>
</sub>
</th>
<th align="center">Resolution&#xa0;(<italic>H</italic> &#xd7; <italic>W</italic>)</th>
<th align="center">MAE(m)</th>
<th align="center">RMSE(m)</th>
<th align="center">
<inline-formula id="inf48">
<mml:math id="m56">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>2.5&#xa0;m (%)</th>
<th align="center">
<inline-formula id="inf49">
<mml:math id="m57">
<mml:mo>&#x3c;</mml:mo>
</mml:math>
</inline-formula>7.5&#xa0;m (%)</th>
<th align="center">Comp (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">3</td>
<td align="center">[64,32,8]</td>
<td align="center">192 &#xd7; 384</td>
<td align="center">1.732</td>
<td align="center">2.983</td>
<td align="center">74.23</td>
<td align="center">87.11</td>
<td align="center">75.34</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">[48,16,8]</td>
<td align="center">384 &#xd7; 768</td>
<td align="center">1.640</td>
<td align="center">2.503</td>
<td align="center">78.92</td>
<td align="center">92.76</td>
<td align="center">79.68</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">[64,16,8]</td>
<td align="center">384 &#xd7; 768</td>
<td align="center">1.610</td>
<td align="center">2.174</td>
<td align="center">80.19</td>
<td align="center">95.33</td>
<td align="center">82.89</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">[64,32,8]</td>
<td align="center">384 &#xd7; 768</td>
<td align="center">
<bold>1.597</bold>
</td>
<td align="center">
<bold>2.036</bold>
</td>
<td align="center">
<bold>82.68</bold>
</td>
<td align="center">
<bold>96.48</bold>
</td>
<td align="center">
<bold>84.32</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>In this paper, we have proposed a satellite image stereo matching network based on attention mechanism (A-SATMVSNet). We design a feature extraction module based on triple dilated convolution with attention module to solve the problem of matching holes caused by insufficient extraction of surface features. Furthermore, compared with the traditional weighted average method, we design a novel cost-volume method that integrates attention mechanism to reduce the impact of matching errors to improve the accuracy of matching. As a result, our method achieves SOTA results on TLC SatMVS Dataset, showing better performance than many existing learning-based MVS methods in satellite images domain.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://github.com/WHU-GPCV/SatMVS">https://github.com/WHU-GPCV/SatMVS</ext-link>.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>QW was the project manager. LL conceived and designed the study. YZ and ZW conducted the experiments. LZ, XL, and QW analysed the data. LL and ZW wrote the manuscript. YZ, LZ, XL, and QW revised the manuscript.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barnes</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Shechtman</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Finkelstein</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goldman</surname>
<given-names>D. B.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Patchmatch: A randomized correspondence algorithm for structural image editing</article-title>. <source>ACM Trans. Graph.</source> <volume>28</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1145/1531326.1531330</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bharati</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Podder</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mondal</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Prasath</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Co-resnet: Optimized resnet model for Covid-19 diagnosis from x-ray images</article-title>. <source>Int. J. Hybrid Intelligent Syst.</source> <volume>17</volume>, <fpage>71</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.3233/his-210008</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bleyer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rhemann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Rother</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Patchmatch stereo-stereo matching with slanted support windows</article-title>. <source>Bmvc</source> <volume>11</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L. E.</given-names>
</name>
<name>
<surname>Ramamoorthi</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Deep stereo using adaptive thin volume representation with uncertainty awareness</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>2524</fpage>&#x2013;<lpage>2534</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Collins</surname>
<given-names>R. T.</given-names>
</name>
</person-group> (<year>1996</year>). &#x201c;<article-title>A space-sweep approach to true multi-image matching</article-title>,&#x201d; in <source>Proceedings CVPR IEEE computer society conference on computer vision and pattern recognition</source> (<publisher-name>Ieee</publisher-name>), <fpage>358</fpage>&#x2013;<lpage>363</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Transmvsnet: Global context-aware multi-view stereo network with transformers</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>8585</fpage>&#x2013;<lpage>8594</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Esteban</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Schmitt</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Silhouette and stereo fusion for 3d object modeling</article-title>. <source>Comput. Vis. Image Underst.</source> <volume>96</volume>, <fpage>367</fpage>&#x2013;<lpage>392</lpage>. <pub-id pub-id-type="doi">10.1016/j.cviu.2004.03.016</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Furukawa</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ponce</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Accurate, dense, and robust multiview stereopsis</article-title>. <source>IEEE Trans. pattern analysis Mach. Intell.</source> <volume>32</volume>, <fpage>1362</fpage>&#x2013;<lpage>1376</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2009.161</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Galliani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lasinger</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Schindler</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Massively parallel multiview stereopsis by surface normal diffusion</article-title>. <source>Proc. IEEE Int. Conf. Comput. Vis.</source>, <fpage>873</fpage>&#x2013;<lpage>881</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Rational polynomial camera model warping for deep learning based satellite multi-view stereo matching</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source>, <fpage>6148</fpage>&#x2013;<lpage>6157</lpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Harris</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Stephens</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>1988</year>). &#x201c;<article-title>A combined corner and edge detector</article-title>,&#x201d; in <source>Alvey vision conference</source> <publisher-loc>Manchester, UK</publisher-loc>, <volume>15</volume>, <fpage>10</fpage>&#x2013;<lpage>5244</lpage>.</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hiep</surname>
<given-names>V. H.</given-names>
</name>
<name>
<surname>Keriven</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Labatut</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pons</surname>
<given-names>J.-P.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Towards high-resolution large-scale multi-view stereo</article-title>,&#x201d; in <source>2009 IEEE conference on computer vision and pattern recognition</source> (<publisher-name>IEEE</publisher-name>), <fpage>1430</fpage>&#x2013;<lpage>1437</lpage>.</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Iwamoto</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Unet 3&#x2b;: A full-scale connected unet for medical image segmentation</article-title>,&#x201d; in <source>ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1055</fpage>&#x2013;<lpage>1059</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>P.-H.</given-names>
</name>
<name>
<surname>Matzen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kopf</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ahuja</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.-B.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Deepmvs: Learning multi-view stereopsis</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>2821</fpage>&#x2013;<lpage>2830</lpage>.</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Isensee</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Petersen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zimmerer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jaeger</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Kohl</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <source>nnu-net: Self-adapting framework for u-net-based medical image segmentation</source>. <comment>arXiv preprint arXiv:1809.10486</comment>.</citation>
</ref>
<ref id="B17">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ji</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gall</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Surfacenet: An end-to-end 3d neural network for multiview stereopsis</article-title>,&#x201d; in <source>Proceedings of the IEEE international conference on computer vision</source>, <fpage>2307</fpage>&#x2013;<lpage>2315</lpage>.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jia</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Mvs-t: A coarse-to-fine multi-view stereo network with transformer for low-resolution images 3d reconstruction</article-title>. <source>Sensors</source> <volume>22</volume>, <fpage>7659</fpage>. <pub-id pub-id-type="doi">10.3390/s22197659</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>S.-W.</given-names>
</name>
<name>
<surname>Kook</surname>
<given-names>H.-K.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.-Y.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>M.-C.</given-names>
</name>
<name>
<surname>Ko</surname>
<given-names>S.-J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Parallel feature pyramid network for object detection</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision</source> (<publisher-loc>Munich, Germany</publisher-loc>: <publisher-name>ECCV</publisher-name>), <fpage>234</fpage>&#x2013;<lpage>250</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kutulakos</surname>
<given-names>K. N.</given-names>
</name>
<name>
<surname>Seitz</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>A theory of shape by space carving</article-title>. <source>Int. J. Comput. Vis.</source> <volume>38</volume>, <fpage>199</fpage>&#x2013;<lpage>218</lpage>. <pub-id pub-id-type="doi">10.1023/a:1008191222954</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Laurentini</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>The visual hull concept for silhouette-based image understanding</article-title>. <source>IEEE Trans. pattern analysis Mach. Intell.</source> <volume>16</volume>, <fpage>150</fpage>&#x2013;<lpage>162</lpage>. <pub-id pub-id-type="doi">10.1109/34.273735</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2022a</year>). &#x201c;<article-title>Feature pyramid multi-view stereo network based on self-attention mechanism</article-title>,&#x201d; in <source>2022 the 5th international conference on image and graphics processing</source> (<publisher-name>ICIGP</publisher-name>), <fpage>226</fpage>&#x2013;<lpage>233</lpage>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Dou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>C.-W.</given-names>
</name>
<name>
<surname>Heng</surname>
<given-names>P.-A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>H-Denseunet: Hybrid densely connected unet for liver and tumor segmentation from ct volumes</article-title>. <source>IEEE Trans. Med. imaging</source> <volume>37</volume>, <fpage>2663</fpage>&#x2013;<lpage>2674</lpage>. <pub-id pub-id-type="doi">10.1109/tmi.2018.2845918</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022b</year>). <article-title>Adr-mvsnet: A cascade network for 3d point cloud reconstruction with pixel occlusion</article-title>. <source>Pattern Recognit.</source> <volume>125</volume>, <fpage>108516</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2021.108516</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Shavit</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <source>Wt-mvsnet: Window-based transformers for multi-view stereo</source>. <comment>arXiv preprint arXiv:2205.14319</comment>.</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>T.-Y.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hariharan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Belongie</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Feature pyramid networks for object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Oktay</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Schlemper</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Folgoc</surname>
<given-names>L. L.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Heinrich</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Misawa</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <source>Attention u-net: Learning where to look for the pancreas</source>. <comment>arXiv preprint arXiv:1804.03999</comment>.</citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Pitas</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2000</year>). <source>Digital image processing algorithms and applications</source>. <publisher-name>John Wiley and Sons</publisher-name>.</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: Convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <source>International Conference on Medical image computing and computer-assisted intervention</source> (<publisher-name>Springer</publisher-name>), <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Seferbekov</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Iglovikov</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Buslaev</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shvets</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Feature pyramid network for multi-class land segmentation</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition workshops</source>, <fpage>272</fpage>&#x2013;<lpage>275</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Seitz</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Curless</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Diebel</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Scharstein</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Szeliski</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>A comparison and evaluation of multi-view stereo reconstruction algorithms</article-title>,&#x201d; in <source>2006 IEEE computer society conference on computer vision and pattern recognition (CVPR&#x2019;06)</source> (<publisher-name>IEEE</publisher-name>), <volume>1</volume>, <fpage>519</fpage>&#x2013;<lpage>528</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Seitz</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Dyer</surname>
<given-names>C. R.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Photorealistic scene reconstruction by voxel coloring</article-title>. <source>Int. J. Comput. Vis.</source> <volume>35</volume>, <fpage>151</fpage>&#x2013;<lpage>173</lpage>. <pub-id pub-id-type="doi">10.1023/a:1008176507526</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vanhoucke</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Alemi</surname>
<given-names>A. A.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Inception-v4, inception-resnet and the impact of residual connections on learning</article-title>,&#x201d; in <source>Thirty-first AAAI conference on artificial intelligence</source>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tao</surname>
<given-names>C. V.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>A comprehensive study of the rational function model for photogrammetric processing</article-title>. <source>Photogrammetric Eng. remote Sens.</source> <volume>67</volume>, <fpage>1347</fpage>&#x2013;<lpage>1358</lpage>.</citation>
</ref>
<ref id="B35">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Targ</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Almeida</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lyman</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Resnet in resnet: Generalizing residual architectures</source>. <comment>arXiv preprint arXiv:1603.08029</comment>.</citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Multi-view stereo network with attention thin volume</article-title>,&#x201d; in <source>Pricai 2022: Trends in artificial intelligence: 19th pacific rim international conference on artificial intelligence, PRICAI 2022, Shanghai, China, November 10&#x2013;13, 2022, proceedings, Part III</source> (<publisher-name>Springer</publisher-name>), <fpage>410</fpage>&#x2013;<lpage>423</lpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>Mvster: Epipolar transformer for efficient multi-view stereo</article-title>,&#x201d; in <source>Computer vision&#x2013;ECCV 2022: 17th European conference, Tel Aviv, Israel, October 23&#x2013;27, 2022, proceedings, Part XXXI</source> (<publisher-name>Springer</publisher-name>), <fpage>573</fpage>&#x2013;<lpage>591</lpage>.</citation>
</ref>
<ref id="B38">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Weilharter</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Fraundorfer</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Atlas-mvsnet: Attention layers for feature extraction and cost volume regularization in multi-view stereo</article-title>,&#x201d; in <source>2022 26th international conference on pattern recognition (ICPR)</source> (<publisher-name>IEEE</publisher-name>), <fpage>3557</fpage>&#x2013;<lpage>3563</lpage>.</citation>
</ref>
<ref id="B39">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Mvsnet: Depth inference for unstructured multi-view stereo</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision</source> (<publisher-name>ECCV</publisher-name>), <fpage>767</fpage>&#x2013;<lpage>783</lpage>.</citation>
</ref>
<ref id="B40">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Recurrent mvsnet for high-resolution multi-view stereo depth inference</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>5525</fpage>&#x2013;<lpage>5534</lpage>.</citation>
</ref>
<ref id="B41">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Pyramid multi-view stereo net with self-adaptive view aggregation</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>766</fpage>&#x2013;<lpage>782</lpage>.</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zaharescu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Boyer</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Horaud</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Topology-adaptive mesh deformation for surface evolution, morphing, and multiview reconstruction</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>33</volume>, <fpage>823</fpage>&#x2013;<lpage>837</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2010.116</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Vis-mvsnet: Visibility-aware multi-view stereo network</article-title>. <source>Int. J. Comput. Vis.</source> <volume>131</volume>, <fpage>199</fpage>&#x2013;<lpage>214</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-022-01697-3</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Graphfpn: Graph feature pyramid network for object detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source>, <fpage>2763</fpage>&#x2013;<lpage>2772</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>