<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Signal Process.</journal-id>
<journal-title>Frontiers in Signal Processing</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Signal Process.</abbrev-journal-title>
<issn pub-type="epub">2673-8198</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1405808</article-id>
<article-id pub-id-type="doi">10.3389/frsip.2025.1405808</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Signal Processing</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Sparse camera volumetric video applications. A comparison of visual fidelity, user experience, and adaptability</article-title>
<alt-title alt-title-type="left-running-head">Remde et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frsip.2025.1405808">10.3389/frsip.2025.1405808</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Remde</surname>
<given-names>Christopher</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2396282/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sauer</surname>
<given-names>Igor M.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2070710/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Queisner</surname>
<given-names>Moritz</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2839257/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Charit&#xe9; &#x2013; Universit&#xe4;tsmedizin Berlin</institution>, <institution>Corporate Member of Freie Universit&#x00E4;t Berlin and Humboldt-Universit&#x00E4;t zu Berlin</institution>, <institution>Department of Surgery</institution>, <institution>Campus Charit&#x00E9; Mitte</institution>, <institution>Campus Virchow-Klinikum</institution>, <addr-line>Berlin</addr-line>, <country>Germany</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Humboldt Universit&#xe4;t zu Berlin</institution>, <institution>Cluster of Excellence Matters of Activity</institution>, <addr-line>Berlin</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2334294/overview">Pan Gao</ext-link>, Nanjing University of Aeronautics and Astronautics, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2294145/overview">Gareth W. Young</ext-link>, Trinity College Dublin, Ireland</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2922958/overview">Kang You</ext-link>, Nanjing University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Christopher Remde, <email>christopher.remde@charite.de</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>10</day>
<month>03</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>5</volume>
<elocation-id>1405808</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>03</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>01</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Remde, Sauer and Queisner.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Remde, Sauer and Queisner</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Volumetric video in commercial studios is predominantly produced using a multi-view stereo process that relies on a high two-digit number of cameras to capture a scene. Due to the hardware requirements and associated processing costs, this workflow is resource-intensive and expensive, making it unattainable for creators and researchers with smaller budgets. Low-cost volumetric video systems using RGBD cameras offer an affordable alternative. As these small, mobile systems are a relatively new technology, the available software applications vary in terms of workflow and image quality. In this paper we provide an overview of the technical capabilities of sparse camera volumetric video capture applications and assess their visual fidelity and workflow.</p>
</sec>
<sec>
<title>Materials and methods</title>
<p>We selected volumetric video applications that are publicly available, support capture with multiple <italic>Microsoft Azure Kinect</italic> cameras and run on consumer-grade computer hardware. We compared the features, usability, and workflow of each application and benchmarked them in five different scenarios. Based on the benchmark footage, we analyzed spatial calibration accuracy, artifact occurrence and conducted a subjective perception study with 19 participants from a game design study program to assess the visual fidelity of the captures.</p>
</sec>
<sec>
<title>Results</title>
<p>We evaluated three applications, <italic>Depthkit Studio</italic>, <italic>LiveScan3D</italic> and <italic>VolumetricCapture</italic>. We found <italic>Depthkit Studio</italic> to provide the best experience for novice users, while <italic>LiveScan3D</italic> and <italic>VolumetricCapture</italic> require advanced technical knowledge to be operated. The footage captured by <italic>Depthkit Studio</italic> showed the least amount of artifacts by a large margin, followed by <italic>LiveScan3D</italic> and <italic>VolumetricCapture</italic>. These findings were confirmed by the participants who preferred <italic>Depthkit Studio</italic> over <italic>LiveScan3D</italic> and <italic>VolumetricCapture</italic>.</p>
</sec>
<sec>
<title>Discussion</title>
<p>Based on the results, we recommend <italic>Depthkit Studio</italic> for the highest fidelity captures. <italic>LiveScan3D</italic> produces footage of only acceptable fidelity but is the only candidate that is available as open-source software. We therefore recommend it as a platform for research and experimentation. Due to the lower fidelity and high setup complexity, we recommend <italic>VolumetricCapture</italic> only for specific use-cases where its ability to handle a high number of sensors in a large capture volume is required.</p>
</sec>
</abstract>
<kwd-group>
<kwd>volumetric video</kwd>
<kwd>depth camera</kwd>
<kwd>visual fidelity</kwd>
<kwd>benchmark</kwd>
<kwd>user experience</kwd>
</kwd-group>
<contract-num rid="cn002">390648296</contract-num>
<contract-sponsor id="cn001">Bundesministerium f&#xfc;r Bildung und Forschung<named-content content-type="fundref-id">10.13039/501100002347</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Deutsche Forschungsgemeinschaft<named-content content-type="fundref-id">10.13039/501100001659</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Image Processing</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Most videos today are produced and consumed in a traditional format, where a camera sensor captures a 2D projection of a scene from a fixed perspective. This method lacks spatial depth and confines viewers to a predetermined viewpoint. While this is sufficient for many applications, certain use-cases benefit from a capture method that includes spatial information and allows observers to freely change their perspective (<xref ref-type="bibr" rid="B1">Alain et al., 2023</xref>). For example, instructional volumetric videos for spatially complex processes, such as machine operation, garment construction or even surgical procedures would be able to depict these workflows in their full spectrum. More immersive tele-conferencing systems and virtual site visits could reduce emissions associated with traveling, and photorealistic animated assets for media production could be created with low expenditure. While volumetric videos today are almost exclusively produced in studios, all these use cases call for mobile, low-cost and adaptable capture systems. 3D scanning of static scenes is already a well-established and mature process, widely adopted across various disciplines. Recent advances in this field, such as Neural Radiance Fields (<xref ref-type="bibr" rid="B3">Barron et al., 2022</xref>) or Gaussian Splatting (<xref ref-type="bibr" rid="B13">Kerbl et al., 2023</xref>) have significantly enhanced the fidelity of these scans, pushing them towards photorealism. In static scenes, a single camera can be moved around the subject to produce a dataset. In dynamic scenes however, a dense array of cameras is needed to capture all angles of the scene simultaneously. This increases production costs and limits the production of volumetric video to film studios, research facilities, and large tech companies. To reduce production costs as well as increase accessibility and mobility, methods that require fewer cameras and compute time are essential. 
Currently, the most common method for capturing volumetric video from sparse viewpoints involves the use of RGBD-cameras. These cameras can capture both the color (RGB) and depth (D) value for each pixel and are available at an affordable consumer price point. Moreover, they are compatible with consumer-grade PC hardware, making them suitable for low-cost setups. The emergence of affordable RGBD-cameras, alongside more widely available presentation devices (i.e., virtual reality head-mounted displays, VR-HMDs) have since created a market niche for low-cost volumetric video capture systems. Several commercial and non-commercial research applications are available. These applications provide access to volumetric video production for more creators, due to their low entry barrier both in terms of technical complexity and costs. The purpose of this paper is to provide an overview of affordable and accessible volumetric video capture applications, especially for first-time users. We highlight the strengths and limitations of each system to help readers make informed decisions based on their specific needs. To facilitate the comparison between these systems and their future iterations, we propose a benchmark for evaluating the visual fidelity of sparse camera volumetric video applications.</p>
<p>Although the RGBD camera based workflow remains the most popular method for creating sparse camera volumetric videos, a variety of other experimental approaches exist. The common goal of all methods is to extract the spatial information of a scene from a given optical input. Existing approaches can be sorted into four categories:</p>
<sec id="s1-1">
<title>1.1 Depth sensors</title>
<p>Depth sensors physically capture the spatial information of the scene, most often by measuring the time it takes for light to travel from the camera to the scene and back. Popularized by the <italic>Xbox Kinect</italic> (Microsoft Corporation, 2009), nowadays many affordable RGBD cameras are available from various manufacturers, such as the <italic>Realsense D455</italic> (Intel Corporation, 2024), <italic>Azure Kinect</italic> (Microsoft Corporation, 2020), <italic>Femto Mega/Bolt</italic> (Orbbec, 2023), or <italic>ZED2i</italic> (Stereolabs Inc., 2023). These sensors provide scene depth based on physical measurements. However, the resolution of these sensors is still relatively low, with no consumer model exceeding one megapixel, and the measurements can be distorted by reflective, transparent and emissive objects.</p>
</sec>
<sec id="s1-2">
<title>1.2 Monocular depth estimation</title>
<p>Monocular depth estimation methods aim to estimate the scene depth using only two-dimensional photographs as input. A neural network is trained on a large dataset of 2D photos paired with a depth map, which can then be used to infer a depth map from unseen photos. While recent models such as <italic>ZoeDepth</italic> (<xref ref-type="bibr" rid="B6">Bhat et al., 2023</xref>), <italic>Depth Anything</italic> (<xref ref-type="bibr" rid="B29">Yang et al., 2024</xref>) and <italic>Depth Pro</italic> (<xref ref-type="bibr" rid="B7">Bochkovskii et al., 2024</xref>) perform well on single images, these models are not yet suitable for dynamic sequences as they lack temporal coherence. As the techniques improve, and models targeted towards video monocular depth estimation with higher temporal coherency are developed, depth estimation could complement or replace RGBD sensors.</p>
</sec>
<sec id="s1-3">
<title>1.3 Sparse photogrammetry</title>
<p>Photogrammetric approaches try to exploit recognizable landmarks in the images, which can be used to infer the spatial relationship between different images and camera poses. This process is also the basis of the SfM process, although classical methods require a large amount of input data. Recent advances in this field have shown to produce usable outputs even from sparse input data (<xref ref-type="bibr" rid="B10">Chibane et al., 2021</xref>; <xref ref-type="bibr" rid="B24">Truong et al., 2023</xref>). Although the number of input images required has been greatly reduced, a setup of about 10&#x2013;20 cameras is still needed to cover a captured subject from all angles. Further advances in this area, combined with approaches using very low-cost cameras, such as the <italic>Raspberry Pi camera module</italic>, may make these setups economically available to amateur creators in the future (<xref ref-type="bibr" rid="B8">B&#xf6;nsch et al., 2019</xref>).</p>
</sec>
<sec id="s1-4">
<title>1.4 Foundation models</title>
<p>Foundation models are deep learning networks that have been trained on large datasets in their domain. While these models have been widely used in other applications, such as large language models or image generation, this approach has only recently been applied to scene reconstruction, with the introduction of <italic>Dust3r</italic> (<xref ref-type="bibr" rid="B27">Wang et al., 2024</xref>). Using the learned priors, the model can efficiently fill the data gaps between very sparse input images, allowing full scene reconstruction with as little as two opposing views of the subject. This method has already been applied to dynamic scenes and shows promising results for single-viewpoint videos (<xref ref-type="bibr" rid="B32">Zhang et al., 2024</xref>). However, full dynamic scene reconstruction using multiple camera angles has yet to be demonstrated and the high GPU processing requirements could make this method expensive.</p>
</sec>
<sec id="s1-5">
<title>1.5 Comparisons</title>
<p>Since RGBD based approaches remain the most popular and widespread technology for capturing sparse camera volumetric video at the time of publication, our paper focuses on this technique. While publications which represent the common techniques and challenges in the field of RGBD camera-based volumetric video applications exist (<xref ref-type="bibr" rid="B12">Jin et al., 2024</xref>), to our knowledge, no published work to date has undertaken a comparison of the features, workflow, or visual quality generated with these systems. However, there are several related publications that address segments of this evaluation process: The image quality of RGBD cameras has been well studied (<xref ref-type="bibr" rid="B23">T&#xf6;lgyessy et al., 2021</xref>; <xref ref-type="bibr" rid="B18">Rijal et al., 2023</xref>), but the scope of these evaluations focusses on the performance of only a single device and emphasizes specific technical parameters. The work of <xref ref-type="bibr" rid="B30">Zerman et al. (2019)</xref> and <xref ref-type="bibr" rid="B31">Zerman et al. (2020)</xref> assesses the subjective perception of the effects of compression algorithms and different rendering techniques on volumetric videos. Similarly, <xref ref-type="bibr" rid="B22">Subramanyam et al. (2020)</xref> evaluate the impact of different point cloud compression methods but extend the study environment to display the data on virtual reality (VR) HMD instead of conventional two-dimensional displays. The three degrees of freedom (3DOF) study setup used in this paper was the basis of our study environment. Our subjective assessment methods were inspired by the work of Zerman et al. and Subramanyam et al. There is no literature that defines a benchmarking setup for comparing sparse RGBD camera volumetric video applications. We therefore propose a new benchmark, tuned to challenge the unique capabilities of these systems.</p>
</sec>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Selection criteria of volumetric video capture software</title>
<p>For the evaluation, we focused on free or commercially, publicly available, volumetric video capture applications. All selected applications need to be able to capture images from multiple RGBD cameras, process the sensor data into a coherent spatial and temporal representation, and export the output into a common and widespread file format. The hardware requirements posed by the applications should be able to be fulfilled with widely available consumer-grade components. Including commercial software into a scientific comparison presents challenges for the reproducibility of the results, as access to these applications may be limited and can become unavailable on the market. However, open-source alternatives currently show a notable gap in fidelity compared to commercial solutions. To more accurately represent the state-of-the-art capabilities available today, we included commercial applications in the comparison. Although this article focuses on low-cost systems, we imposed no specific restrictions on the costs of the software applications. To allow readers to assess the affordability of each application we inform about their associated licensing costs (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<p>Variations in image quality across different camera models can significantly impact the quality of the volumetric video produced. Hence, achieving a reliable comparison between different software applications requires the use of an identical camera model across all tests. Among all available options, the Microsoft Azure Kinect was selected due to its universal support across all software applications and its provision of state-of-the-art image quality (<xref ref-type="bibr" rid="B18">Rijal et al., 2023</xref>; <xref ref-type="bibr" rid="B23">T&#xf6;lgyessy et al., 2021</xref>). At the time of writing this paper, the production of the Azure Kinect has been discontinued, but the underlying sensor hardware continues to be manufactured as the <italic>Orbbec Femto Bolt/Mega</italic> (Orbbec 3D Technology International Inc., 2024). These cameras utilize an identical depth sensor and only a slightly modified color sensor compared to the <italic>Azure Kinect</italic> models. The hardware similarities suggest that the results obtained using the <italic>Azure Kinect</italic> are transferable to these newer models.</p>
<p>To identify suitable candidates for comparison, we conducted an extensive internet search. Given the relatively novel and niche nature of the volumetric video market, we were unable to find any repositories, articles or reviews, that offered comprehensive lists of potential software applications. To address this issue, we crawled several internet archives using search terms such as &#x201c;volumetric video,&#x201d; &#x201c;depth sensors,&#x201d; &#x201c;RGBD&#x201d; and &#x201c;4D scanning.&#x201d; Additionally, commercial software applications were identified by searching company databases such as Crunchbase (Crunchbase Inc., 2024), while non-commercial, research and open-source candidates, were located through open source repositories and scientific databases such as Github (Microsoft Corporation, 2024), ArXiv (Cornell University, 2024) and IEEE Xplore (IEEE, 2024). Through this process we identified six candidates which fulfilled our requirements. Listed in no particular order, the candidates are: <italic>Depthkit Studio</italic> (<xref ref-type="bibr" rid="B19">Scatter, 2024</xref>), <italic>SOAR</italic> (Stream Soar, 2023), <italic>EF EVE</italic> (Experimental Foundation, 2023), <italic>LiveScan3D</italic> (Kowalski, Naruniec, Daniluk, 2015), <italic>Brekel Point-Cloud v3</italic> (Brekel, 2024) and <italic>VolumetricCapture (</italic>
<xref ref-type="bibr" rid="B21">Sterzentsenko et al., 2018</xref>). During the benchmarking phase, the commercial candidates <italic>SOAR</italic> and <italic>EF EVE</italic> became permanently unavailable due to restructuring processes in the authoring companies. While we were able to conduct some tests with <italic>Brekel Pointcloud v3</italic>, a major bug prevented the capture of volumetric videos. All software authors were contacted to confirm that the software, or a bug fix, will not become available in the midterm. Therefore, only the candidates <italic>Depthkit Studio</italic>, <italic>LiveScan3D</italic> and <italic>VolumetricCapture</italic> could be included into the comparison.</p>
</sec>
<sec id="s2-2">
<title>2.2 Benchmark</title>
<p>To establish a standardized framework for assessing the volumetric video quality of each software application, we captured footage of five predefined benchmark scenes using three different camera configurations. These scenes were selected to represent varying capture conditions, covering a range of spatial complexities and adaptability requirements. Rigorous control measures were implemented to ensure the consistency of the benchmark environment, mitigating the influence from any variable beyond the software application under evaluation. It is important to note that all software applications presented in this paper are capable of producing higher quality captures than those shown in the benchmarks, when capture setups are optimized to their specific needs. In some cases, limitations within the software required adjustments to the physical benchmarking setup, which were accommodated accordingly.</p>
<sec id="s2-2-1">
<title>2.2.1 Hardware</title>
<p>Cost-effective setups have to balance the hardware quantity against its qualitative gains in fidelity. During testing, we found that using four cameras placed at regular intervals around the scene provided sufficient coverage, but some elevated areas could be obstructed. Adding one additional camera above the scene provided more seamless coverage from all viewing angles. Therefore, we decided to use <bold>five Microsoft Azure Kinect</bold> cameras in all our benchmarks.</p>
<p>Each volumetric capture software poses different requirements on the computing platform used to control, record, and process the captures. The most important difference is the use of a centralized or a distributed capture system. In a centralized system, all cameras are connected to a single computer, which must have sufficient bandwidth to communicate with the cameras and computational power to handle the incoming data streams. This approach requires more specialized and expensive hardware, at the benefit of an overall less complex hardware setup. In a distributed system each camera is connected to its own PC, called a client over a local area network. The clients are controlled by a PC acting as server. With this approach, each individual client only requires a small amount of processing power. Multiple lower-end PCs might be easier to acquire than a single high-end PC, however, the complexity of this networked approach results in a more difficult user experience. We based our centralized capture PC setup on the requirements (<xref ref-type="table" rid="T1">Table 1</xref>) of <italic>Depthkit Studio</italic> (Scatter, 2023), as this candidate has the highest hardware requirements. It is equipped with an <italic>AMD Ryzen 9 5950X CPU</italic> (Advanced Micro Devices Inc., 2020), <italic>Geforce RTX 3090 GPU</italic> (Nvidia Corporation, 2020), 64GB of RAM, and a 2TB M.2 SSD storage. Connectivity was provided by two onboard USB 3.2 ports and gigabit LAN, extended by a Startech PCIe (Startech, <ext-link ext-link-type="uri" xlink:href="http://Startech.com">Startech.com</ext-link> Ltd., 2017) extension card that provided four additional USB 3.2 ports. This PC was also used as the server for the distributed setup. For the clients, we used various laptop models, which all far exceeded the minimum specifications required for the clients (<xref ref-type="table" rid="T1">Table 1</xref>). All PCs were connected using a gigabit LAN switch and CAT6 cables. 
We verified that all candidates were compatible with this hardware setup and were able to smoothly capture the sensor data at their maximum framerate. The cameras were connected via 5&#xa0;m and 10&#xa0;m active USB 3.2 extension cables to allow for a larger placement range. For lighting, we used four consumer-grade LED-Panels from Elgato with up to 2,800 Lumens and an adjustable color temperature range up to 7000K.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Overview of features for all candidates. Not all information about the supported cameras, number of cameras and export formats could be verified.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Software</th>
<th align="left">Depthkit studio</th>
<th align="left">VolumetricCapture</th>
<th align="left">LiveScan3D</th>
<th align="left">Brekel PointCloud v3</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Supported camera models</td>
<td align="left">Microsoft Azure Kinect, Orbbec Femto Bolt</td>
<td align="left">Microsoft Azure Kinect, Intel Realsense D415</td>
<td align="left">Microsoft Azure Kinect</td>
<td align="left">Microsoft Kinect<break/>Azure Kinect<break/>Kinect v2<break/>Orbbec: Astra, Astra Pro, Embedded S, Femto Bolt, Femto Mega. StereoLabs ZED 2, Intel RealSense (no specified model)<break/>Generic 2D Webcams</td>
</tr>
<tr>
<td align="left">Maximum number of connected cameras</td>
<td align="left">10</td>
<td align="left">No limit specified, but at least up to 16</td>
<td align="left">10</td>
<td align="left">15</td>
</tr>
<tr>
<td align="left">Application architecture</td>
<td align="left">Centralized</td>
<td align="left">Distributed</td>
<td align="left">Centralized or distributed</td>
<td align="left">Centralized or distributed</td>
</tr>
<tr>
<td align="left">Operating System</td>
<td align="left">Microsoft Windows</td>
<td align="left">Microsoft Windows</td>
<td align="left">Microsoft Windows</td>
<td align="left">Microsoft Windows</td>
</tr>
<tr>
<td align="left">Minimum PC system requirements</td>
<td align="left">For up to 6 sensors: Intel Core i9-11900K or above, NVIDIA RTX 4070 Ti GPU or above, 64GB of RAM, six USB 3.1 ports</td>
<td align="left">Server: Intel i7 7700k or above, Nvidia GTX 960 or above, 16GB Ram, Gigabit-Lan<break/>Clients: Intel i5 or above, Integrated graphics, 8&#xa0;GB Ram, Gigabit-Lan, one USB 3.1 port</td>
<td align="left">Not specified</td>
<td align="left">For one sensor: Current Intel i5 or above, Geforce GTX 1070 or above, 8&#xa0;GB Ram, one USB 3.1 port<break/>Requirements for more sensors are not specified</td>
</tr>
<tr>
<td align="left">Spatial calibration Method, hardware required</td>
<td align="left">Marker based<break/>One or multiple markers printed on paper, attached to a sturdy surface</td>
<td align="left">Geometric structure based: Multiple cardboard moving boxes with accurate dimensions</td>
<td align="left">Marker based<break/>Markers printed on paper and attached to a box</td>
<td align="left">Marker based<break/>A single marker printed on paper, attached to a sturdy surface</td>
</tr>
<tr>
<td align="left">Export modes</td>
<td align="left">Mesh, textures, proprietary format</td>
<td align="left">Pointclouds, proprietary format</td>
<td align="left">Pointclouds, raw color, depth images</td>
<td align="left">Pointclouds, mesh, textures, raw color, depth images</td>
</tr>
<tr>
<td align="left">Export file formats</td>
<td align="left">Mesh: .obj; .ply texture: .jpg; .png proprietary: .mp4; .png</td>
<td align="left">Pointclouds: .ply raw color: .jpg<break/>raw depth: .png</td>
<td align="left">Pointclouds: .ply raw color: .jpg<break/>raw depth: .tiff</td>
<td align="left">Pointclouds: Alembic; Realflow BIN; .bgeo; .e57; .geo; .obj; .pcd; .pda; .pdb; .pdc; .ply; .prt; .ptc; .pts; .ptx; .rib; .xyz, UnityCoder Point Cloud<break/>Mesh: Alembic; Realflow BIN; .obj; .ply; SenseXR; Holo CatchLight<break/>Texture/Raw color: .jpg; .png; .tga; .tiff; .mp4<break/>Raw depth: .exr; .png</td>
</tr>
<tr>
<td align="left">Relative export file size (based on &#x201c;Static Human&#x201d; sequence), highest export settings</td>
<td align="left">Mesh and Texture<break/>773&#xa0;MB/s<break/>Proprietary: 27.75&#xa0;MB/s</td>
<td align="left">Pointclouds: 106&#xa0;MB/s</td>
<td align="left">Pointclouds: 280&#xa0;MB/s</td>
<td align="left">No data available</td>
</tr>
<tr>
<td align="left">Business model</td>
<td align="left">Commercial</td>
<td align="left">Free</td>
<td align="left">Free and open source</td>
<td align="left">Commercial</td>
</tr>
<tr>
<td align="left">Costs</td>
<td align="left">3000 USD per month, subscription</td>
<td align="left">0</td>
<td align="left">0</td>
<td align="left">300 USD (multi-camera version), 150 USD (single camera version), one-time purchase</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Camera arrangements</title>
<p>A major advantage of sparse-camera setups over traditional dense-camera volumetric capture setups is their adaptability and portability. To test for this adaptability, we captured footage from a total of three different camera arrangements. The arrangements were selected to represent typical usage scenarios for volumetric video capture systems (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Camera and lighting arrangements used for the benchmark. The diameter of the orbit used to arrange the cameras is marked as d, the height of the orbit from the capture volume ground plane as h and the height of the centered overhead camera as s. The capture volume is shown as a pink bounding box. From left to right, top to bottom: <italic>Arrangement</italic> <bold>(A)</bold> (d &#x3d; 2.8&#xa0;m, h &#x3d; 1.5&#xa0;m, s &#x3d; 1.9&#xa0;m), <italic>Arrangement</italic> <bold>(B)</bold> (d &#x3d; 4&#xa0;m, h &#x3d; 1.5&#xa0;m, s &#x3d; 2.8&#xa0;m), <italic>Arrangement</italic> <bold>(C)</bold> (d &#x3d; 1.2&#xa0;m, h &#x3d; 0.2&#xa0;m, s &#x3d; 0.8&#xa0;m), <italic>Arrangement</italic> <bold>(D)</bold> (d &#x3d; 4&#xa0;m, h &#x3d; 1.5&#xa0;m).</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g001.tif"/>
</fig>
<sec id="s2-2-2-1">
<title>2.2.2.1 Camera arrangement A</title>
<p>One of the most common use cases for volumetric capture is to capture a single isolated person who is confined to a limited range of motion, such as sitting or standing in one position. The captured person is often in the role of an instructor, presenter, moderator, or theater performer. For this arrangement, we positioned four cameras in an orbital array around the subject, with an orbital diameter of 2.8&#xa0;m and a height of 1.5&#xa0;m above the ground. Each camera was separated by 90&#xb0; on the orbital plane. To allow for clean face captures, a fifth &#x201c;hero&#x201d; camera was placed close to the subject&#x2019;s face, just above eye level (1.9&#xa0;m). All cameras were rotated by 90&#xb0; on their camera axis, as this slightly increases the vertical field of view. LED light panels were placed above each of the cameras in the orbit. The total capture volume for this arrangement is about 1&#xa0;m &#x2a; 1&#xa0;m &#x2a; 2&#xa0;m (length &#xd7; width &#xd7; height) (<xref ref-type="fig" rid="F1">Figure 1A</xref>)</p>
</sec>
<sec id="s2-2-2-2">
<title>2.2.2.2 Camera arrangement B</title>
<p>In cases where more than a single person needs to be captured, or when interaction with larger objects is required, the capture volume must be increased. In <italic>camera arrangement B</italic>, the volume is enlarged to a total size of approximately 2&#xa0;m &#x2a; 2&#xa0;m &#x2a; 2&#xa0;m (L &#xd7; W &#xd7; H), by increasing the camera orbit diameter to 4&#xa0;m. As this is a more general arrangement with no specified position for the subjects, the fifth camera was repositioned centrally to a height of 2.8&#xa0;m above the volume, pointing downwards. The four light panels were again placed above the four cameras in the orbit (<xref ref-type="fig" rid="F1">Figure 1B</xref>).</p>
</sec>
<sec id="s2-2-2-3">
<title>2.2.2.3 Camera arrangement C</title>
<p>For <italic>camera arrangement C</italic>, the capture volume has been reduced to about 0.4&#xa0;m &#x2a; 0.4&#xa0;m &#x2a; 0.4&#xa0;m (L &#xd7; W &#xd7; H). This allows the sensors to be placed closer to a subject, increasing pixel density. This setup is therefore ideal for close-up shots of fine structures, such as hands, hand-object interactions, or faces. At the same time, this presents a challenge to the candidates&#x2019; calibration method, which must also adapt to the smaller volume. The camera orbit was decreased to a diameter of 1.2&#xa0;m at a height of 0.2&#xa0;m. The fifth camera was placed 0.8&#xa0;m above the ground, looking down on it. We focused on hand interactions with this arrangement, so all cameras were mounted on a table to act as a ground plane for the capture volume. Two light panels were placed approximately 1.2&#xa0;m above the scene, facing the capture volume (<xref ref-type="fig" rid="F1">Figure 1C</xref>).</p>
</sec>
<sec id="s2-2-2-4">
<title>2.2.2.4 Camera arrangement D</title>
<p>Due to limitations in the calibration procedure of the <italic>VolumetricCapture</italic> software, a fourth unique arrangement had to be created. <italic>Arrangement D</italic> is identical to <italic>Arrangement B</italic>, but without the fifth overhead camera. This setup was used for all scenes captured with the <italic>VolumetricCapture</italic> software (<xref ref-type="fig" rid="F1">Figure 1D</xref>).</p>
</sec>
</sec>
<sec id="s2-2-3">
<title>2.2.3 Sensor and software settings</title>
<p>While certain settings, particularly the sensor settings, are shared between all tested applications, each software provides a range of modifiers that can improve capture quality. We optimized these settings according to the documentation guidelines and consulted the software&#x2019;s authors to ensure that the configurations were ideal given the capture environments, maximizing fidelity. Since most applications offer a large number of adjustable parameters, only the settings that deviate from the defaults are documented here. For <italic>Depthkit Studio</italic> and <italic>LiveScan3D</italic>, the centralized system mode was used, as it reduced setup time and hardware management complexity. For <italic>VolumetricCapture</italic> we used the distributed system mode as it doesn&#x2019;t support a centralized setup.</p>
<sec id="s2-2-3-1">
<title>2.2.3.1 Common and sensor settings</title>
<p>All <italic>Azure Kinect</italic> units were updated to the firmware version <italic>1.16.110079014</italic> and the <italic>Azure Kinect SDK v1.4.1</italic> was installed on all host PCs. The unbinned near field of view (NFOV) mode of the depth sensor of the Azure Kinect units was used for all volumetric capture applications, giving a depth resolution of 640 &#xd7; 576 pixels and a field of view of 75&#xb0; in the horizontal axis and 65&#xb0; in the vertical axis. This mode was selected based on its favorable balance between depth accuracy and resolution and is also recommended by most volumetric capture applications. The color resolution was set to 1,920 &#xd7; 1,080 pixels, as this was the maximum resolution that could be smoothly handled by the recommended PC hardware specifications. We enabled the Azure Kinect temporal synchronization feature for all applications by connecting the cameras via 3.5&#xa0;mm audio cables in a daisy-chain configuration. Enabling the synchronization requires switching to manual exposure, which guarantees consistent frame timings across all devices. A manual exposure intensity appropriate for the environment was used to avoid under- or overexposure. The powerline frequency setting was set to 50&#xa0;Hz, which matches the power frequency in the country where the benchmark was performed. If this parameter is set incorrectly, lights might show as having a flickering or strobing effect in the captured footage.</p>
</sec>
<sec id="s2-2-3-2">
<title>2.2.3.2 Depthkit studio</title>
<p>
<italic>Depthkit Studio version 0.8.0</italic> and its accompanying Unity package <italic>Depthkit Core Expansion Package Phase 9</italic> were used in the benchmark. The calibration <italic>refinement parameters</italic> (<italic>Spatial Error, Sheer Angle and Temporal stability</italic>) will also need to be adjusted for each calibration pass individually but should firmly lean towards the <italic>Precision</italic> side. For the mesh export settings, the Mesh Density parameter has been set to a value of 200, Depth Bias compensation to 7&#xa0;mm, Surface Infill to 0 and Surface Smoothing to 5&#xa0;mm. For the texture export settings, the texture Blend parameter was set to 1, Texture Spill Correction Intensity to 44 and Texture Spill Correction Feather was set to 0.7.</p>
</sec>
<sec id="s2-2-3-3">
<title>2.2.3.3 LiveScan3D</title>
<p>
<italic>LiveScan Pre-Release Build v.1.2alpha1</italic> from the <italic>BuildingVolumes</italic> repository has been used during the benchmarking process. We built a calibration cube according to the instructions and used the <italic>Calibration_Cube_4S_A4.txt</italic> preset for the configuration. The <italic>Depth Map Filter</italic> was enabled for all cameras and set to a value of 5.</p>
</sec>
<sec id="s2-2-3-4">
<title>2.2.3.4 VolumetricCapture</title>
<p>
<italic>VolumetricCapture v5.0.0</italic> was used for benchmarking. <italic>VolumetricCapture</italic> relies on several sub-dependencies that need to be installed alongside the main application. <italic>RabbitMQ v3.12.13</italic> was used, as well as <italic>Erlang Compiler v25.2.3</italic> and <italic>Python 3.7</italic>. We note that it is important that only <italic>Python 3.7</italic> is installed on the host machine, and to follow the instructions in the <italic>installations.txt</italic> file, instead of the automatic installation during the configuration of the calibration software.</p>
</sec>
</sec>
<sec id="s2-2-4">
<title>2.2.4 Scenes</title>
<p>Four benchmarking scenes with differing camera arrangements were captured. The duration of each scene was targeted to be approximately 15&#x2013;20&#xa0;s. Due to limitations in the calibration procedure of <italic>VolumetricCapture</italic>, it was not possible to use it with camera arrangements other than <italic>arrangement D</italic>. This arrangement is not suitable for close-up scenes, therefore the <italic>Hand</italic> and <italic>Hand Interaction</italic> scenes could not be captured for this candidate. <xref ref-type="table" rid="T2">Table 2</xref> provides a comprehensive overview of the scenes with their corresponding arrangement and candidate:</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Benchmark scene, camera arrangement and candidate correlation.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Software/Scenes</th>
<th align="left">Static person</th>
<th align="left">Dynamic person</th>
<th align="left">Object interaction</th>
<th align="left">Hand</th>
<th align="left">Hand<break/>Interaction</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Depthkit Studio</td>
<td align="left">Arrangement A</td>
<td align="left">Arrangement A</td>
<td align="left">Arrangement B</td>
<td align="left">Arrangement C</td>
<td align="left">Arrangement C</td>
</tr>
<tr>
<td align="left">LiveScan</td>
<td align="left">Arrangement A</td>
<td align="left">Arrangement A</td>
<td align="left">Arrangement B</td>
<td align="left">Arrangement C</td>
<td align="left">Arrangement C</td>
</tr>
<tr>
<td align="left">VolumetricCapture</td>
<td align="left">Arrangement D</td>
<td align="left">Arrangement D</td>
<td align="left">Arrangement D</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">Brekel PointCloud v3</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s2-2-4-1">
<title>2.2.4.1 Static person</title>
<p>With this scene, our goal was to provide ideal and non-challenging conditions for the candidates, that would result in captures with high video quality. <italic>Camera arrangement A</italic> was used to maximize the sensor coverage. The scene shows a single person standing upright with little body motion.</p>
</sec>
<sec id="s2-2-4-2">
<title>2.2.4.2 Dynamic person</title>
<p>This scene corresponds to <italic>Static Person</italic> in the general setup and use of <italic>camera arrangement A</italic>, but the subject exhibits much more physical movement, particularly through their hands and upper body. These conditions enable us to test the candidate&#x2019;s ability to handle fast movements and are expected to result in some artifacts.</p>
</sec>
<sec id="s2-2-4-3">
<title>2.2.4.3 Object interaction</title>
<p>This scene presents a challenge to the candidate&#x2019;s ability to capture complex interactions between objects within larger volumes. <italic>Camera arrangement B</italic>, with the largest capture volume, was employed for this scene. This scene shows a small choreography of a person sitting on a chair, who then stands up, walks around the chair and puts on a jacket. They then proceed to lift a small box from the floor and leave the capture volume with it. The presence of various objects in the scene results in more obstructions, leading to fewer cameras observing the same parts of the scene, decreasing data density. Moreover, the larger capture volume reduces data density and the scene is therefore expected to be of lower quality in general compared to other scenes.</p>
</sec>
<sec id="s2-2-4-4">
<title>2.2.4.4 Hand</title>
<p>This scene uses <italic>camera arrangement C</italic>, with a relatively small capture volume. As the increased pixel density allows for more granular objects to be captured, we show a single hand in motion, making different gestures. This scene allows us to test the adaptability and scalability of the candidates to more extreme camera arrangements and their ability to visualize finer structures.</p>
</sec>
<sec id="s2-2-4-5">
<title>2.2.4.5 Hand interaction</title>
<p>The setup in this scene is based on the <italic>Hand</italic> scene but introduces a more complex hand-object interaction: Using a few wooden blocks, the two hands build a small structure. Candidates are challenged by the increased complexity of the scene, coupled with the presence of fine structures.</p>
</sec>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Visual fidelity</title>
<p>Human perception of any media is a highly complex, multidimensional, and subjective experience. Analyzing and rating the overall quality and effect of a specific medium therefore remains a challenging task. In order to keep the subjective and objective evaluation of the captured benchmarks within a manageable context, we decided to rate the footage purely on the basis of visual fidelity. The fidelity of a given medium describes its ability to mimic the source scene as closely as possible. To produce footage with high visual fidelity, sparse camera volumetric video software needs to address sensor errors caused by the hardware itself and fuse multiple camera perspectives into a single coherent image while working with relatively little information compared to dense-camera setups. In order to assess the fidelity of the captured benchmarks, we use both an objective and a subjective approach. Some features of the fidelity can be assessed objectively, such as the accuracy of the spatial calibration and occurrence of certain artifacts. The overall image fidelity, which is the collective effect of many known and unknown factors, remains difficult to assess objectively. For this reason, we conducted a subjective perception study in which participants were asked to rate the fidelity of the candidates on a comparative basis.</p>
<sec id="s2-3-1">
<title>2.3.1 Spatial calibration</title>
<p>All volumetric video capture software needs to fuse the image data from multiple sensors into a single consistent representation. At the basis of this process is the transformation of the independent local coordinate system of each sensor into a shared global coordinate system. This process is commonly known as spatial calibration. The final image quality of a volumetric video quickly degrades if the calibration contains even small errors and is therefore critical for fidelity. A variety of approaches have been developed, often using calibration reference objects with known dimensions and features. <xref ref-type="bibr" rid="B5">Beck and Froehlich (2015)</xref> proposes a checkerboard-marker based calibration approach, where the color and depth pixels of an individual sensor are directly mapped into a joint coordinate system. <xref ref-type="bibr" rid="B20">Sterzentsenko et al. (2020)</xref> utilize a physical geometric structure in combination with shape analysis to estimate sensor poses. We want to familiarize the reader with the approaches used by the candidates, before analyzing the specific implementations.</p>
<sec id="s2-3-1-1">
<title>2.3.1.1 Marker based calibration</title>
<p>Marker based calibration is one of the most widely used calibration approaches. It involves the use of two-dimensional fiducial markers. Common marker formats include ArUco or checkerboard patterns. If the dimensions of the marker and the intrinsic parameters of the camera are known, the relative transformation (position, rotation and scale) between the camera and the marker can be estimated. If two or more sensors can see a marker at the same time, the relative transformation between the sensors can be measured and a shared coordinate system can be established between the sensors. Additional strategies need to be employed in configurations where not all cameras can observe the marker simultaneously. The marker can either be moved from one camera pair to another in a daisy-chained style, or a structure where markers are visible from any angle can be used. In general, the more observations of a marker at different positions within a captured volume exist, the better the calibration can be estimated.</p>
</sec>
<sec id="s2-3-1-2">
<title>2.3.1.2 Structure based calibration</title>
<p>Structure based calibration methods harness the ability of depth-sensors to directly capture three-dimensional data of a scene. A structure of known dimensions is constructed and placed in the center of the capture volume, so that all sensors can observe it. The three-dimensional shape of the structure is then searched for within the depth sensor image. The orientation and position of the structure relative to the sensor can be used to infer the position of each sensor. For this method to work, it is important that the structure looks unique from all perspectives, otherwise a false match can occur.</p>
<p>The calibration process is a key component of any volumetric capture workflow and must be performed each time a camera is moved. Accordingly, this does not only affect the final quality, but the convenience of the calibration workflow is also an important usability factor. To quantify the spatial calibration quality of the candidates, the dimensions of captured objects were compared to their known physical dimensions. Since the Azure Kinect provides depth data in metric units, the measurements can be taken directly from the exported sequences. For each of the sequences <italic>Dynamic Person</italic>, <italic>Object Interaction</italic> and <italic>Hand</italic>, we measured the dimensions of the same objects on the X, Y and Z-axes of the Cartesian coordinate system over multiple frames. The values for all axes and samples are averaged into a single value for each scene and candidate. Additionally, we provide the minimum and maximum deviation measured in each scene.</p>
</sec>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Artifacts of volumetric video</title>
<p>To create the final three-dimensional image, the applications need to fuse observations from many different cameras and sensors into a unified representation. Due to imperfections in the sensor hardware, spatial calibration or post-processing, flaws and imperfections are introduced into the final image. These are commonly referred to as <italic>artifacts</italic>. We visually inspect the entire benchmark sequences of the candidates for the occurrence of artifacts and describe their occurrence rate, as well as their intensity. In addition, we ask which artifacts dominate the visual appearance of each candidate. To objectively measure the occurrence and intensity of artifacts in volumetric captures, we first need to define the different types of artifacts and their appearance. To achieve higher fidelity, filtering strategies or data refinement can be employed. However, these processes themselves can also introduce new artifacts. While artifacts generally result in a lower image fidelity, they might be perceived differently depending on the use case. Artifacts can be desirable in videos used for artistic contexts, such as games, or Virtual reality experiences, but a strong adherence to the ground truth is needed for other use cases, such as medical training, or documentary films.</p>
<p>
<xref ref-type="fig" rid="F2">Figure 2</xref> shows a non-exhaustive collection of the visually most prominent artifacts in the captured benchmark footage, which are described in more detail below. For evaluation purposes we distinguish between the RGBD camera artifacts, data fusion artifacts and visualization artifacts.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Examples of different types of artifacts. <bold>(A)</bold> Depth noise on a flat wall, <bold>(B)</bold> Holes and missing pixels, <bold>(C)</bold> Flying pixels, <bold>(D)</bold> Incorrectly projected color, <bold>(E)</bold> Spatial calibration error, <bold>(F)</bold> Overlap, <bold>(G)</bold> Color mismatch, <bold>(H)</bold> Point cloud (left) and mesh and texture (right) renderings.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g002.tif"/>
</fig>
</sec>
<sec id="s2-3-3">
<title>2.3.3 RGBD camera artifacts</title>
<p>RGBD cameras, such as the Azure Kinect used in this benchmark, combine multiple different optical sensors that need to work together precisely to produce the combined color and depth data streams. While digital color cameras are a well-established technology, depth-sensing cameras are relatively new as a commodity technology. There are different technologies to estimate the depth of a scene, but we focus on the artifacts caused by the Azure Kinect&#x2019;s near-infrared Time-of-Flight (ToF) technique:</p>
<sec id="s2-3-3-1">
<title>2.3.3.1 Depth noise</title>
<p>Like traditional RGB cameras, ToF sensors also suffer from image noise. This noise is most visible as a high frequency jitter of pixels along the depth axis of the sensor. For the Azure Kinect, this jitter can range from 1 to 8&#xa0;mm (<xref ref-type="bibr" rid="B18">Rijal et al., 2023</xref>) and increases with the distance from the captured objects. In addition to affecting the precision of the depth measurements, this noise is a highly visible artifact in any video footage captured by this system. The noise can be reduced by temporal filters, such as adopted versions of the Kalman Filter (<xref ref-type="bibr" rid="B2">Amamra and Aouf, 2018</xref>).</p>
</sec>
<sec id="s2-3-3-2">
<title>2.3.3.2 Holes or missing pixels</title>
<p>Under certain conditions, the ToF sensor can&#x2019;t correctly measure the distance in parts of the image, resulting in gaps or holes in parts of the scene. Incorrect measurements can be caused by multipath interference, materials absorbing the infrared laser illumination, or objects being too close or too far from the sensor. Deep neural networks, which have been trained on RGBD image sets can provide a possible solution to this problem (<xref ref-type="bibr" rid="B33">Zhang and Funkhouser, 2018</xref>). These networks can complete the depth maps and fill any remaining holes but can also introduce new artifacts and hallucinations.</p>
</sec>
<sec id="s2-3-3-3">
<title>2.3.3.3 Flying pixels</title>
<p>When depth data is incorrectly placed on the depth axis, pixels appear to float or fly around in the capture volume. Sometimes these are just discrete outlier pixels that look like floating particles. Often, however, these pixels appear more systematically between two objects that are in front of each other. They seem to connect the objects like glue (<xref ref-type="bibr" rid="B23">T&#xf6;lgyessy et al., 2021</xref>). This artifact is particularly present on Azure Kinect devices and is likely caused by errors or inaccuracies in the depth map generation algorithm. Most isolated flying pixels can be filtered by using statistical outlier detection. Flying pixels that occur systematically between two objects can be removed using an erosion filter, which removes all pixels around these objects on the XY image plane.</p>
</sec>
</sec>
<sec id="s2-3-4">
<title>2.3.4 Data fusion artifacts</title>
<p>Because each camera in a volumetric video setup observes a different perspective of the scene, the volumetric video capture system has to fuse all these perspectives into a coherent representation that accurately represents the ground truth. The fusion algorithm needs to deal with several possible artifacts:</p>
<sec id="s2-3-4-1">
<title>2.3.4.1 Overlapping data</title>
<p>Due to calibration or sensor inaccuracies, data points that are observed by two or more cameras simultaneously will never be perfectly aligned, which results in overlapping. While larger geometric inaccuracies should be addressed with better calibration methods, small overlapping regions can be masked by depth fusion algorithms (<xref ref-type="bibr" rid="B15">Meerits et al., 2018</xref>; <xref ref-type="bibr" rid="B16">Newcombe et al., 2011</xref>). Similarly, overlapping regions in the color data can be fused by texture fusion algorithms (<xref ref-type="bibr" rid="B26">Waechter et al., 2014</xref>).</p>
</sec>
<sec id="s2-3-4-2">
<title>2.3.4.2 Color mismatch</title>
<p>Even with perfect spatial calibration and no overlapping data, differences in the color sensor data from two cameras can create visual seams. Due to differences in the hardware, color space, exposure, ISO or white balance between sensors, this is often unavoidable to some extent but can be dealt with by color matching (<xref ref-type="bibr" rid="B26">Waechter et al., 2014</xref>), band separation (<xref ref-type="bibr" rid="B4">Baumberg, 2002</xref>) and smoothing between the two perspectives near the seam.</p>
</sec>
<sec id="s2-3-4-3">
<title>2.3.4.3 Incorrectly projected color</title>
<p>As the color and depth cameras are in physically different locations, they observe slightly different perspectives of the scene. To fuse both image modalities into a single unified coordinate system, the lens distortion (intrinsics) and the orientation of the sensors relative to each other (extrinsics) must be precisely measured and corrected for. Errors in this calibration process will result in an offset in the color data projected onto the depth pixels. For example, parts of the foreground of a scene might appear projected onto the background. Another possible cause of misprojection is when larger areas of the depth map are missing. In this case, the color information might get incorrectly projected onto nearby geometry instead.</p>
</sec>
</sec>
<sec id="s2-3-5">
<title>2.3.5 Visualization format</title>
<p>The data captured by the volumetric system needs to be quantized into a format that can be used for storage and playback. Classical representation formats for three-dimensional data in computer graphics include <italic>point clouds</italic> and <italic>meshes</italic>, which are also used by the candidates. Each format can produce specific artifacts:</p>
<sec id="s2-3-5-1">
<title>2.3.5.1 Point cloud-based artifacts</title>
<p>Point clouds consist of many discrete, colored points that are located in a three-dimensional coordinate system. The size of the points must be adjusted in relation to the distance of the point of observation to create the appearance of a continuous shape. This illusion quickly breaks down when the distance is changed and can therefore result in artifacts with a patchy appearance. Additionally, volumetric video rendered as a point cloud can appear noisy, as it is composed of many discrete objects. Rendering the points as splats, where the transparency of the points increases towards its edges, can result in smoother looking visualizations, but is not yet widely supported.</p>
</sec>
<sec id="s2-3-5-2">
<title>2.3.5.2 Mesh-based artifacts</title>
<p>Mesh-based formats describe 3D objects as continuous surfaces consisting of many small polygons. This format has the advantage of being visually smoother looking, as well as taking advantage of the high-resolution color texture captured by the sensors. Surface reconstruction algorithms are used to create a mesh from the point cloud or depth data, but these require a certain level of information density. Regions of the volumetric image that cannot provide this density, or contain structures that are too thin, may be missing from the mesh. When this happens, the color texture cannot be projected onto the missing geometry and may be incorrectly projected onto other parts of the model.</p>
</sec>
</sec>
<sec id="s2-3-6">
<title>2.3.6 Perception study setup</title>
<p>The objective visual analysis can only capture certain factors that contribute to the fidelity of a video at a technical level but cannot show how the fidelity of a candidate might be perceived by an audience. For this reason, a subjective fidelity study was conducted to evaluate which candidates are perceived to possess higher fidelity. The design of the study is based on the ITU-T P.910 2022 recommendation <italic>Subjective video quality assessment methods for multimedia applications (ITU-T, 2022)</italic>. As the goal of this study is to compare the fidelity of the candidates relative to each other, and not in a broader context, we used a pair-comparison method (P.910 2022 section 7.4), where participants judge which element in a pair of sequences is preferred. We complemented the pair-comparison method with a simultaneous presentation (SP) (P.910 2022 Annex C) of two sequences from different candidates to facilitate the decision process for participants. This accounts for the fact that volumetric video is a relatively unknown medium with unfamiliar visual patterns and artifacts. The recommendation demands to show the sequences on a traditional two-dimensional display. However, we argue that the experience of the spatial dimension is a crucial factor in the perception of a volumetric medium. Therefore, we implemented the study into an Extended Reality (XR) environment, while keeping other presentation parameters as specified in the recommendation, similar to <xref ref-type="bibr" rid="B22">Subramanyam et al. (2020)</xref>. We set up a three-dimensional study environment within the <italic>Unity3D</italic> game engine (Unity Technologies, 2023). As specified in the ITU-T P.910 recommendation, the environment is kept in a neutral gray, except for a blue grid on the virtual floor, to facilitate the navigation and orientation for the participants. 
Two podiums, which are positioned about 2&#xa0;m in front of the participants, act as playback locations for the volumetric video. The position and scale of the volumetric videos were adjusted in such a way that both videos are fully visible at the same time, without requiring the user to turn their head. Participants were asked not to leave this centered position during the study but were allowed to move their head in all dimensions. The <italic>Depthkit Expansion Package Phase 10</italic> (<xref ref-type="bibr" rid="B19">Scatter, 2024</xref>), included with <italic>Depthkit Studio</italic> was used to play back the scenes for this candidate. As the other candidates do not provide a native playback solution, we used the open-source volumetric video playback solution <italic>Unity Geometry Sequence Streamer</italic> (<xref ref-type="bibr" rid="B9">BuildingVolumes, 2023</xref>). During the study, all benchmark sequences were shown. The <italic>Static Person</italic>, <italic>Dynamic Person</italic> and <italic>Object Interaction</italic> sequences were compared across all candidates, while the <italic>Hand</italic> and <italic>Hand Interaction</italic> scenes were only compared across the candidates <italic>LiveScan3D</italic> and <italic>Depthkit Studio</italic>. In each sequence, every candidate was paired with every other possible candidate. All candidate pairings were shown twice, with the podium position (left or right) swapped on the second viewing. After one sequence pair finished playing, the participants could interactively vote for their preferred sequence within the XR study environment or choose to watch the sequence again once. Before the participants began the study, they were shown a training sequence, which was not included in the benchmark sequences, to familiarize them with the study procedure and test conditions. 
Participants were asked to vote solely based on visual fidelity, trying to avoid any bias stemming from the aesthetics or stylization of the footage. The study was conducted with a <italic>Meta Quest 3</italic> headset (Meta Platforms, Inc., 2023). The full dataset from the study, along with the code used for data analysis and visualization is provided.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 Candidates</title>
<sec id="s3-1-1">
<title>3.1.1 Features</title>
<p>Although all candidates share a certain set of core features required for volumetric video capture, the features beyond the required functions vary significantly. <xref ref-type="table" rid="T1">Table 1</xref> provides a comprehensive overview for most of the candidates&#x2019; features. We were able to test the majority of these features during our benchmark and evaluation phase, but not all features could be verified. This especially applies to the list of supported camera models, the maximum number of cameras and the export formats. The feature list was partially taken from the available documentation and was manually completed during the evaluation process. We recommend reading <xref ref-type="table" rid="T1">Table 1</xref> for full information on the feature set of the candidates.</p>
<p>
<italic>Brekel Pointcloud v3</italic> and <italic>Depthkit Studio</italic> are both commercial applications. While <italic>Depthkit Studio</italic> is distributed in a subscription model for 3000 USD per month, access to the open beta of <italic>Brekel Pointcloud v3</italic> can be purchased for a one-time charge of 300 USD. <italic>VolumetricCapture</italic> and <italic>LiveScan3D</italic> are freely available on GitHub<xref ref-type="fn" rid="fn2">
<sup>2</sup>
</xref>
<sup>,</sup>
<xref ref-type="fn" rid="fn3">
<sup>3</sup>
</xref>, but only the code base of <italic>LiveScan3D</italic> is open-sourced.</p>
<p>
<italic>LiveScan3D</italic> and <italic>Depthkit Studio</italic> support capture with up to ten <italic>Azure Kinect</italic> sensors at the time of writing. <italic>Brekel Pointcloud v3</italic> supports a wide sensor range, such as the <italic>Kinect v1/v2/Azure</italic>, <italic>Orbbec Astra</italic> series, <italic>Intel Realsense</italic> series and <italic>StereoLabs ZED</italic> series. <italic>VolumetricCapture</italic> supports the <italic>Intel Realsense D415</italic> in addition to the <italic>Azure Kinect</italic>. It allows recordings with at least sixteen simultaneous sensors due to its strictly distributed software architecture, where each sensor is connected to its own host PC. <italic>LiveScan3D</italic> and <italic>Brekel Pointcloud v3</italic> can be operated in either a centralized mode, where all sensors are connected to the same PC, or the distributed mode. <italic>Depthkit Studio</italic> operates only in a centralized mode, which requires a capable host machine. At the same time, only <italic>Depthkit Studio</italic> allows post-processing of the captured video and export as a textured mesh sequence. The other applications export the video as a nearly unprocessed pointcloud sequence.</p>
</sec>
<sec id="s3-1-2">
<title>3.1.2 User experience</title>
<p>Due to the complexity and novelty of volumetric capture systems, a solid user experience and comprehensive documentation are the foundation for successful volumetric captures. We evaluated the availability and quality of documentation, the usability experience of the graphical user interface (GUI), and at the stability of the system. Particular attention was directed to the spatial calibration methods, which are one of the most time-consuming tasks in the capture pipeline (<xref ref-type="sec" rid="s3-2-1">Section 3.2.1</xref>).</p>
<p>
<bold>Depthkit Studio</bold> provides extensive and comprehensive documentation resources in the form of a website, video tutorials and a community forum. We found the GUI to be intuitive and well-structured and didn&#x2019;t experience any crashes or errors. The software ran smoothly and was easy to set up with the provided installers. <italic>Depthkit Studio</italic> uses a marker-based calibration approach. One or more ArUco marker boards need to be printed out in DIN A3 format and attached to a solid surface. The cameras are calibrated in daisy chained pairs. For each pair, multiple samples of the marker must be taken throughout the capture volume. To capture a sample, the marker must be kept stationary and a sample phase needs to be manually activated for approximately 5&#xa0;s. Due to the number of samples that need to be taken, the calibration routine for five cameras took approximately 15&#x2013;25&#xa0;min to complete. While the material requirements are low and the process works well, the calibration routine took by far the longest compared to the other applications. Not every calibration run produces the desired results and may have to be repeated, resulting in calibration times of up to an hour.</p>
<p>
<bold>LiveScan3D</bold> only provides little documentation, which is scattered throughout the software repository and is therefore difficult to find. We found the GUI to be generally clear and intuitive, although the program did occasionally freeze or crash. For calibration, LiveScan3D uses a marker-based approach, that requires the construction of a multi-marker calibration cube. The calibration cube can be made of different materials and its size can be adjusted for different capture scenarios. This initial construction step is time-consuming and difficult as the dimensions and angles need to be carefully observed. The calibration cube must be placed in a part of the scene that is visible to all cameras and is then automatically recognized by the software. It was sometimes necessary to adjust the lighting for the marker to be recognized. The calibration routine itself takes about 2&#xa0;min. There is also an option to refine the calibration using an iterative closest point algorithm, but this did not reliably improve the calibration quality. While the calibration routine itself is quick, the initial construction step may make it difficult for users without access to laser cutters or 3D printers to achieve a successful calibration.</p>
<p>
<bold>VolumetricCapture</bold> provides robust online documentation and support on its Github repository page. Of all the applications, <italic>VolumetricCapture</italic> offers the most sophisticated approach to the distributed architecture. The clients can be run completely headless, with no peripherals and no direct interaction with the clients other than physically turning them on and off. To configure the distributed system, multiple sub-programs, ports and services needed to be set up for each client PC. We found the GUI rather difficult to use, due to the complex layout and many non-functioning elements. Disconnections and crashes were regular problems and could only be resolved by restarting the application. <italic>VolumetricCapture</italic> is the only candidate to employ a structure-based calibration approach. The structure consists of four <italic>IKEA J&#xe4;ttene</italic> moving boxes, which have been discontinued in production. Due to the dimensions of the box being prescribed, they had to be manually reconstructed from flat cardboard. The calibration routine is not included with the binaries and must be downloaded and installed via a Python script. We had to implement workarounds to run the script successfully. For non-technical users, this setup step can be particularly difficult. The calibration routine itself can be performed in about 2&#xa0;min, including the structure setup, but often fails and needs to be repeated multiple times.</p>
<p>
<bold>Brekel PointCloud v3</bold> comes with an installer that makes initial setup easy, and comprehensive documentation in the form of an offline PDF document. The wide range of supported features results in a sometimes cluttered and overloaded, but well-structured interface. Although the application ran smoothly and without crashes, we were unable to capture benchmark footage with this candidate due to a bug in the calibration process. We have confirmed the existence of this bug with the author of the application to rule out operational errors on our part. Brekel Pointcloud v3 is advertised as being in a beta version on the manufacturer&#x2019;s website.</p>
</sec>
</sec>
<sec id="s3-2">
<title>3.2 Visual fidelity</title>
<p>All five scenes were successfully captured for <italic>LiveScan3D</italic> and <italic>Depthkit Studio</italic>. Due to limitations in the calibration approach used for <italic>VolumetricCapture</italic>, only three scenes could be captured. <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref> show a visual overview of all the footage that was captured during the benchmark phase and subsequently used for the analysis and study, including the ground truth captured with the color camera of the <italic>Kinects</italic>. A video showing these scenes in motion is available.<xref ref-type="fn" rid="fn4">
<sup>4</sup>
</xref> In some scenes, green areas might be noticeable. These are the result of the presence of a green screen in the recording studio, which is falsely being projected onto parts of the capture. This is an artifact that would occur in any capture environment but is more noticeable here due to the vibrant color.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Frames of the captured benchmark footage with all candidates. From top to bottom: GT) Ground truth, A) <italic>VolumetricCapture</italic>, B) <italic>LiveScan3D</italic>, C) <italic>Depthkit Studio</italic>. From left to right: <sup>1</sup>) <italic>Static Person</italic> scene, <sup>2</sup>) <italic>Dynamic Person</italic> scene, <sup>3</sup>) <italic>Object Interaction</italic> scene.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Frames of the captured benchmark footage with <italic>camera arrangement C</italic>. From top to bottom: GT) Ground truth, B) <italic>LiveScan3D</italic>, C) <italic>Depthkit Studio</italic>. From left to right: <sup>4</sup>) <italic>Hand</italic> scene, <sup>5</sup>) <italic>Hand Interaction</italic> scene.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g004.tif"/>
</fig>
<sec id="s3-2-1">
<title>3.2.1 Spatial calibration analysis</title>
<p>The spatial calibration was measured for each scene individually by calculating the average deviation in centimeters between the dimensions of virtual objects and their real counterparts. The results show that the size of the capture volume is directly correlated to the calibration error, with a larger capture volume resulting in a larger error (<xref ref-type="fig" rid="F5">Figure 5</xref>). Depthkit Studio consistently showed the least amount of deviation. For the <italic>Dynamic Person</italic> scene, the average deviation was 8&#xa0;mm, for <italic>Object Interaction</italic> 13&#xa0;mm and for <italic>Hand</italic> 2&#xa0;mm. The measured deviation for LiveScan3D is on average twice as large as in <italic>Depthkit Studio</italic>. The <italic>Dynamic Person</italic> scene measured an average deviation of 17&#xa0;mm, the <italic>Object Interaction</italic> scene an average deviation of 21&#xa0;mm and the <italic>Hand</italic> scene an average deviation of 4&#xa0;mm. While the camera arrangement for VolumetricCapture did not change between scenes, the calibration accuracy varied by a large margin, with the <italic>Dynamic Person</italic> scene showing a low accuracy with 26&#xa0;mm of deviation but performing much better in the <italic>Object Interaction</italic> scene with 15&#xa0;mm deviation. This shows that theoretically a competitive calibration accuracy can be achieved with the employed calibration approach, but the accuracy could not be reliably reproduced between different takes in our benchmarking setup, even though repeated attempts were made for each scene.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>The calibration accuracy of the candidates is assessed by calculating the average deviation in centimeters between the dimensions of virtual objects and their real counterparts. The ranges show the minimum and maximum deviation measured over several frames.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g005.tif"/>
</fig>
</sec>
<sec id="s3-2-2">
<title>3.2.2 Artifact analysis</title>
<p>We conducted a thorough examination of all the benchmark footage captured, specifically focusing on identifying visual artifacts. We found that each candidate exhibits a different set of artifacts that, in sum, dominate the overall appearance of the volumetric video. The most prominent artifacts for each candidate are presented along with their frequency of occurrence and intensity.</p>
<p>
<bold>Depthkit Studio</bold> is the only application in the benchmark that renders the captured video in a mesh and texture format. This strategy seems to solve some of the artifacts that point cloud-based approaches exhibit. Videos produced with <italic>Depthkit Studio</italic> appear to be more coherent, contain less noise, and blend overlapping sensor data more elegantly. However, the surface reconstruction algorithm implemented in the application has difficulties catching finer details, such as fingers or thin objects. These parts often disappear completely from the reconstruction. Consequently, the color texture of the missing geometry is sometimes incorrectly projected onto surrounding geometry in the scene. These artifacts were present throughout the entire benchmark footage. A less common color mismatch artifact affects the blend between overlapping sensor data, making the seams more noticeable. Despite these artifacts, the footage overall exhibits a smooth and coherent look, and objects are mostly faithfully reconstructed (<xref ref-type="fig" rid="F6">Figure 6</xref>).</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Prominent artifacts in footage captured with <italic>Depthkit Studio</italic>. <bold>(A)</bold> Missing geometry, <bold>(B)</bold> Incorrectly projected color texture, <bold>(C)</bold> Color mismatch.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g006.tif"/>
</fig>
<p>
<bold>LiveScan3D</bold> renders the captured footage as a point cloud sequence. Compared to the mesh and texture videos of <italic>Depthkit Studio</italic>, the point cloud footage appears grainy and noisy, but can capture finer detail. The accuracy of the spatial calibration is less precise than with <italic>Depthkit Studio</italic> but better than <italic>VolumetricCapture</italic>. This is noticeable throughout all scenes, with more detailed regions appearing to be duplicated and shifted. Objects often show trails of flying pixels, making silhouettes harder to detect. There is jitter and color mismatch in areas where sensor data overlap. <italic>LiveScan3D</italic> images generally show sufficient detail in regions such as the face or hands, and objects appear cohesive, but the artifacts can cause objects to appear slightly distorted and noisy, especially in smaller regions (<xref ref-type="fig" rid="F7">Figure 7</xref>).</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Artifacts in <italic>LiveScan3D</italic> videos: <bold>(A)</bold> Imprecise spatial calibration, <bold>(B)</bold> flying pixels, <bold>(C)</bold> overlapping sensor data.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g007.tif"/>
</fig>
<p>
<bold>VolumetricCapture</bold> shows artifacts similar to <italic>LiveScan3D</italic>, but often more pronounced. Due to the limitations of the calibration routine, the sensors in the <italic>Static Person</italic> and <italic>Dynamic Person</italic> scene had to be positioned further away than in the other applications, reducing the resolution in the captures. In cases where good calibrations could not be achieved, small to medium-sized details appear blurred and distorted, such as the eyes, ears and nose of a face. Incorrectly projected colors appear as large seams that extend throughout the video. Flying pixels are not only visible near objects but are scattered throughout the entire capture volume. The general context and content of the scene is recognizable, and larger objects are correctly reconstructed. However, <italic>VolumetricCapture</italic> also captured some details that were not visible in other candidates; for example, the leg of a chair (<xref ref-type="fig" rid="F8">Figure 8</xref>).</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Artifacts in <italic>VolumetricCapture</italic> footage: <bold>(A)</bold> Imprecise spatial calibration, <bold>(B)</bold> incorrectly projected colors, <bold>(C)</bold> flying pixels (Point size was slightly increased for illustration purposes).</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g008.tif"/>
</fig>
</sec>
</sec>
<sec id="s3-3">
<title>3.3 Subjective study results</title>
<p>For the subjective fidelity perception study, we asked participants to rate the volumetric video solely on its visual fidelity. We selected a cohort familiar with the processes involved in creating and analyzing computer graphics. 19 participants were recruited from a game design graduate program. All 19 participants completed the study, with an average session length of 10&#x2013;15&#xa0;min. Each participant voted 22 times during the study, for a total of 418 votes. In cases where participants chose a different candidate during the repeated presentation of a pair of comparisons, both votes were invalidated. This affected 66 votes or 15.7% of the votes. <italic>Depthkit Studio</italic> and <italic>LiveScan3D</italic> were compared 190 times (160 valid), <italic>Depthkit Studio</italic> and <italic>VolumetricCapture</italic> 114 times (98 valid), <italic>VolumetricCapture</italic> and <italic>LiveScan3D</italic> 114 times (94 valid). To assess statistical significance, we conducted goodness-of-fit tests for each evaluation scenario. As a measure, we use the sum of valid votes that each framework received from all participants. For scenes where all three software candidates produced sequences (<italic>Static</italic> scene<italic>, Dynamic</italic> scene<italic>, Object</italic> scene), Pearson&#x2019;s chi-squared (&#x3c7;<sup>2</sup>) tests were performed. For the <italic>Hand</italic> and <italic>Jenga</italic> scene, where only <italic>LiveScan3D</italic> and <italic>Depthkit Studio</italic> were able to capture footage, Barnard&#x2019;s exact test was used to account for the smaller sample size of votes. In the tests, we compare the observed number of recorded votes to the expected number, assuming an equal distribution of votes for each framework. Our null hypothesis assumes that the recorded numbers of votes follow a random distribution, while the alternative hypothesis is expected to have differing distribution proportions. 
After adjusting p-values to account for multiple comparisons with the Benjamini&#x2013;Hochberg procedure, we rejected the null hypothesis in all scenarios, indicating statistically significant differences in the distribution of recorded votes at a significance level of <bold>&#x3b1;</bold> &#x3d; 5%. The results of the significance tests are presented in <xref ref-type="table" rid="T3">Table 3</xref>. All statistical analyses were performed in Python (version 3.12.1) with the packages <italic>SciPy</italic> (version 1.14.1) for the hypothesis testing and <italic>Pingouin</italic> (version 0.5.5) for multiple comparisons adjustment.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Significance test results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Scenes</th>
<th align="left">Test</th>
<th align="left">p</th>
<th align="left">p Adjusted</th>
<th align="left">Significant</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Static</td>
<td align="left">Pearson chi-squared test</td>
<td align="left">1.36074E-11</td>
<td align="left">6.80369E-11</td>
<td align="left">True</td>
</tr>
<tr>
<td align="left">Dynamic</td>
<td align="left">Pearson chi-squared test</td>
<td align="left">3.71198E-10</td>
<td align="left">9.27995E-10</td>
<td align="left">True</td>
</tr>
<tr>
<td align="left">Object</td>
<td align="left">Pearson chi-squared test</td>
<td align="left">0.014737039</td>
<td align="left">0.014737039</td>
<td align="left">True</td>
</tr>
<tr>
<td align="left">Hand</td>
<td align="left">Barnard&#x2019;s exact test</td>
<td align="left">0.000162039</td>
<td align="left">0.000202548</td>
<td align="left">True</td>
</tr>
<tr>
<td align="left">Jenga</td>
<td align="left">Barnard&#x2019;s exact test</td>
<td align="left">0.000162039</td>
<td align="left">0.000202548</td>
<td align="left">True</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The percentage vote distribution among the candidates is shown in <xref ref-type="fig" rid="F9">Figure 9</xref> and the total number of votes is shown in <xref ref-type="table" rid="T4">Table 4</xref>. When comparing <italic>Depthkit Studio</italic> with <italic>LiveScan3D</italic>, <italic>Depthkit Studio</italic> was preferred, with 90% (144 votes) of the votes. <italic>Depthkit Studio</italic> was slightly less preferred, but also strongly preferred over <italic>VolumetricCapture</italic> with 84% (82 votes) of the votes. <italic>LiveScan3D</italic> was strongly preferred over <italic>VolumetricCapture</italic> with 78% (74 votes) in favor. When comparing across individual scenes, the votes in the <italic>Static Person</italic> and <italic>Dynamic Person</italic> scene are similarly distributed. <italic>Depthkit Studio</italic> receives on average 64% of the votes, <italic>LiveScan3D</italic> 26% and <italic>VolumetricCapture</italic> 9%. Only in the <italic>Object Interaction</italic> scene, the vote distribution was more evenly distributed, with <italic>LiveScan3D</italic> receiving about 35% and <italic>VolumetricCapture</italic> receiving about 19% of the votes compared to <italic>Depthkit Studio</italic>. For the close-up scenes <italic>Hand</italic> and <italic>Hand Interaction</italic>, <italic>Depthkit Studio</italic> continued to be almost exclusively preferred over <italic>LiveScan3D</italic> with 94% of the votes (<xref ref-type="fig" rid="F10">Figure 10</xref>).</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Vote distribution in the subjective perception study.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g009.tif"/>
</fig>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Fidelity perception study total vote counts.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Votes</th>
<th align="left">Total</th>
<th align="left">Total valid</th>
<th align="left">Total invalid</th>
<th align="left">Depthkit studio vs. LiveScan3D</th>
<th align="left">Depthkit studio vs. VolumetricCapture</th>
<th align="left">LiveScan3D vs.<break/>VolumetricCapture</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Combined</td>
<td align="left">418</td>
<td align="left">352</td>
<td align="left">66</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">Depthkit Studio</td>
<td align="left">249</td>
<td align="left">226</td>
<td align="left">23</td>
<td align="left">144</td>
<td align="left">82</td>
<td align="left">&#x2014;</td>
</tr>
<tr>
<td align="left">LiveScan3D</td>
<td align="left">115</td>
<td align="left">90</td>
<td align="left">25</td>
<td align="left">16</td>
<td align="left">&#x2014;</td>
<td align="left">74</td>
</tr>
<tr>
<td align="left">VolumetricCapture</td>
<td align="left">54</td>
<td align="left">36</td>
<td align="left">18</td>
<td align="left">&#x2014;</td>
<td align="left">16</td>
<td align="left">20</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Vote distribution per scene in the subjective perception study.</p>
</caption>
<graphic xlink:href="frsip-05-1405808-g010.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<sec id="s4-1">
<title>4.1 Result interpretation</title>
<p>
<italic>Depthkit Studio</italic> shows the best results across all measured parameters. It scores highest in spatial accuracy, shows the least intensive artifacts and is the preferred candidate in the subjective perception study. We also found Depthkit Studio to be the most accessible candidate, and therefore suited for users novel to the field of volumetric video. At the same time, <italic>Depthkit Studio</italic> is the most expensive application, with a price tag of 3000 USD per month. This may be at the edge of what is considered low cost or affordable. We would therefore recommend <italic>Depthkit Studio</italic> to users who are confident that a sparse RGBD-camera volumetric video setup meets the needs of their use case and want to extract the highest possible fidelity from their system. In terms of fidelity, LiveScan3D ranks below <italic>Depthkit Studio</italic> by a larger margin, but above <italic>VolumetricCapture.</italic> Hence, we can recommend <italic>LiveScan3D</italic> as a solid entry point for beginning with volumetric captures, when the highest fidelity is not strictly required. As <italic>LiveScan3D</italic> can be modified due to the available source code, it is a suitable platform for volumetric video research and experimentation. <italic>VolumetricCapture</italic> is available as free, but proprietary software. It is specialized to be used in a distributed system mode. This offers advantages for certain use cases, with a high (&#x3e;10) number of sensors and large capture volumes, or when only low-performance hardware is available. At the same time, this distributed approach is labor-intensive with regard to hardware and software setup, making it difficult to recommend this application to first-time users, even though it is well documented. We can therefore only recommend VolumetricCapture in use cases, where a high number of sensors and large capture volumes are required.</p>
<p>The results of the conducted survey show that participants preferred the footage of <italic>Depthkit Studio</italic>, which is the only candidate that employs heavy use of post-processing filters and a mesh and texture-based rendering. Results do not provide insights into which of these aspects contribute more to the visual fidelity, but the work of <xref ref-type="bibr" rid="B30">Zerman et al. (2019)</xref> suggests that a mesh-based visualization is preferred over a point cloud based visualization, provided that it is encoded with a high enough bitrate. Accurate calibration seems to have a rather large effect on the visual quality as well. <italic>Depthkit Studio</italic> consistently showed the least deviation in calibration accuracy across all scenes and was also the most favorably rated application for fidelity.</p>
<p>Our results indicate to developers of volumetric video applications that encoding their captures in a mesh-based format, the use of filters, and spatial calibration methods with high accuracy are advantageous for visual fidelity.</p>
</sec>
<sec id="s4-2">
<title>4.2 Method limitations</title>
<p>Although our proposed benchmark environment was designed to capture footage in a variety of scenarios, it primarily focused on capturing people and their interaction with objects. The benchmark could be expanded to include a wider variety of scenes and environments, such as outdoor settings, capturing entire environments rather than individual subjects, or testing under challenging lighting conditions. Additionally, the benchmark only evaluates how the applications perform relative to each other under identical capture conditions. However, each application may perform differently when the capture environment is adapted to its individual strengths and weaknesses, such as by using more sensors, alternative sensor models or different camera configurations. While our captures provide a general estimate of how the applications perform &#x201c;in the wild,&#x201d; further improvements in visual quality are certainly achievable with tailored adjustments.</p>
<p>Our approach to measuring spatial accuracy provides some basic estimation about image fidelity but lacks detailed technical parameters. Similarly, the artifact analysis lacks a quantifiable measurement of artifact intensity. This may be sufficient for a relative visual perception analysis between captures, but it lacks detailed objective parameters that are required when measuring volumetric videos on a larger scale.</p>
<p>Unlike other publications that have presented visual perception studies of volumetric videos using more nuanced rating scales, our study used only a binary voting system to capture the impressions of the participants. This approach forces participants to clearly choose a single candidate, even if the perceived difference in fidelity is small. This is sufficient to determine which system is relatively preferred over the other, but it doesn&#x2019;t show how large the differences in perceived fidelity between the candidates are. Additionally, our subject pool is limited to a small, homogeneous group from a single profession (game designers). Influenced by their domain knowledge, this group might have a different definition of high fidelity than the general population; therefore, the results of our study might not be applicable to a broader audience. Re-conducting this study with a more nuanced rating scale and a more diverse study pool could provide better insights into which aspects contribute most to providing a sense of high fidelity. The UI and UX analysis of the tested systems could benefit from a systematic evaluation through a user study, as the usability has only been evaluated internally, by a group of technically skilled experts in the field of volumetric capture. Novice users may experience more challenges or would rate the usability of the candidates differently. A usability study could provide valuable insights into how volumetric capture workflows should be designed to help adaptability and efficient usage. Finally, this paper can only capture the state of knowledge in the field at the time of publication. Due to the rapid advances in this relatively young field, the concrete results of the fidelity analysis have a certain expiration date, although we believe that the proposed benchmark itself will be viable for future iterations of sparse camera volumetric video capture systems.</p>
</sec>
<sec id="s4-3">
<title>4.3 Outlook</title>
<p>Although significant advances have been made in visual fidelity, the overall image quality of sparse camera volumetric video capture applications is not yet sufficient for many use cases with higher demands. Dense camera studio captures can provide photorealistic high-fidelity captures today, but are not accessible to most researchers, creators and developers due to their high costs. This is not likely to change any time soon. Affordable sparse camera systems not only fill a niche of lower cost video creation but help to raise awareness for the field of spatial imaging. Certainly, further research and investment into low-cost sparse camera systems is needed to improve volumetric video creation. Better RGBD cameras could provide higher depth resolutions and depth stability, and the software can improve visual fidelity by deploying solid filtering and fusion pipelines, as <italic>Depthkit Studio</italic> shows.</p>
<p>However, RGBD cameras are only one of many possible capture solutions in this emerging field. Monocular depth estimation algorithms have seen major developments in recent years and are already deployed for certain VFX tasks, such as scene relighting or masking. While being a relatively new technique, 4D foundation models have been shown to generate impressive scene depth from only sparse image inputs as well. If these models are proven useful for sparse camera volumetric video capture, specialized and expensive RGBD sensors, might become superfluous. This could further improve accessibility for volumetric video capture. As indicated by the subjective fidelity study, the visualization format of the captured data has a palpable impact on the perceived fidelity. New volumetric visualization formats, such as Gaussian Splatting, remedy classical weaknesses of traditional visualization formats such as transparent, reflective or caustic surfaces.</p>
<p>Although these techniques are new, they rely on the same underlying infrastructure as RGBD-camera based capture workflows, such as spatial calibration, temporal synchronization, data streaming, and fusion. Current sparse camera volumetric video applications are therefore ideally suited to adapt these new techniques. It will be interesting to see which techniques in this rapidly developing field will prevail and how they will contribute to a more accessible and higher fidelity volumetric video creation.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The virtual reality study environment is available on the GitHub repository of Experimental Surgery, Charit&#x00E9; - Universit&#x00E4;tsmedizin Berlin: <ext-link ext-link-type="uri" xlink:href="https://github.com/ExperimentalSurgery/Volumetric_Video_Comparision_Study">https://github.com/ExperimentalSurgery/Volumetric_Video_Comparision_Study</ext-link>. The study results and analysis are available on the following repository: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/records/13920279">https://zenodo.org/records/13920279</ext-link>. The datasets generated for this study, mainly the volumetric video captures, are too large in size (&#x003e;100GB) to be reasonably stored in a permanent repository. The datasets are available without restrictions upon request. To access the data, please contact the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>CR: Conceptualization, data curation, formal analysis, investigation, methodology, project administration, resources, software, validation, visualization, writing&#x2013;original draft, writing&#x2013;review and editing. IMS: funding acquisition, supervision, validation, writing&#x2013;original draft, writing&#x2013;review and editing, visualization. MQ: funding acquisition, supervision, validation, writing&#x2013;original draft, writing&#x2013;review and editing, conceptualization, project administration, resources.</p>
</sec>
<sec sec-type="funding-information" id="s7">
<title>Funding</title>
<p>The authors declare that financial support was received for the research, authorship, and publication of this article. The authors acknowledge the support of the Cluster of Excellence Matters of Activity. Image Space Material funded by the German Research Foundation (grant no. EXC2025&#x2013;390648296) and from the Federal Ministry of Education and Research, project GreifbAR: Skillful interaction of user hands and fingers with real tools in mixed reality worlds (grant no. 16SV8753).</p>
</sec>
<ack>
<p>We would like to thank Dr. Zeynep Akbal, Karl Eisentr&#xe4;ger, Christoph R&#xfc;ger and Dana Ruck for their guidance and review during the writing of this article.</p>
</ack>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.13920279">https://doi.org/10.5281/zenodo.13920279</ext-link> [Accessed 11 October 2024].</p>
</fn>
<fn id="fn2">
<label>2</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/VCL3D/VolumetricCapture/">https://github.com/VCL3D/VolumetricCapture</ext-link> [Accessed 22 March 2024].</p>
</fn>
<fn id="fn3">
<label>3</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/BuildingVolumes/LiveScan3D">https://github.com/BuildingVolumes/LiveScan3D</ext-link> [Accessed 22 March 2024].</p>
</fn>
<fn id="fn4">
<label>4</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.13908942">https://doi.org/10.5281/zenodo.13908942</ext-link> [Accessed 10 October 2024].</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Alain</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zerman</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ozcinar</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Valenzise</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Introduction to immersive video technologies</article-title>,&#x201d; in <source>Immersive video technologies</source> (<publisher-name>Elsevier</publisher-name>), <fpage>3</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1016/B978-0-32-391755-1.00007-92</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Amamra</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Aouf</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>GPU-based real-time RGBD data filtering</article-title>. <source>J. Real-Time Image Proc</source> <volume>14</volume> (<issue>14</issue>), <fpage>323</fpage>&#x2013;<lpage>340</lpage>. <pub-id pub-id-type="doi">10.1007/s11554-014-0453-7</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Barron</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Mildenhall</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Verbin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>P. P.</given-names>
</name>
<name>
<surname>Hedman</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Mip-NeRF 360: unbounded anti-aliased neural radiance fields</article-title>,&#x201d; in <conf-name>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>New Orleans, LA</conf-loc>, <conf-date>June 18&#x2013;24, 2022</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>5460</fpage>&#x2013;<lpage>5469</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR52688.2022.00539</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baumberg</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2002</year>). &#x201c;<article-title>Blending images for texturing 3D models</article-title>,&#x201d; in <source>Procedings of the British machine vision conference 2002</source> (<publisher-loc>Cardiff, United Kingdom</publisher-loc>: <publisher-name>British Machine Vision Association</publisher-name>), <fpage>38.1</fpage>&#x2013;<lpage>38.10</lpage>. <pub-id pub-id-type="doi">10.5244/C.16.38</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Beck</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Froehlich</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Volumetric calibration and registration of multiple RGBD-sensors into a joint coordinate system</article-title>,&#x201d; in <conf-name>2015 IEEE Symposium on 3D User Interfaces (3DUI)</conf-name>, <conf-loc>Arles, France</conf-loc>, <conf-date>March 23-24, 2015</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>89</fpage>&#x2013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1109/3DUI.2015.7131731</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bhat</surname>
<given-names>S. F.</given-names>
</name>
<name>
<surname>Birkl</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wofk</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wonka</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <source>ZoeDepth: zero-shot transfer by combining relative and metric depth</source>. <comment>ArXiv</comment>. <pub-id pub-id-type="doi">10.48550/ARXIV.2302.12288</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bochkovskii</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Delaunoy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Germain</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Santos</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Richter</surname>
<given-names>S. R.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <source>Depth Pro: sharp monocular metric depth in less than a second</source>. <comment>ArXiv</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2410.02073</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>B&#xf6;nsch</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Shapiro</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Volumetric video capture using unsynchronized, low-cost cameras</article-title>,&#x201d; in <source>Proceedings of the 14th international joint conference on computer vision, imaging and computer graphics theory and applications</source> (<publisher-loc>Prague, Czech Republic</publisher-loc>: <publisher-name>VISIGRAPP</publisher-name>), <fpage>255</fpage>&#x2013;<lpage>261</lpage>. <pub-id pub-id-type="doi">10.5220/0007373202550261</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<collab>BuildingVolumes</collab> (<year>2023</year>). <article-title>Unity geometry sequence streamer</article-title>. <source>Github</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://buildingvolumes.github.io/Unity_Geometry_Sequence_Streaming/">https://buildingvolumes.github.io/Unity_Geometry_Sequence_Streaming/</ext-link> (Accessed March 22, 2024)</comment>.</citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chibane</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bansal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lazova</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pons-Moll</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Stereo radiance fields (SRF): learning view synthesis for sparse views of novel scenes</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Nashville, TN</conf-loc>, <conf-date>June 20&#x2013;25, 2021</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>7907</fpage>&#x2013;<lpage>7916</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00782</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <source>From capture to display: a survey on volumetric video</source>. <comment>ArXiv</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2309.05658</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kerbl</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kopanas</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Leimk&#xfc;hler</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Drettakis</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>3D Gaussian splatting for real-time radiance field rendering</article-title>,&#x201d; in <source>2023 ACM transaction on graphics</source>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1145/3592433</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kowalski</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Naruniec</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Daniluk</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Livescan3D: a fast and inexpensive 3D data acquisition system for multiple Kinect v2 sensors</article-title>,&#x201d; in <conf-name>2015 International Conference on 3D Vision</conf-name>, <conf-loc>Lyon</conf-loc>, <conf-date>October 19&#x2013;22, 2015</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>318</fpage>&#x2013;<lpage>325</lpage>. <pub-id pub-id-type="doi">10.1109/3DV.2015.43</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meerits</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Thomas</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Nozick</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Saito</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>FusionMLS: highly dynamic 3D reconstruction with consumer-grade RGB-D cameras</article-title>. <source>Comp. Vis. Media</source> <volume>4</volume>, <fpage>287</fpage>&#x2013;<lpage>303</lpage>. <pub-id pub-id-type="doi">10.1007/s41095-018-0121-0</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Newcombe</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Izadi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hilliges</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Molyneaux</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). &#x201c;<article-title>KinectFusion: real-time dense surface mapping and tracking</article-title>,&#x201d; in <conf-name>2011 10th IEEE International Symposium on Mixed and Augmented Reality</conf-name>, <conf-loc>Basel, Switzerland</conf-loc>, <conf-date>October 26&#x2013;29, 2011</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>127</fpage>&#x2013;<lpage>136</lpage>. <pub-id pub-id-type="doi">10.1109/ISMAR.2011.6092378</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rijal</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pokhrel</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Om</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ojha</surname>
<given-names>V. P.</given-names>
</name>
</person-group> (<year>2023</year>). <source>Comparing depth estimation of azure Kinect and Realsense D435i cameras</source>. <pub-id pub-id-type="doi">10.2139/ssrn.4597442</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<collab>Scatter</collab> (<year>2024</year>). <article-title>Unity expansion package</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.depthkit.tv/unity-expansion-package">https://www.depthkit.tv/unity-expansion-package</ext-link> (Accessed March 22, 2024)</comment>.</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sterzentsenko</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Doumanoglou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Thermos</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zioulis</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zarpalas</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Daras</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Deep soft procrustes for markerless volumetric sensor alignment</article-title>,&#x201d; in <conf-name>2020 IEEE Conference on Virtual Reality and 3D User Interfaces (VR)</conf-name>, <conf-loc>Atlanta, GA</conf-loc>, <conf-date>March 22&#x2013;26, 2020</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>818</fpage>&#x2013;<lpage>827</lpage>. <pub-id pub-id-type="doi">10.1109/VR46266.2020.00106</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sterzentsenko</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Karakottas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Papachristou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zioulis</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Doumanoglou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zarpalas</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>A low-cost, flexible and portable volumetric capturing system</article-title>,&#x201d; in <conf-name>2018 14th International Conference on Signal-Image Technology and Internet-Based Systems (SITIS)</conf-name>, <conf-loc>Las Palmas de Gran Canaria, Spain</conf-loc>, <conf-date>November 26&#x2013;29, 2018</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>200</fpage>&#x2013;<lpage>207</lpage>. <pub-id pub-id-type="doi">10.1109/SITIS.2018.00038</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Subramanyam</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Viola</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Cesar</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Comparing the quality of highly realistic digital humans in 3DoF and 6DoF: a volumetric video case study</article-title>,&#x201d; in <conf-name>2020 IEEE Conference on Virtual Reality and 3D User Interfaces (VR)</conf-name>, <conf-loc>Atlanta, GA</conf-loc>, <conf-date>March 22&#x2013;26, 2020</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>127</fpage>&#x2013;<lpage>136</lpage>. <pub-id pub-id-type="doi">10.1109/VR46266.2020.00031</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>T&#xf6;lgyessy</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dekan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chovanec</surname>
<given-names>&#x13d;.</given-names>
</name>
<name>
<surname>Hubinsk&#xfd;</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Evaluation of the azure Kinect and its comparison to Kinect V1 and Kinect V2</article-title>. <source>Sensors</source> <volume>21</volume>, <fpage>413</fpage>. <pub-id pub-id-type="doi">10.3390/s21020413</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Truong</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Rakotosaona</surname>
<given-names>M.-J.</given-names>
</name>
<name>
<surname>Manhardt</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Tombari</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>SPARF: neural radiance fields from sparse and noisy poses</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Vancouver, BC</conf-loc>, <conf-date>June 17-24, 2023</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>4190</fpage>&#x2013;<lpage>4200</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00408</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Waechter</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Moehrle</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Goesele</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Let there Be color! Large-scale texturing of 3D reconstructions</article-title>,&#x201d; in <source>ECCV 2014 lecture notes in computer science</source> (<publisher-name>Springer International Publishing</publisher-name>), <fpage>836</fpage>&#x2013;<lpage>850</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-10602-1_54</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Leroy</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Cabon</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chidlovskii</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Revaud</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>DUSt3R: geometric 3D vision made easy</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Seattle, WA</conf-loc>, <conf-date>June 16-22, 2024</conf-date>, <fpage>20697</fpage>&#x2013;<lpage>20709</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01956</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <source>Depth anything: unleashing the power of large-scale unlabeled data</source>. <comment>arXiv</comment>. <pub-id pub-id-type="doi">10.48550/ARXIV.2401.10891</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zerman</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ozcinar</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Smolic</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Subjective and objective quality assessment for volumetric video compression</article-title>. <source>Electron. Imaging</source> <volume>31</volume>, <fpage>323-1</fpage>&#x2013;<lpage>323-7</lpage>. <pub-id pub-id-type="doi">10.2352/ISSN.2470-1173.2019.10.IQSP-323</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zerman</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ozcinar</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Smolic</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Textured mesh vs coloured point cloud: a subjective study for volumetric video compression</article-title>,&#x201d; in <conf-name>2020 Twelfth International Conference on Quality of Multimedia Experience (QoMEX)</conf-name>, <conf-loc>Athlone, Ireland</conf-loc>, <conf-date>May 26-28, 2020</conf-date> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/QoMEX48832.2020.9123137</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Herrmann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hur</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jampani</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Cole</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <source>MonST3R: a simple approach for estimating geometry in the presence of motion</source>. <comment>Arxiv:2410.03825</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2410.03825</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Funkhouser</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Deep depth completion of a single RGB-D image</article-title>,&#x201d; in <conf-name>The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Salt Lake City, UT</conf-loc>, <conf-date>June 18-23, 2018</conf-date>, <fpage>175</fpage>&#x2013;<lpage>185</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00026</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>