<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Sig. Proc.</journal-id>
<journal-title>Frontiers in Signal Processing</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Sig. Proc.</abbrev-journal-title>
<issn pub-type="epub">2673-8198</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">883696</article-id>
<article-id pub-id-type="doi">10.3389/frsip.2022.883696</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Signal Processing</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Perceptual evaluation of approaches for binaural reproduction of non-spherical microphone array signals</article-title>
<alt-title alt-title-type="left-running-head">L&#xfc;beck et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/frsip.2022.883696">10.3389/frsip.2022.883696</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>L&#xfc;beck</surname>
<given-names>Tim</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1621917/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Amengual Gar&#xed;</surname>
<given-names>Sebasti&#xe0; V.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Calamia</surname>
<given-names>Paul</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1849015/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Alon</surname>
<given-names>David Lou</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Crukley</surname>
<given-names>Jeffery</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1819352/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ben-Hur</surname>
<given-names>Zamir</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1854770/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Reality Labs Research</institution>, <institution>Meta</institution>, <addr-line>Redmond</addr-line>, <addr-line>WA</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Data Science and Statistics</institution>, <addr-line>Toronto</addr-line>, <addr-line>ON</addr-line>, <country>Canada</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Faculty of Medicine</institution>, <institution>Department of Speech-Language Pathology</institution>, <institution>University of Toronto</institution>, <addr-line>Toronto</addr-line>, <addr-line>ON</addr-line>, <country>Canada</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1354523/overview">Augusto Sarti</ext-link>, Politecnico di Milano, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/555235/overview">Tim Ziemer</ext-link>, University of Bremen, Germany</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/989311/overview">Jaume Segura-Garcia</ext-link>, University of Valencia, Spain</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Tim L&#xfc;beck, <email>tim.luebeck@th-koeln.de</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Audio and Acoustic Signal Processing, a section of the journal Frontiers in Signal Processing</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>15</day>
<month>08</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>2</volume>
<elocation-id>883696</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>02</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>07</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 L&#xfc;beck, Amengual Gar&#xed;, Calamia, Alon, Crukley and Ben-Hur.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>L&#xfc;beck, Amengual Gar&#xed;, Calamia, Alon, Crukley and Ben-Hur</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Microphone arrays consisting of sensors mounted on the surface of a rigid, spherical scatterer are popular tools for the capture and binaural reproduction of spatial sound scenes. However, microphone arrays with a perfectly spherical body and uniformly distributed microphones are often impractical for the consumer sector, in which microphone arrays are generally mounted on mobile and wearable devices of arbitrary geometries. Therefore, the binaural reproduction of sound fields captured with arbitrarily shaped microphone arrays has become an important field of research. In this work, we present a comparison of methods for the binaural reproduction of sound fields captured with non-spherical microphone arrays. First, we evaluated equatorial microphone arrays (EMAs), where the microphones are distributed on an equatorial contour of a rigid, spherical <xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>. Second, we evaluated a microphone array with six microphones mounted on a pair of glasses. Using these two arrays, we conducted two listening experiments comparing four rendering methods based on acoustic scenes captured in different rooms<sup>2</sup>. The evaluation includes a microphone-based stereo approach (sAB stereo), a beamforming-based stereo approach (sXY stereo), beamforming-based binaural reproduction (BFBR), and BFBR with binaural signal matching (BSM). Additionally, the perceptual evaluation included binaural Ambisonics renderings, which were based on measurements with spherical microphone arrays. In the EMA experiment we included a fourth-order Ambisonics rendering, while in the glasses array experiment we included a second-order Ambisonics rendering. In both listening experiments in which participants compared all approaches with a dummy head recording we applied non-head-tracked binaural synthesis, with sound sources only in the horizontal plane. The perceived differences were rated separately for the attributes timbre and spaciousness. Results suggest that most approaches perform similarly to the Ambisonics rendering. Overall, BSM, and microphone-based stereo were rated the best for EMAs, and BFBR and microphone-based stereo for the glasses array.</p>
</abstract>
<kwd-group>
<kwd>binaural reproduction</kwd>
<kwd>microphone arrays</kwd>
<kwd>beamforming</kwd>
<kwd>ambisonics</kwd>
<kwd>perceptual evaluation</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>With the growing number of virtual and augmented reality (AR/VR) applications, capture and immersive reproduction of sound scenes has become increasingly popular. Microphone arrays facilitate the spatial capture of a sound field and its subsequent reproduction, either over loudspeakers or binaurally for a single listener over headphones. In binaural reproduction, which is the focus of this work, the use of microphone arrays offers multiple advantages over dummy head recordings. For instance, arbitrary head orientation can be synthesised and individual head-related transfer functions (HRTFs) can easily be integrated, while performing the signal processing in real-time (<xref ref-type="bibr" rid="B26">Helmholz et al., 2019</xref>; <xref ref-type="bibr" rid="B36">McCormack and Politis, 2019</xref>), making microphone arrays flexible tools. In particular, spherical microphone arrays (SMAs) are favorable array configurations for sound field capture and spatial reproduction. The symmetric spherical body and the uniformly distributed microphones enable encoding the sound field into Ambisonics signals using the spherical harmonics (SH) transform and radial filtering. This process is denoted as SH beamforming. Thereby, the number of microphones determines the highest SH order, and hence, the highest spatial resolution of the encoded sound field. Together with a set of HRTFs, the Ambisonics signals can then be decoded into binaural signals. Ambisonics encoding and (binaural) decoding have been extensively researched in recent years (<xref ref-type="bibr" rid="B7">Avni et al., 2013</xref>; <xref ref-type="bibr" rid="B10">Bernsch&#xfc;tz, 2016</xref>; <xref ref-type="bibr" rid="B59">Zotter and Frank, 2019</xref>) and together form a well established spatial-audio format.</p>
<p>So far, microphone arrays with a perfectly spherical body and with a large number of microphones are only available on dedicated hardware, such as the Eigenmike (<xref ref-type="bibr" rid="B38">Meyer and Elko, 2002</xref>), Zylia array (<xref ref-type="bibr" rid="B60">Zylia Sp. z o.o., 2022</xref>), or several first-order Ambisonics microphones, such as the Ambeo <xref ref-type="bibr" rid="B53">Sennheiser electronic GmbH &#x26; Co. KG (2022)</xref>, or TetraMic <xref ref-type="bibr" rid="B19">Core Sound (2022a)</xref>. The integration of microphone arrays into existing consumer electronic devices, such as AR glasses, smartphones, or any other wearable devices, could open up an interesting field of AR and VR applications. For example, by mounting several microphones on a pair of glasses, the wearer could capture the sound field from their perspective and then reproduce it spatially. However, implementing microphone arrays with perfectly spherical configurations and with many microphones on consumer devices can be challenging. When using such non-spherical microphone arrays without uniform sampling, encoding the captured sound field into Ambisonics signals using the discrete SH transform may suffer from ill-conditioning problems <xref ref-type="bibr" rid="B46">Rafaely (2008</xref>, <xref ref-type="bibr" rid="B47">2015)</xref>; <xref ref-type="bibr" rid="B51">Reddy and Hegde (2017)</xref>. Alternative approaches for the binaural reproduction of non-spherical array configurations are therefore required. Several approaches have been proposed in the literature and are reviewed below.</p>
<p>One approach that is not limited to spherical arrays is beamforming-based binaural reproduction (BFBR) (<xref ref-type="bibr" rid="B23">Duraiswami et al., 2005</xref>; <xref ref-type="bibr" rid="B43">O&#x2019;Donovan et al., 2008</xref>), which applies a similar concept to Ambisonics decoding of SMAs. The sound field is decomposed into components impinging from different directions with a set of beamformers. Each component is then convolved with an HRTF from the corresponding direction. The summation of all convolved sound field components yields the binaural signals to which the listener would be exposed at the place of the microphone array. Depending on the array configuration, the beamforming filters can be calculated based on analytically derived or measured array steering vectors, which describe the anechoic array transfer functions from surrounding far-field sound sources to the microphone array. For SMAs, BFBR and Ambisonics can lead to identical results. <xref ref-type="bibr" rid="B28">Ifergan (2020)</xref> showed under which conditions BFBR and Ambisonics converge to the same solution and presented a theoretical framework for the design of beamformers for spherical and circular arrays.</p>
<p>Recently, <xref ref-type="bibr" rid="B35">Madmoni et al. (2020)</xref> introduced a BFBR method with binaural signal matching (BSM). It extends the BFBR method and directly incorporates the HRTFs into the beamformer coefficients. Applying filter and sum beamforming for each ear, it separately estimates the binaural signals from the array signals directly with a minimum mean square error metric. <xref ref-type="bibr" rid="B49">Rasumow et al. (2011</xref>, <xref ref-type="bibr" rid="B48">2017)</xref> introduced a similar approach denoted as the virtual artificial head. Although they developed it for a planar array without a rigid scatterer, it follows the same filter and sum beamformer principle. A comprehensive comparison of different regularization methods for design of these filters is presented by <xref ref-type="bibr" rid="B50">Rasumow et al. (2016)</xref>. <xref ref-type="bibr" rid="B17">Calamia et al. (2017)</xref> also proposed both BFBR and BSM approaches for use with a microphone array distributed over the surface of a helmet.</p>
<p>
<xref ref-type="bibr" rid="B4">Ahrens et al. (2021c)</xref> introduced a method to analytically derive Ambisonics signals from capture with equatorial microphone arrays (EMAs), which are spherical-rigid bodies with microphones only along the equator. In <xref ref-type="bibr" rid="B2">Ahrens et al. (2021a</xref>,<xref ref-type="bibr" rid="B3">b)</xref>, the authors extended the method for arrays with nearly, rather than perfectly, circular geometries (eXMAs). The basic idea is to bypass the SH transform with a linear combination of the microphone signals, and to pre-calculate frequency-dependent weights. These weights can also be calculated with a set of steering vectors. Hence, this approach can be regarded as a beamformer whose outputs are SH signals. A similar idea was presented in <xref ref-type="bibr" rid="B56">Tourbabin and Rafaely (2015)</xref>, who introduced a beamformer whose output is first-order Ambisonics signals (SH signals after radial filtering). Although the eXMA approach is a promising method, at the time of this work, eXMA was still in the optimization phase and is thus not further evaluated in this study.</p>
<p>A less complex approach is motion-tracked-binaural reproduction (MTB) (<xref ref-type="bibr" rid="B5">Algazi et al., 2004</xref>; <xref ref-type="bibr" rid="B1">Ackermann et al., 2020</xref>). The signals of a pair of opposite (antipodal) microphones of an EMA are directly used as so-called pseudobinaural signals. Different head orientations can be synthesized by interpolation between neighbouring microphone signals. Time differences and scattering effects due to the spherical body between the microphones ensure good restoration of interaural level and time differences (ILDs and ITDs) without HRTF processing. However, no pinnae cues can be synthesized, leading to a loss of high frequency spatial details. Moreover, the absence of pinnae cues complicates the localization of elevated sources, and can lead to front-back confusions <xref ref-type="bibr" rid="B5">Algazi et al. (2004)</xref>. In static (non-head-tracked) binaural synthesis, this approach can be regarded as microphone-based stereo. Another stereo approach is XY stereo. Two microphones with cardioid directivity patterns are placed nearly collocated at 90&#xb0; to each other. ILDs caused by the directivities create a spatial image on the reproduction side. Since the microphones are collocated, XY stereo does not produce any time differences between both channels, and thus can hardly synthesise ITDs. Employing two beamformers and steering to (<italic>&#x3d5;</italic> &#x3d; 45&#xb0;, <italic>&#x3b8;</italic> &#x3d; 90&#xb0;) and (<italic>&#x3d5;</italic> &#x3d; 315&#xb0;, <italic>&#x3b8;</italic> &#x3d; 90&#xb0;) can emulate XY microphony with microphone arrays.<xref ref-type="fn" rid="fn2">
<sup>2</sup>
</xref></p>
<p>All of these approaches have the potential to recreate a spatial image of the captured sound field. This work presents a perceptual comparison of BFBR, BSM, AB stereo, and XY stereo. We conducted two comparative listening experiments with static non-head-tracked binaural synthesis based on the Multiple Stimulus with Hidden Reference and Anchor (MUSHRA) paradigm. BFBR and BSM have mainly been investigated for spherical or circular array configurations (<xref ref-type="bibr" rid="B23">Duraiswami et al., 2005</xref>; <xref ref-type="bibr" rid="B43">O&#x2019;Donovan et al., 2008</xref>; <xref ref-type="bibr" rid="B28">Ifergan, 2020</xref>; <xref ref-type="bibr" rid="B34">Madmoni et al., 2021</xref>). In this work, we are interested in evaluating arbitrary array configurations. Thus, in a first experiment, we evaluated the approaches with capture from EMAs with six and eight microphones. In a second experiment, we evaluated a microphone array with six microphones mounted on a pair of glasses, as an example of a compact consumer array. To compare the performance of the approaches to the established Ambisonics decoding approach, in both experiments, we also included renderings from capture with consumer SMAs: in Experiment 1, fourth-order Ambisonics renderings based on Eigenmike capture (<xref ref-type="bibr" rid="B38">Meyer and Elko, 2002</xref>); in Experiment 2, second-order renderings based on OctoMic capture (<xref ref-type="bibr" rid="B20">Core Sound, 2022b</xref>).</p>
</sec>
<sec id="s2">
<title>2 Materials and methods</title>
<p>This section introduces the microphone arrays and the data used for quantitative and perceptual evaluation. Furthermore, it presents an overview of the fundamental theory of the binaural rendering approaches.</p>
<sec id="s2-1">
<title>2.1 Employed microphone array data</title>
<p>We chose to evaluate the approaches based on two different array configurations, which are introduced in the following sections.</p>
<sec id="s2-1-1">
<title>2.1.1 Equatorial microphone array data</title>
<p>In the first step, we examine EMAs. EMAs are the first degradation step from perfectly spherical arrays to arbitrarily shaped arrays. They are easily reproducible array configurations that were evaluated in comparable studies such as <xref ref-type="bibr" rid="B34">Madmoni et al. (2021)</xref> or <xref ref-type="bibr" rid="B4">Ahrens et al. (2021c)</xref> and therefore well suited for a comparison of the approaches. We decide to use EMAs with six and eight microphones to be comparable to the glasses array. The EMA data are based on the spatial impulse response database of <xref ref-type="bibr" rid="B55">Stade et al. (2012)</xref>, which contains impulse responses for SMAs with 29th-order Lebedev grids in four different rooms with varying reverberation times. SH interpolation at <italic>N</italic> &#x3d; 29 affords nearly artifact-free spatial resampling of the data to EMA grids with six and eight microphones, denoted as EMA6 and EMA8 in the following. The exact sampling is depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>. The array steering vectors <italic>V</italic>(<italic>&#x3c9;</italic>) used for calculating the beamforming filters were analytically simulated with the SOFiA toolbox (<xref ref-type="bibr" rid="B13">Bernsch&#xfc;tz et al., 2012</xref>) for 1730 surrounding source positions on a <italic>N</italic> &#x3d; 35 Lebedev grid. For the BFBR, BSM, and Ambisonics rendering we employed Neumann KU100 HRTFs (<xref ref-type="bibr" rid="B9">Bernsch&#xfc;tz, 2013</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Microphone distribution of the EMA with six microphones (left) and eight microphones (right). Both have a center microphone exactly in the front (the direction of the arrow). The microphones used for AB stereo are indicated with &#x201c;A&#x201d; and &#x201c;B&#x201d;. Only the EMA with eight microphones has microphones at <italic>&#x3d5;</italic> &#x3d; 90&#xb0; and <italic>&#x3d5;</italic> &#x3d; 270&#xb0;.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g001.tif"/>
</fig>
</sec>
<sec id="s2-1-2">
<title>2.1.2 Glasses microphone array data</title>
<p>In the second experiment, we evaluated the performance of the approaches with capture from a wearable microphone array. For this we used impulse response measurements from a 6-microphone array mounted on a pair of glasses. During the measurements, the glasses array was mounted on a KEMAR dummy head. Like the EMAs, the array has a center microphone exactly in the front. Two microphones are at the back of the temple arms, one at the right temple arm, and two more on the front frame of the glasses. A rough diagram of the microphone positions on the glasses array is depicted in <xref ref-type="fig" rid="F2">Figure 2</xref>. The steering vectors were measured in anechoic conditions for 1,020 uniformly distributed surrounding sound sources in the same fashion as presented in <xref ref-type="bibr" rid="B22">Donley et al. (2021)</xref>. Since we only had a binaural reference measured with a KEMAR dummy head for the glasses array data, for the BFBR, BSM, and Ambisonics rendering we employed KEMAR HRTFs (<xref ref-type="bibr" rid="B21">Cuevas-Rodriguez et al., 2019</xref>).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>A diagram of the glasses array and its approximate microphone positions. The array has one microphone exactly at the front (4), two at the back of the temple arms used for AB stereo, one additional microphone at the right temple arm (1), and two at the front of the glasses frame. This figure and the microphone positions are not a depiction of any current or future product.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g002.tif"/>
</fig>
</sec>
<sec id="s2-1-3">
<title>2.1.3 Spherical microphone array data</title>
<p>For the comparative Ambisonics renderings in the quantitative and perceptual evaluation, we employed SMA impulse responses measured under the exact same conditions as for the EMAs and glasses array. For comparison with the EMA renderings, we decided to render capture from an Eigenmike at the fourth order. Since the employed database only includes 29th order SMA measurements, we again applied resampling in the SH domain at <italic>N</italic> &#x3d; 29 to the 32-microphone Eigenmike sampling scheme. It should be mentioned that the resampled signals have the same sampling grid as the original Eigenmike measurements. However, the radius of the original SMA (0.0875&#xa0;m) cannot be adjusted to that of the Eigenmike (0.042&#xa0;m), leading to slightly different aliasing effects. For comparison to the glasses array, we rendered 8-channel OctoMic data of the second order.</p>
</sec>
</sec>
<sec id="s2-2">
<title>2.2 Rendering approaches</title>
<sec id="s2-2-1">
<title>2.2.1 Beamforming-based binaural reproduction</title>
<p>The general idea of the BFBR approach is to filter and sum each microphone signal <inline-formula id="inf1">
<mml:math id="m1">
<mml:mi mathvariant="bold">x</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> with the beamforming filters <inline-formula id="inf2">
<mml:math id="m2">
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>
<disp-formula id="e1">
<mml:math id="m3">
<mml:mi>z</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">x</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>where <italic>&#x3c9;</italic> is the angular frequency, <italic>M</italic> is the number of microphones on the array surface, and <italic>z</italic>(<italic>&#x3c9;</italic>) is the frequency dependent beamformer output for a specific direction. The beamforming filters can be calculated analytically as is typically done for spherical beamforming (<xref ref-type="bibr" rid="B23">Duraiswami et al., 2005</xref>; <xref ref-type="bibr" rid="B43">O&#x2019;Donovan et al., 2008</xref>; <xref ref-type="bibr" rid="B54">Song et al., 2011</xref>), or calculated with measured steering vectors <italic>V</italic>(<italic>&#x3c9;</italic>). In this work, we employed Maximum Directivity (MD) beamformers calculated according to (<xref ref-type="bibr" rid="B22">Donley et al., 2021</xref>, <xref ref-type="disp-formula" rid="e1">eq. (1)</xref>&#x2013;<xref ref-type="disp-formula" rid="e4">(4</xref>). MD beamformers maximize the directivity index for the respective direction. Steering multiple beams to uniformly distributed directions into the sound field, convolving each with an HRTF for the corresponding direction, and summing up, yields the binaural signals <italic>b</italic>(<italic>&#x3c9;</italic>)<sup>
<italic>l</italic>,<italic>r</italic>
</sup>. Several design parameters influence the performance of BFBR. For spherical and cylindrical arrays, <xref ref-type="bibr" rid="B28">Ifergan (2020)</xref> presented a framework for BFBR design. A crucial design parameter is the number of beams, which was further investigated in <xref ref-type="bibr" rid="B27">Ifergan and Rafaely (2022)</xref>. A large number of beams leads to an overlap and thus to a spatial low-pass characteristic. Too few beams leads to poor spatial resolution and loudness instabilities. This is illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>, which shows binaural signals calculated with the BFBR method from simulated array signals of a single plane wave impinging on an EMA6 from the frontal direction. The BFBR method was performed with different numbers of beams, which are indicated with different colors. Additionally, the frontal HRTF is depicted as the dashed black line. The figure shows that with increasing the number of beams, the spectral roll-off compared to the frontal HRTF increases. In this work, we used 32 uniformly distributed MD beams for binaural reproduction, as preliminary listening tests demonstrated the best results for our array geometries. The spectral roll-off was equalized with a minimum phase filter, which compensates for the deviation of the transfer function of a reference microphone from the transfer function of the BFBR output from a single plane wave impinging on the array from the frontal direction.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Magnitude spectra of frontal binaural signals rendered with BFBR with different numbers of beams, based on a single plane wave impinging on an EMA6 from the frontal direction. Additionally, an HRTF of a KU100 dummy head for the frontal direction is depicted. The BFBR signals were calculated with 12, 32, 240, and 1730 uniformly distributed beams.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g003.tif"/>
</fig>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Beamforming-based binaural reproduction with matching of binaural signals</title>
<p>Similar to regular BFBR, for BSM the array signals <italic>x</italic>(<italic>&#x3c9;</italic>) are filtered and summed with pre-calculated filters <italic>c</italic>(<italic>&#x3c9;</italic>)<sup>
<italic>l</italic>,<italic>r</italic>
</sup>
<disp-formula id="e2">
<mml:math id="m4">
<mml:mi>b</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">x</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>leading to the binaural signals <italic>b</italic>(<italic>&#x3c9;</italic>)<sup>
<italic>l</italic>,<italic>r</italic>
</sup>. Here, one set of filter coefficients is required for each ear separately. To calculate the BSM filters it is assumed that the sound field consists of <italic>L</italic> acoustic events (sound sources) <inline-formula id="inf3">
<mml:math id="m5">
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. Hence, the binaural signals a listener would be exposed to in the sound field are<disp-formula id="e3">
<mml:math id="m6">
<mml:mi>p</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">s</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>with <inline-formula id="inf4">
<mml:math id="m7">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> being the HRTFs for the directions of the sound sources <italic>s</italic>. The BSM filters can be calculated by minimizing the error<disp-formula id="e4">
<mml:math id="m8">
<mml:msup>
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>b</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>The mathematical derivation of this optimization problem is presented in <xref ref-type="bibr" rid="B35">Madmoni et al. (2020</xref>, <xref ref-type="bibr" rid="B34">2021)</xref>. For higher reproduction accuracy towards higher frequencies, we applied the optimization only with respect to the magnitude, starting at a transition frequency of 1.5&#xa0;kHz. This already showed significant improvements for Ambisonics decoding (<xref ref-type="bibr" rid="B52">Sch&#xf6;rkhuber et al., 2018</xref>; <xref ref-type="bibr" rid="B33">L&#xfc;beck et al., 2020</xref>; <xref ref-type="bibr" rid="B8">Ben-hur et al., 2021</xref>). One design parameter for BSM filters is the number of sound sources <italic>L</italic> (steering vectors) used for the calculation of the filters. <xref ref-type="bibr" rid="B35">Madmoni et al. (2020)</xref> showed that a number of <italic>L</italic> &#x3d; 240 leads to perceptually good results, which is why we also decided to use a subset of 240 steering vectors. Moreover, the exact positions of the microphones on the array surface are distinctive parameters for BSM. As shown in <xref ref-type="bibr" rid="B34">Madmoni et al. (2021)</xref> for static reproduction, it is favourable to use microphone locations close to the positions of the listener&#x2019;s ears, while for dynamic binaural synthesis, uniform distribution of the microphones along the equator has advantages. In our study, both EMAs have uniformly distributed microphones. However, only the EMA8 has microphones exactly at <italic>&#x3d5;</italic> &#x3d; 90&#xb0; and <italic>&#x3d5;</italic> &#x3d; 270&#xb0;. On the glasses array, the microphones closest to the ears are on the back of the glasses&#x2019; temple arms.</p>
</sec>
<sec id="s2-2-3">
<title>2.2.3 Stereo approaches</title>
<p>For the AB stereo approach, we directly used the impulse responses of the AB microphones without any processing or equalization. The AB microphones on the EMAs are depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>. Again, it is worth mentioning that the EMA8 has microphones at <italic>&#x3d5;</italic> &#x3d; 90&#xb0; and <italic>&#x3d5;</italic> &#x3d; 270&#xb0;, while the EMA6 does not. For the glasses array case, we used the microphones on the temple arms of the glasses (labeled <bold>A</bold> and <bold>B</bold> in <xref ref-type="fig" rid="F2">Figure 2</xref>).</p>
<p>For XY stereo, we employed two MD beamformers, as used for the BFBR renderings, steering to (<italic>&#x3d5;</italic> &#x3d; 45&#xb0;, <italic>&#x3b8;</italic> &#x3d; 90&#xb0;) and (<italic>&#x3d5;</italic> &#x3d; 315&#xb0;, <italic>&#x3b8;</italic> &#x3d; 90&#xb0;). Since the beams originate from the center of the array, the beamforming-based XY stereo also can hardly produce any ITD cues. Similar to AB stereo, we did not apply any post-processing or equalization. We adapted both methods from the stereo recording with microphones. To emphasize that we simulated these techniques with microphone arrays, we refer to them as sAB (simulated AB) and sXY (simulated XY) in the following.</p>
</sec>
<sec id="s2-2-4">
<title>2.2.4 Ambisonics</title>
<p>The comparative Ambisonics renderings were calculated with<disp-formula id="e5">
<mml:math id="m9">
<mml:mi>b</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:msubsup>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>where <italic>A</italic>
<sub>
<italic>nm</italic>
</sub> are the Ambisonics signals of the SMA, <italic>H</italic>
<sub>
<italic>nm</italic>
</sub> are the SH coefficients of the employed HRTF set, and <italic>N</italic> is the SH rendering order. The Ambisonics signals from the resampled Eigenmike data were calculated by transforming to the SH domain and radial filtering with 30&#xa0;dB soft-limited radial filters from the SOFiA toolbox (<xref ref-type="bibr" rid="B12">Bernsch&#xfc;tz et al., 2011b</xref>,<xref ref-type="bibr" rid="B11">a</xref>). The Ambisonics signals of the OctoMic data were directly obtained from the VST plugin VVOctoEncode<xref ref-type="fn" rid="fn3">
<sup>1</sup>
</xref>. For the Ambisonics renderings in the perceptual evaluation, we applied Magnitude Least-Squares (MagLS) optimization to the HRTFs, as proposed by <xref ref-type="bibr" rid="B52">Sch&#xf6;rkhuber et al. (2018)</xref> and <xref ref-type="bibr" rid="B59">Zotter and Frank (2019)</xref>.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<title>3 Quantitative evaluation</title>
<p>For quantitative evaluation, we considered single plane waves impinging on the microphone arrays. First, we analyzed the restoration of the interaural cues. Since we applied static binaural synthesis in the perceptual evaluation for ITD and ILD analysis, we did not synthesize binaural signals for different head orientations. The ITD and ILD analysis is based on a single plane wave impinging on the microphone array from 360 directions in the horizontal plane in steps of 1&#xb0; (0&#xb0; &#x2264; <italic>&#x3d5;</italic> &#x3c; 360&#xb0;, <italic>&#x3b8;</italic> &#x3d; 90&#xb0;). For the EMAs and the comparative Eigenmike renderings, we simulated the impinging plane waves, just as with the ATF set. For the glasses and the OctoMic array, we used array impulse responses measured with loudspeakers with a distance of 1.53&#xa0;m in anechoic conditions with exponential sine sweeps, which fairly approximates plane waves. As the reference, for both array types, we directly used HRTFs for the corresponding directions from the respective database - for the EMAs the KU100 HRTFs (<xref ref-type="bibr" rid="B9">Bernsch&#xfc;tz, 2013</xref>), for the glasses array, the KEMAR HRTFs (<xref ref-type="bibr" rid="B21">Cuevas-Rodriguez et al., 2019</xref>). For each impinging plane wave we applied the BFBR, BSM, sAB, and sXY renderings. Additionally, we synthesized Ambisonics renderings according to <xref ref-type="disp-formula" rid="e5">Eq. (5)</xref>. The ITDs were calculated according to <xref ref-type="bibr" rid="B32">Kulkarni et al. (1999)</xref> and the ILDs as the ratio between the energies of the left and right ear signals. ILDs are generally highly frequency dependent, and hence, the interpretation of the ILDs must be done with reservation. <xref ref-type="fig" rid="F4">Figure 4</xref> depicts the ITDs in ms and the ILDs in dB of the EMA6 as functions of the sound incidence directions (<italic>x</italic>-axis). 
Additionally, the ILDs and ITDs of the KU100 HRTFs are depicted by the black dashed line as a reference. The ITD curve of the Ambisonics rendering matches the reference curve quite well, followed by the curves for BSM and sAB stereo. sXY stereo produces some notable excursions and seems to perform the worst. This supports the assumption that beamforming-based XY stereo can hardly synthesize ITDs. For the ILDs, sAB stereo matches the reference curve the best, whereas BSM and Ambisonics perform similarly to each other, but with larger underestimates. The ILD curve of sXY stereo exhibits some outliers, specifically near 45&#xb0; and 315&#xb0;. Both BFBR and sXY ILD curves are very jagged. This might be due to the beams calculated with few microphones which exhibit side-lobes. In contrast, the BSM or Ambisonics ILD curves are quite smooth.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>ITDs and ILDs of the EMA6 array using different binaural reproduction approaches. The reference values (black dashed lines) were calculated from the KU100 HRTFs.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g004.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F5">Figure 5</xref> depicts the ITD and ILD errors with respect to the dummy-head reference. Additionally, it shows the corresponding just noticeable differences (JNDs) of ITDs and ILDs, which are indicated as the gray shaded area. The JNDs of the ITDs are a function of the reference ITD and were shown to be about 20 <italic>&#x3bc;s</italic> in the front (<xref ref-type="bibr" rid="B40">Mossop and Culling, 1997</xref>), and 100 <italic>&#x3bc;s</italic> for lateral sound incidence (<xref ref-type="bibr" rid="B40">Mossop and Culling, 1997</xref>; <xref ref-type="bibr" rid="B6">Andreopoulou and Katz, 2017</xref>). For the ILDs the figure shows a broadband JND of 1&#xa0;dB according to (<xref ref-type="bibr" rid="B58">Yost and Dye, 1988</xref>; <xref ref-type="bibr" rid="B39">Mills, 1960</xref>; <xref ref-type="bibr" rid="B14">Blauert, 1996</xref>, ch. 2). The figures indicate that BSM and Ambisonics ITDs are mostly below the JND. BFBR and sAB stereo notably exceed the JND at 90&#xb0; and 270&#xb0;. The ILD errors of all approaches are clearly above the JND for most directions. Comparing with the ILDs and ITDs of the EMA8 (<xref ref-type="fig" rid="F6">Figure 6</xref>) shows that Ambisonics, BSM, and BFBR perform comparably to the EMA6, for both metrics. The sAB stereo ILD curves of the EMA8 are notably shifted compared to the curve of the EMA6, and match the reference curve quite well. This is due to the different positions of the sAB microphones, and is also supported by <xref ref-type="fig" rid="F7">Figure 7</xref> depicting the ITD and ILD errors of the EMA8. sXY stereo still performs the worst. sAB and BSM both exhibit a dip at the top of the ILD curve at around 90&#xb0; and 270&#xb0;, which can also slightly be seen in the reference curve. Interestingly, for the EMA6 this dip can only be seen in the sAB curve.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>ITD and ILD errors of the EMA6 array using different binaural reproduction approaches. The shaded gray areas represent JNDs.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g005.tif"/>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>ITDs and ILDs of the EMA8 array using different binaural reproduction approaches. The reference values (black dashed lines) were calculated from the KU100 HRTFs.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g006.tif"/>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>ITD and ILD errors for the EMA8 array using different binaural reproduction approaches. The shaded gray areas represent JNDs.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g007.tif"/>
</fig>
<p>The ILDs and ITDs of the glasses array are depicted in <xref ref-type="fig" rid="F8">Figure 8</xref>. Since the array transfer functions from the glasses array were only available for 60 directions in the horizontal plane, we linearly interpolated the ITDs and ILDs to the same 360 directions used for the EMA analyses for <xref ref-type="fig" rid="F8">Figures 8</xref>, <xref ref-type="fig" rid="F9">9</xref>. <xref ref-type="fig" rid="F8">Figure 8</xref> shows that the ITD and ILD curves are not exactly symmetric, unlike the curves for the EMAs. This is due to the non-symmetric distribution of the microphones on the glasses array. This is most clearly visible in the sXY stereo curve, which, again, has a notable excursion near 270&#xb0;. For the glasses array, BSM performs better than the N &#x3d; 2 Ambisonics rendering and matches the reference curve quite well. Whereas for the EMA6 BSM produces slight underestimates in the ITD curve, for the glasses array it produces slightly larger ITDs compared to the reference. sAB stereo leads to a similar ITD curve as BSM. BFBR and Ambisonics perform comparably but notably worse than sAB and BSM. For the ILDs, BSM also performs better than Ambisonics; however, sAB stereo seems to match the reference curve the best. The ITD errors in <xref ref-type="fig" rid="F9">Figure 9</xref> show that BSM is still below the JND for most of the incidence directions. Interestingly, both stereo approaches have smaller ITD errors for the glasses array compared to the EMAs. BFBR produces errors above the ITD JND for 90&#xb0; and 270&#xb0;. In the ILD error curve no systematic difference compared to the EMAs can be observed.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>ILDs and ITDs of the glasses array using different binaural reproduction approaches. The reference values (black dashed lines) were calculated from the KEMAR HRTFs.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g008.tif"/>
</fig>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>ITD and ILD errors for the glasses array using different binaural reproduction approaches. The shaded gray areas represent JNDs.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g009.tif"/>
</fig>
<p>Next, we analyzed the spectral differences in the form of the averaged differences of the magnitude spectra, calculated with<disp-formula id="e6">
<mml:math id="m10">
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi>G</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munder>
<mml:mn>20</mml:mn>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>log</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>where <italic>b</italic>
<sub>
<italic>ref</italic>
</sub> are the reference binaural signals, and &#x3a9;<sub>
<italic>d</italic>
</sub> is the set of <italic>N</italic>
<sub>
<italic>d</italic>
</sub> directions of the binaural signals (where &#x3a9;<sub>
<italic>d</italic>
</sub> is a set of 360 directions in the horizontal plane in steps of 1&#xb0;). <xref ref-type="fig" rid="F10">Figure 10</xref> depicts the spectral differences of the EMA6 and shows that Ambisonics and BSM lead to similar differences. While Ambisonics performs better near 1&#xa0;kHz, BSM performs better at higher frequencies. BFBR leads to slightly larger errors than BSM at 1&#xa0;kHz, but performs equivalently at higher frequencies. sXY stereo leads to notable differences even at frequencies up to 1.1 kHz, which matches the findings from the ILD/ITD figures. The largest magnitude errors are at frequencies above 10&#xa0;kHz for both stereo approaches, most probably due to the lack of pinnae cues.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Magnitude differences for the EMA6.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g010.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F11">Figure 11</xref> depicts the average magnitude differences of the binaural signals calculated from the glasses array capture. The figure clearly shows that the magnitude differences are higher than for the EMAs. Again, the highest errors can be observed for the sXY stereo renderings. The magnitude differences of the BSM and the Ambisonics renderings are the lowest. The differences of BSM are below 10&#xa0;dB for almost all frequencies. The differences for Ambisonics clearly increase above approximately 16&#xa0;kHz. The figure shows a similar trend as with the EMAs, that is, that BSM has larger magnitude differences at lower frequencies compared to Ambisonics, but similar or even lower errors at higher frequencies. Different from the EMAs, for the glasses array, BFBR has larger magnitude errors for nearly all frequencies compared to BSM and Ambisonics. Again, the errors of both stereo curves increase at very high frequencies.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Magnitude differences for the glasses array.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g011.tif"/>
</fig>
</sec>
<sec id="s4">
<title>4 EXPERIMENT 1: Equatorial microphone array</title>
<p>The quantitative evaluation suggests that BSM and sAB stereo can lead to similar perceptual results to the Ambisonics renderings. To evaluate this hypothesis, we conducted two comparative listening experiments. In Experiment 1, we evaluated the EMAs. The second experiment, with the glasses microphone array, is described below in <xref ref-type="sec" rid="s5">Section 5</xref>.</p>
<sec id="s4-1">
<title>4.1 Methods</title>
<sec id="s4-1-1">
<title>4.1.1 Stimuli</title>
<p>We used the SMA data measured in the dry studio room <italic>control room 1</italic> (CR1) with a reverberation time of <italic>RT</italic>
<sub>60</sub> &#x3d; 0.25&#xa0;s (at 1&#xa0;kHz) and a source distance of 2.4 m, and the SMA data measured in a concert hall <italic>small broadcast studio</italic> (SBS) with <italic>RT</italic>
<sub>60</sub> &#x3d; 1&#xa0;s (at 1&#xa0;kHz) and a source distance of 6&#xa0;m, from the database compiled by <xref ref-type="bibr" rid="B55">Stade et al. (2012)</xref>. We resampled the SMA signals as described in <xref ref-type="sec" rid="s2">Section 2</xref> for the EMA6, the EMA8, and the Eigenmike sampling scheme. From the EMA6 and EMA8 arrays we then synthesized binaural room impulse responses (BRIRs) with the BFBR, BSM, sAB stereo, and sXY stereo methods and, additionally, fourth-order Ambisonics decodings with MagLS optimization from the Eigenmike array signals. As anechoic test signals, we used a basic acoustic drum kit (basedrum, snare, hi-hat) and a speech sample. The employed database also includes BRIRs measured with a Neumann KU100 dummy head used as the binaural reference in the experiment. All stimuli were matched in loudness according to <xref ref-type="bibr" rid="B30">ITU-R BS.1770-4 (2015)</xref>.</p>
</sec>
<sec id="s4-1-2">
<title>4.1.2 Paradigm</title>
<p>We used a test design based on the Multiple Stimulus with Hidden Reference and Anchor (MUSHRA) paradigm proposed by <xref ref-type="bibr" rid="B29">ITU-R BS.1534-3 (2015)</xref>, which enables comparing multiple stimuli directly. In our case, the participants&#x2019; task was to rate the differences of the renderings compared to a binaural reference. The paradigm consisted of several blocks of comparisons presented as screens/pages within the software. For each MUSHRA screen/page, the five renderings BFBR, BSM, sAB, sXY, and Ambisonics, as well as the hidden reference and the reference indicated as such, were presented. Contrary to the recommendation of the ITU, we did not include an anchor. The participants were provided with a graphical user interface (GUI) which displayed sliders for each stimulus ranging from 0 to 100. Further, we did not ask for the overall quality compared to the reference, but for the perceived differences in terms of <italic>timbre</italic> or <italic>spaciousness</italic>. Accordingly, the sliders were labeled with &#x201c;Huge&#x201d;, &#x201c;Significant&#x201d;, &#x201c;Moderate&#x201d;, &#x201c;Small&#x201d;, and &#x201c;No&#x201d;, adapted from the MUSHRA experiment in <xref ref-type="bibr" rid="B33">L&#xfc;beck et al. (2020)</xref>. Differences in the timbre are related to any differences in coloration. Differences in the spaciousness are related to any spatial differences, like perceived source position, source distance, externalization, or the source width. In the experiment, participants rated eight MUSHRA pages in total: EMA6 in the CR1 with the speech signal, EMA6 in the CR1 with the drums test signal, EMA8 in CR1 with the drums signal, and EMA6 in the SBS with the drums signal. These factor combinations were repeated for the two metrics, timbre and spatial differences. We did not set up a complete factorial design with all factor combinations to avoid the experiment being too long. 
At the beginning of the experiment, participants conducted training consisting of user interface familiarization and signal familiarization.</p>
</sec>
<sec id="s4-1-3">
<title>4.1.3 Participants</title>
<p>19 participants took part in the experiment, most of whom were staff of the audio group at Reality Labs Research at Meta; none reported any hearing issues.</p>
</sec>
<sec id="s4-1-4">
<title>4.1.4 Setup</title>
<p>The experiment was conducted in remote settings. It was implemented in Matlab and shared with each participant, who conducted the test with their own equipment, <italic>i.e.</italic>, their PC or Mac, audio device, and headphones. We recommended the use of Beyerdynamic DT990 Pro headphones, which were used by 15 participants. According to the choice of headphones, the binaural chain was equalized with appropriate headphone compensation filters provided by <xref ref-type="bibr" rid="B13">Bernsch&#xfc;tz et al. (2012)</xref>. If no headphone filters were available in the database, no equalization was applied, which was the case for two participants. During the training phase, participants were instructed to adjust the volume to a comfortable level that should not change during the experiment. All participants were asked to perform the test in a room which was as quiet as possible.</p>
</sec>
</sec>
<sec id="s4-2">
<title>4.2 Data analysis</title>
<p>To evaluate participants&#x2019; rating differences between renderings, we ranked each rendering within each comparison of stimulus and attribute (by each MUSHRA screen). We then analyzed the ranks for each rendering using a hierarchical multivariate ordinal regression under a Bayesian framework.</p>
<p>We combined the data from Experiment 1 and Experiment 2, in order to pool variance estimates across the two experiments. The multivariate ordinal regression model regressed rendering rank for both <italic>spaciousness</italic> and <italic>timbre</italic> attributes as a function of included raw rating (MUSHRA points), rendering approach (BFBR, BSM, sXY, sAB, Ambisonics (<italic>N</italic> &#x3d; 2 and <italic>N</italic> &#x3d; 4)), room, array configuration, test signal and all interaction terms as population-level effects, and subject, trial, room, and subject group as varying (group-level) effects, with correlation estimates for rendering approach and array configuration. The multilevel nature of our model facilitated partial pooling of group-level data, and thus parameter estimates. With partial pooling, the probability of each response choice is modeled for each participant and the data for all participants also informs the estimates for each participant (<xref ref-type="bibr" rid="B25">Gelman and Hill, 2006</xref>).</p>
<p>In the Bayesian framework, regression models calculate the distribution of parameter estimates as the posterior distribution. In this case, our model estimated the posterior distribution of each rank for each combination of participant, attribute, rendering approach, test signal, room, and array configuration, for each Markov-chain Monte Carlo (MCMC) iteration. To derive a single estimate of ranking in each independent variable combination, we calculated the weighted sum of rankings for each MCMC iteration as follows<disp-formula id="e7">
<mml:math id="m11">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:munderover>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>where <italic>p</italic> is the expected probability of rank <italic>k</italic> at each iteration <italic>i</italic>.</p>
<p>In order to evaluate ranking differences between renderings, we calculated the posterior distribution of differences between rankings for each MCMC iteration for each independent variable combination. Ranking difference estimates for which the highest density credible interval does not include zero are considered statistically significant differences.</p>
<p>All models were constructed using the Stan programming language (<xref ref-type="bibr" rid="B18">Carpenter et al., 2017</xref>) through the cmdstan (<xref ref-type="bibr" rid="B24">Gabry and &#x10c;e&#x161;novar, 2021</xref>) and brms (<xref ref-type="bibr" rid="B16">B&#xfc;rkner, 2017</xref>, <xref ref-type="bibr" rid="B15">2018</xref>) packages in R statistical computing software (<xref ref-type="bibr" rid="B45">R Core Team, 2021</xref>).</p>
</sec>
<sec id="s4-3">
<title>4.3 Results</title>
<p>A graphical overview of the results is presented in <xref ref-type="fig" rid="F12">Figure 12</xref> in the form of boxplots of the inter-subject variance in the MUSHRA points for each MUSHRA screen and rated attribute separately. The plots show that for the timbre attribute (top) most of the ratings are within the range of the Ambisonics rendering. An exception is the box for the BFBR results from the EMA6 in reverberant conditions. Overall, sAB stereo and BSM achieved the highest median ratings. They are consequently higher than the median ratings of the Ambisonics renderings. The results of the spaciousness attribute (bottom) show that only sAB stereo and BSM were rated similar to or higher than the Ambisonics rendering. An interesting observation is that in the reverberant condition, BSM was rated significantly better than all other renderings. A similar trend is shown in the boxplots for the timbre results in the reverberant condition. Comparing the results of the spaciousness attribute for the EMA6 and EMA8 with drums shows that the EMA8 might be favourable for the sAB stereo approach. This might be due to the microphone distribution.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>EXPERIMENT 1: Boxplots of the inter-individual variation in the MUSHRA points for each MUSHRA page separately. <bold>(A)</bold> Timbre: dry EMA6 drums, <bold>(B)</bold> Timbre: dry EMA6 speech, <bold>(C)</bold> Timbre: dry EMA8 drums, <bold>(D)</bold> Timbre: reverberation EMA6 drums, <bold>(E)</bold> Spaciousness: dry EMA6 drums, <bold>(F)</bold> Spaciousness: dry EMA6 speech, <bold>(G)</bold> Spaciousness: dry EMA8 drums, <bold>(H)</bold> Spaciousness: reverberation EMA6 drums.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g012.tif"/>
</fig>
<p>Median ranks for each rendering and signal, together with 89% credible intervals, are shown in <xref ref-type="fig" rid="F13">Figure 13</xref>. The median ranks support the findings from the boxplots that sAB and BSM were rated the best for the timbre attribute. To investigate how the approaches performed compared to Ambisonics, the median rank differences between each rendering and Ambisonics <italic>N</italic> &#x3d; 4, together with 89% credible intervals and asterisks indicating statistically significant differences, are shown in <xref ref-type="fig" rid="F14">Figure 14</xref>. Visual inspection reveals that for the spaciousness attribute, Ambisonics <italic>N</italic> &#x3d; 4 was ranked higher than BFBR for speech with the EMA6 in dry conditions, sAB was ranked higher with the EMA8 in dry conditions, and BSM was ranked higher with the EMA6 in reverberant conditions. For the attribute timbre, sAB was ranked higher for drums and BSM was ranked higher for both drums and speech. Both sAB and BSM were also ranked higher with EMA8 in dry conditions. BSM and sXY were ranked higher with EMA6 in the reverberant room. BFBR was always ranked in the range of Ambisonics for the timbre attribute.</p>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>EXPERIMENT 1: Median ranks for each rendering by attribute, array configuration, test signal, and rendering approach. Points represent median rank and error bars depict the 89% highest density credible interval.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g013.tif"/>
</fig>
<fig id="F14" position="float">
<label>FIGURE 14</label>
<caption>
<p>EXPERIMENT 1: Median rank differences between renderings and Ambisonics <italic>N</italic> &#x3d;4 by attribute, room, array configuration, and test signal. Points represent median rank difference and error bars depict the 89% highest density credible interval. Asterisks indicate statistically significant differences.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g014.tif"/>
</fig>
<p>The median rank differences (<xref ref-type="fig" rid="F14">Figure 14</xref>) suggest that sAB and BSM perform the best for the EMAs. Mostly, all rendering approaches were rated in a similar range as the <italic>N</italic> &#x3d; 4 Ambisonics rendering.</p>
</sec>
</sec>
<sec id="s5">
<title>5 EXPERIMENT 2: Glasses microphone array</title>
<sec id="s5-1">
<title>5.1 Methods</title>
<p>For the second experiment, we employed array impulse responses measured in a room with variable acoustics for two different source positions (loudspeaker 1&#xa0;at 23&#xb0; with a distance of 2&#xa0;m, loudspeaker 2&#xa0;at 325&#xb0; with a distance of 1.5&#xa0;m). We used measurements in dry conditions (<italic>RT</italic>
<sub>60</sub> &#x3d; 0.447 s, at 1&#xa0;kHz), and in more reverberant conditions (<italic>RT</italic>
<sub>60</sub> &#x3d; 0.564 s, at 1&#xa0;kHz). The measurements were done with the 6-microphone glasses array described in <xref ref-type="sec" rid="s2-1-2">Section 2.1.2</xref>, and for the comparative Ambisonics renderings with an 8-microphone OctoMic array. The binaural reference in Experiment 2 was measured with a KEMAR dummy head. The test signals were the same as for Experiment 1 such that in total participants again rated eight MUSHRA pages: dry conditions with loudspeaker 1 (spk 1) and the drums signal, dry conditions with spk 1 and the speech signal, dry conditions with spk 2 and drums signal, and the reverberant condition with spk 1 and the drums signal. Again, to avoid the experiment being too long, we only tested a subset of all factor combinations.</p>
<p>Since no headphone compensation filters were available for the KEMAR dummy head, the second experiment was conducted without any headphone equalization.</p>
<p>In all other aspects, setup, procedure, and data analysis were identical to Experiment 1. All 19 subjects participated in both experiments.</p>
</sec>
<sec id="s5-2">
<title>5.2 Results</title>
<p>A graphical overview of the results is presented in <xref ref-type="fig" rid="F15">Figure 15</xref> in the form of boxplots of the inter-subject variance in the MUSHRA points for each MUSHRA page and the timbre and spaciousness attributes, separately. As the quantitative evaluation suggests, the glasses array is the more challenging condition. However, except for the results of the BSM renderings for spk 2, all timbre ratings are within the range of the Ambisonics results. For the glasses array, BFBR and sAB seem to perform the best regarding the timbre. For the spaciousness results, the boxplots do not indicate any approach as being the best. Only the results for sXY stereo for spk 2 are notably worse compared to the other conditions.</p>
<fig id="F15" position="float">
<label>FIGURE 15</label>
<caption>
<p>EXPERIMENT 2: Boxplots of the inter-individual variation in the MUSHRA points for each MUSHRA page separately. <bold>(A)</bold> Timbre: dry spk1 drums, <bold>(B)</bold> Timbre: dry spk1 speech, <bold>(C)</bold> Timbre: dry spk2 drums, <bold>(D)</bold> Timbre: reverberation spk1 drums, <bold>(E)</bold> Spaciousness: dry spk1 drums, <bold>(F)</bold> Spaciousness: dry spk1 speech, <bold>(G)</bold> Spaciousness: dry spk2 drums, <bold>(H)</bold> Spaciousness: reverberation spk1 drums.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g015.tif"/>
</fig>
<p>Median ranks for each rendering and test signal, together with 89% credible intervals, are shown in <xref ref-type="fig" rid="F16">Figure 16</xref>.</p>
<fig id="F16" position="float">
<label>FIGURE 16</label>
<caption>
<p>EXPERIMENT 2: Median ranks for each rendering by attribute, array configuration, room, test signal, and rendering. Points represent median rank and error bars depict the 89% highest density credible interval.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g016.tif"/>
</fig>
<p>To investigate how the approaches performed compared to the Ambisonics renderings, <xref ref-type="fig" rid="F17">Figure 17</xref> shows the median rank differences between each rendering approach and Ambisonics <italic>N</italic> &#x3d; 2, together with 89% credible intervals and asterisks indicating statistically significant differences. Visual inspection reveals that for the spaciousness attribute, Ambisonics <italic>N</italic> &#x3d; 2 was ranked higher than sXY for spk 2 in dry conditions. For the timbre attribute, BFBR was ranked higher for speech with spk 1 in dry conditions and for drums with spk 1 in reverberant conditions. Ambisonics <italic>N</italic> &#x3d; 2 was ranked higher than BSM with spk 2 in dry conditions. Median rank differences suggest that in most cases all renderings were rated similar to the Ambisonics rendering.</p>
<fig id="F17" position="float">
<label>FIGURE 17</label>
<caption>
<p>EXPERIMENT 2: Median rank differences between renderings and Ambisonics <italic>N</italic> &#x3d;2 by attribute, array configuration, room, and test signal. Points represent median rank differences and error bars depict the 89% highest density credible interval. Asterisks indicate statistically significant differences.</p>
</caption>
<graphic xlink:href="frsip-02-883696-g017.tif"/>
</fig>
</sec>
</sec>
<sec id="s6">
<title>6 General discussion</title>
<p>A primary motivation of the study was to investigate if capture from non-spherical arrays, together with the approaches sAB stereo, sXY stereo, BFBR, or BSM, can lead to auralization that is comparable to the established Ambisonics chain. Both quantitative and perceptual evaluation suggest that for EMAs with six and eight microphones, sAB stereo and BSM performed comparably to, and mostly better than, the fourth-order Ambisonics reproduction of SMA capture with an Eigenmike sampling scheme. This is surprising, considering the increased microphone count (32) of the Eigenmike compared to the EMA6 and EMA8. For the glasses array with six microphones, BFBR and sAB stereo performed comparably only to a second-order Ambisonics reproduction of SMA capture with an OctoMic. Hence, the glasses array is certainly the more challenging array configuration; that is also supported by the quantitative evaluation.</p>
<p>It can be assumed that sAB stereo highly depends on the location of the microphones; this is supported by the ITD and ILD analyses. The results of the EMA experiment show that regarding the spaciousness, microphones at <italic>&#x3d5;</italic> &#x3d; 90&#xb0; and <italic>&#x3d5;</italic> &#x3d; 270&#xb0; might be advantageous. However, the timbre is not affected by the microphone position. Furthermore, we could not find any significant difference in the performance of sAB stereo between the EMAs and the glasses array.</p>
<p>The listening experiment results show that sXY reproduces the sound scene with a relatively accurate timbre. However, sXY stereo does not lead to good spatial reproduction. This strongly matches the findings from the quantitative evaluation. sXY cannot restore the correct ITDs and ILDs. This is, for one thing, due to collocation of the beams and, for another, due to non-optimal beamforming. MD beamforming with a small number of microphones introduces side lobes, which might cause the ITD and ILD distortions. Moreover, the original XY stereophony employs microphones with cardioid directivity instead of maximum directivity.</p>
<p>BFBR was rated better for the glasses array than for the EMAs; the quantitative evaluation does not clearly support this. For example, BFBR has larger magnitude differences for the glasses array than for the EMAs compared to BSM or Ambisonics. It might be due to the inconsistent use of the rating scale. Another explanation could be that for the EMAs all microphones are in the horizontal plane, leading to a lack of height information. This does not affect the ITD and ILD analysis since we only considered horizontal sound incidences but may affect complex sound scenes, with reflections from all directions. Furthermore, BFBR assumes far-field sound sources. For the EMAs, the source distance was 2.4 and 6&#xa0;m; for the glasses array 1.5 and 2&#xa0;m. This might further influence the performance of BFBR. Moreover, it is interesting to mention that BSM behaves in the opposite way; it was rated better for the EMAs.</p>
<p>
<xref ref-type="bibr" rid="B34">Madmoni et al. (2021)</xref> investigated the influence of the microphone distribution on the performance of BSM. However, they only investigated semi-circular array configurations. They concluded that for static reproduction, microphones placed close to the ears are favourable. For dynamic binaural synthesis, uniformly sampling on a full-circular array has advantages. Our study did not find any significant difference between the EMA6 and EMA8. Future work is suggested to develop design criteria for optimal array configurations for the BSM method.</p>
<p>sAB has the clear advantage of not being affected by any undersampling artifacts, such as spatial aliasing. However, it is the only approach that cannot synthesize binaural signals for different head orientations in the form evaluated in this paper. One possible method to adapt the binaural signals is presented in <xref ref-type="bibr" rid="B41">Nagel et al. (2020)</xref> and <xref ref-type="bibr" rid="B42">Nagel and Jax (2021)</xref>. The authors proposed a binaural cue adaptation of static binaural recordings. For this, the recorded signals are divided into coherent and incoherent components. The coherent components, which mainly generate the ITDs and ILDs and are important for spatial perception (<xref ref-type="bibr" rid="B31">Jeffress and Robinson, 1962</xref>; <xref ref-type="bibr" rid="B57">Trahiotis et al., 2001</xref>), are then adapted to the listeners&#x2019; head orientation based on a spherical head model. Another approach would be the MTB method (<xref ref-type="bibr" rid="B5">Algazi et al., 2004</xref>), which interpolates between neighbouring microphone signals according to the listeners&#x2019; head orientation.</p>
<p>The significant advantage of sXY stereo is its simplicity; it does not require HRTF processing, similarly to sAB stereo. However, sXY stereo does not necessarily require microphones at the position of the listener&#x2019;s ears. Binaural signals for different head orientations could be synthesized by varying the directions of the XY beams.</p>
<p>The clear benefit of BFBR is that it is the most flexible approach. Since the sound field is decomposed into different directional components, it can easily be manipulated. This could be used to either synthesize different head orientations or amplify specific directions of the sound field. Moreover, different HRTFs can easily be integrated since they are not incorporated in the beamforming coefficients, as with BSM, for example. Moreover, beamforming plays an important role in consumer devices, for example, for applications that enhance speech intelligibility.</p>
<p>Overall, BSM seems to reproduce the most accurate binaural signals. However, the BSM filters already incorporate the HRTFs, which is why a complete set of BSM filters is required for each head orientation. Hence, applying dynamic binaural synthesis would require a large set of beamforming coefficients.</p>
<p>This study only focuses on scene-based approaches, i.e., re-synthesis of the entire captured scene. In future work, it would also be conceivable to apply parametric approaches, like DirAC (<xref ref-type="bibr" rid="B44">Pulkki, 2007</xref>) or SIRR (<xref ref-type="bibr" rid="B37">Merimaa and Pulkki, 2004</xref>). With beamforming, objects or specific dominant sound sources of the sound field could be extracted and spatially rendered.</p>
</sec>
<sec id="s7">
<title>7 Conclusion</title>
<p>We presented a comparison of approaches for the binaural rendering of capture from equatorial microphone arrays and capture from a glasses microphone array. A MUSHRA-like listening experiment applying non-head-tracked binaural synthesis showed that the approaches have potential to synthesize spatial sound scenes with similar quality as Ambisonics renderings from spherical microphone array capture with a similar number of microphones. Beamforming-based binaural reproduction with binaural signal matching and a microphone-based stereo approach performed the best for equatorial arrays. For the glasses array, beamforming-based binaural reproduction and microphone-based stereo performed the best. The results further suggest that for non-head-tracked binaural reproduction, the more sophisticated beamforming approaches (BSM or BFBR) do not outperform the simple microphone-based stereo approach. Future work is suggested to investigate how the approaches perform with head-tracked dynamic binaural reproduction. Moreover, in the current study, we only focused on sound sources in the horizontal plane. The performance of the approaches with elevated sound sources or vertical head movements needs to be investigated in future work.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s9">
<title>Ethics statement</title>
<p>The studies involving human participants were reviewed and approved by an internal research review committee and an external institutional review board (IRB). Written informed consent for participation was not required for this study in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec id="s10">
<title>Author contributions</title>
<p>As part of the internship, TL designed the project, implemented or maintained the tested algorithms, and conducted the listening experiment. He also wrote the first draft of the manuscript. ZH and DA supervised the project. ZH, DA, SA, and PC assisted in refining the research question and experimental design. JC designed and performed the statistical analysis. All authors contributed to manuscript revision, and read and approved the submitted version.</p>
</sec>
<sec id="s11">
<title>Funding</title>
<p>This work was done during an internship with Reality Labs Research at Meta.</p>
</sec>
<ack>
<p>The authors would like to thank Lior Madmoni and Boaz Rafaely for providing us with the code of BSM, as well as our colleagues from Reality Labs - Research, Vladimir Tourbabin, Jacob Donley, Sam Clapp, and Andrew Luck who greatly assisted in this internship project. Many thanks also to all voluntary participants.</p>
</ack>
<sec sec-type="COI-statement" id="s12">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn1">
<label>&#x2a;</label>
<p>also at Institute of Communications Engineering, TH K&#xf6;ln - University of Applied Sciences, Cologne, D-50679, Germany.</p>
</fn>
<fn id="fn2">
<label>1</label>
<p>Throughout this article, <italic>&#x3d5;</italic> &#x2208; [0&#xb0;, 360&#xb0;) denotes the azimuth angle, and <italic>&#x3b8;</italic> &#x2208; [0&#xb0;, 180&#xb0;] the colatitude angle.</p>
</fn>
<fn id="fn3">
<label>2</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://www.vvaudio.com/landing/VVOctoEncode_OctoMic">https://www.vvaudio.com/landing/VVOctoEncode_OctoMic.</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ackermann</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Fiedler</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Brinkmann</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Weinzierl</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>On the acoustic qualities of dynamic pseudobinaural recordings</article-title>. <source>J. Audio Eng. Soc.</source> <volume>68</volume>, <fpage>418</fpage>&#x2013;<lpage>427</lpage>. <pub-id pub-id-type="doi">10.17743/jaes.2020.0036</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ahrens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Helmholz</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Alon</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Gar&#xed;</surname>
<given-names>S. V. A.</given-names>
</name>
</person-group> (<year>2021a</year>). &#x201c;<article-title>A head-mounted microphone array for binaural rendering</article-title>,&#x201d; in <conf-name>International Conference on Immersive and 3D Audio</conf-name>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>. </citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ahrens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Helmholz</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Alon</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Gar&#xed;</surname>
<given-names>S. V. A.</given-names>
</name>
</person-group> (<year>2021b</year>). &#x201c;<article-title>Spherical harmonic decomposition of a sound field based on microphones around the circumference of a human head</article-title>,&#x201d; in <conf-name>Workshop on Applications of Signal Processing to Audio and Acoustics</conf-name>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahrens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Helmholz</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Alon</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Gar&#xed;</surname>
<given-names>S. V. A.</given-names>
</name>
</person-group> (<year>2021c</year>). <article-title>Spherical harmonic decomposition of a sound field based on observations along the equator of a rigid spherical scatterer</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>150</volume>, <fpage>805</fpage>&#x2013;<lpage>815</lpage>. <pub-id pub-id-type="doi">10.1121/10.0005754</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Algazi</surname>
<given-names>V. R.</given-names>
</name>
<name>
<surname>Duda</surname>
<given-names>R. O.</given-names>
</name>
<name>
<surname>Thompson</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Motion-tracked binaural sound</article-title>. <source>AES J. Audio Eng. Soc.</source> <volume>52</volume>, <fpage>1142</fpage>&#x2013;<lpage>1156</lpage>. </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Andreopoulou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Katz</surname>
<given-names>B. F. G.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Identification of perceptually relevant methods of inter-aural time difference estimation</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>142</volume>, <fpage>588</fpage>&#x2013;<lpage>598</lpage>. <pub-id pub-id-type="doi">10.1121/1.4996457</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Avni</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ahrens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Geier</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Spors</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wierstorf</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Spatial perception of sound fields recorded by spherical microphone arrays with varying spatial resolution</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>133</volume>, <fpage>2711</fpage>&#x2013;<lpage>2721</lpage>. <pub-id pub-id-type="doi">10.1121/1.4795780</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ben-hur</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Alon</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Mehra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Binaural reproduction based on bilateral Ambisonics and ear-aligned HRTFs</article-title>,&#x201d; in <conf-name>IEEE/ACM Transaction on Audio, Speech, and Language Processing</conf-name>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/TASLP.2021.3055038</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bernsch&#xfc;tz</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>A spherical far field HRIR/HRTF compilation of the Neumann KU 100</article-title>,&#x201d; in <conf-name>Proceedings of the 39th DAGA (Meran)</conf-name>, <fpage>592</fpage>&#x2013;<lpage>595</lpage>. </citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bernsch&#xfc;tz</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Microphone arrays and sound field decomposition for dynamic binaural recording</source>. <publisher-loc>Berlin</publisher-loc>: <publisher-name>Technische Universit&#xe4;t Berlin</publisher-name>, <fpage>264</fpage>. <pub-id pub-id-type="doi">10.14279/depositonce-5082</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bernsch&#xfc;tz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>P&#xf6;rschmann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Spors</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Weinzierl</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2011a</year>). &#x201c;<article-title>SOFiA sound field analysis toolbox</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Spatial Audio (ICSA)</conf-name>, <fpage>8</fpage>&#x2013;<lpage>16</lpage>. </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bernsch&#xfc;tz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>P&#xf6;rschmann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Spors</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Weinzierl</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2011b</year>). <article-title>Soft-Limiting der modalen Amplitudenverst&#xe4;rkung bei sph&#xe4;rischen Mikrofonarrays im Plane Wave Decomposition Verfahren Einleitung Begrenzung der Verst&#xe4;rkung Kompensation des Frequenzgangs Verifikation Zusammenfassung</article-title>. <source>Proc. 37th DAGA (D&#xfc;sseldorf)</source> <volume>2</volume>, <fpage>661</fpage>&#x2013;<lpage>662</lpage>. </citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bernsch&#xfc;tz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Stade</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>R&#xfc;hl</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Sound field analysis in room Acoustics</article-title>,&#x201d; in <conf-name>27th Tonmeistertagung - VDT International Convention</conf-name>. </citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Blauert</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1996</year>). <source>Spatial hearing</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>Hirzel Verlag Stuttgart</publisher-name>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>B&#xfc;rkner</surname>
<given-names>P.-C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Advanced Bayesian multilevel modeling with the R package brms</article-title>. <source>R J.</source> <volume>10</volume>, <fpage>395</fpage>&#x2013;<lpage>411</lpage>. <pub-id pub-id-type="doi">10.32614/RJ-2018-017</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>B&#xfc;rkner</surname>
<given-names>P.-C.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>brms: An R package for Bayesian multilevel models using Stan</article-title>. <source>J. Stat. Softw.</source> <volume>80</volume>, <fpage>1</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v080.i01</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Calamia</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Smalt</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Weston</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>A conformal, helmet-mounted microphone array for auditory situational awareness and hearing protection</article-title>,&#x201d; in <conf-name>2017 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)</conf-name>, <fpage>96</fpage>&#x2013;<lpage>100</lpage>. </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carpenter</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gelman</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hoffman</surname>
<given-names>M. D.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goodrich</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Betancourt</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Stan: A probabilistic programming language</article-title>. <source>J. Stat. Softw.</source> <volume>76</volume>. <pub-id pub-id-type="doi">10.18637/jss.v076.i01</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<comment>[Dataset]</comment> <collab>Core Sound</collab> (<year>2022a</year>). <article-title>Core sound Octomic<sup>&#x2122;</sup>
</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.core-sound.com/products/octomic">https://www.core-sound.com/products/octomic</ext-link>
</comment>. </citation>
</ref>
<ref id="B20">
<citation citation-type="web">
<comment>[Dataset]</comment> <collab>Core Sound</collab> (<year>2022b</year>). <article-title>Core sound Octomic<sup>&#x2122;</sup>
</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.core-sound.com/products/octomic">https://www.core-sound.com/products/octomic</ext-link>
</comment>. </citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cuevas-Rodriguez</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alon</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Clapp</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>P. W.</given-names>
</name>
<name>
<surname>Mehra</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Evaluation of the effect of head-mounted display on individualized head-related transfer functions</article-title>,&#x201d; in <conf-name>Proceedings of the International Congress on Acoustics</conf-name>, <conf-date>2019-September</conf-date>, <fpage>2635</fpage>&#x2013;<lpage>2642</lpage>. <pub-id pub-id-type="doi">10.18154/RWTH-CONV-239516</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Donley</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tourbabin</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.-S.</given-names>
</name>
<name>
<surname>Broyles</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Easycom: An augmented reality dataset to support algorithms for easy communication in noisy environments</article-title>. <source>arXiv:2107.04174</source>. </citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Duraiswami</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zotkin</surname>
<given-names>D. N.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Grassi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Gumerov</surname>
<given-names>N. A.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>L. S.</given-names>
</name>
</person-group> (<year>2005</year>). <source>High order spatial audio capture and its binaural head-tracked playback over headphones with HRTF cues</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>HRTF</publisher-name>. </citation>
</ref>
<ref id="B24">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Gabry</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>&#x10c;e&#x161;novar</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Cmdstanr: R interface to &#x2019;CmdStan&#x2019;</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://discourse.mc-stan.org">https://discourse.mc-stan.org</ext-link>
</comment>. </citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gelman</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hill</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2006</year>). <source>Data analysis using regression and multilevel/hierarchical models</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>. </citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Helmholz</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Andersson</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ahrens</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Real-time implementation of binaural rendering of high-order spherical microphone array signals</article-title>,&#x201d; in <source>Proceedings of the 45th DAGA</source> (<publisher-loc>Rostock, GE</publisher-loc>: <publisher-name>DAGA</publisher-name>), <fpage>2</fpage>&#x2013;<lpage>5</lpage>. </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ifergan</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>On the selection of the number of beamformers in beamforming-based binaural reproduction</article-title>. <source>EURASIP J. Audio Speech Music Process.</source> <volume>6</volume>. <pub-id pub-id-type="doi">10.1186/s13636-022-00238-7</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="thesis">
<person-group person-group-type="author">
<name>
<surname>Ifergan</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Theoretical framework for beamformer distribution in Beamforming based Binaural Reproduction</article-title>,&#x201d;. <comment>Ph.D. thesis</comment>. </citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<comment>[Dataset]</comment> <collab>ITU-R BS.1534-3</collab> (<year>2015</year>). <source>Method for the subjective assessment of intermediate quality level of audio systems</source>. </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<collab>ITU-R BS.1770-4</collab> (<year>2015</year>). <article-title>Algorithms to measure audio programme loudness and true-peak audio level BS Series Broadcasting service (sound)</article-title>. <source>Radiocommunication Sect. ITU</source> <volume>4</volume>. </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jeffress</surname>
<given-names>L. A.</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>D. E.</given-names>
</name>
</person-group> (<year>1962</year>). <article-title>Formulas for the coefficient of interaural correlation for noise</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>34</volume>, <fpage>1658</fpage>&#x2013;<lpage>1659</lpage>. <pub-id pub-id-type="doi">10.1121/1.1909077</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kulkarni</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Isabelle</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Colburn</surname>
<given-names>H. S.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Sensitivity of human subjects to head-related transfer-function phase spectra</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>105</volume>, <fpage>2821</fpage>&#x2013;<lpage>2840</lpage>. <pub-id pub-id-type="doi">10.1121/1.426898</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>L&#xfc;beck</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Helmholz</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Arend</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>P&#xf6;rschmann</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ahrens</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Perceptual evaluation of mitigation approaches of impairments due to spatial undersampling in binaural rendering of spherical microphone array data</article-title>. <source>J. Audio Eng. Soc.</source> <volume>68</volume>, <fpage>428</fpage>&#x2013;<lpage>440</lpage>. <pub-id pub-id-type="doi">10.17743/jaes.2020.0038</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Madmoni</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Donley</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tourbabin</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Binaural reproduction from microphone array signals incorporating head-tracking</article-title>,&#x201d; in <source>Immersive and 3D audio: From architecture to automotive</source>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.1109/i3da48870.2021.9610940</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Madmoni</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Donley</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tourbabin</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Beamforming-based binaural reproduction by matching of binaural signals</article-title>,&#x201d; in <conf-name>Proceedings of the AES International Conference on Audio for Virtual and Augmented Reality</conf-name>, <fpage>318</fpage>&#x2013;<lpage>322</lpage>. <comment>vol. 8</comment>. <pub-id pub-id-type="doi">10.1016/s0967-2109(00)00016-8</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>McCormack</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Politis</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Sparta &#x26; compass: Real-time implementations of linear and parametric spatial audio reproduction and processing methods</article-title>,&#x201d; in <conf-name>Proceedings of the AES Conference on Immersive and Interactive Audio</conf-name>, <conf-loc>York, UK</conf-loc>, <fpage>E-brief 111</fpage>. </citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Merimaa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pulkki</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Spatial impulse response rendering</article-title>,&#x201d; in <conf-name>Proceedings of the 7th International Conference on Digital Audio Effects (Naples)</conf-name>, <fpage>139</fpage>&#x2013;<lpage>144</lpage>. </citation>
</ref>
<ref id="B38">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Meyer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Elko</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2002</year>). &#x201c;<article-title>A highly scalable spherical microphone array based on an orthonormal decomposition of the soundfield</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name> (<publisher-loc>Orlando, FL, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1781</fpage>&#x2013;<lpage>1784</lpage>. <comment>vol. 2</comment>. <pub-id pub-id-type="doi">10.1109/ICASSP.2002.5744968</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mills</surname>
<given-names>A. W.</given-names>
</name>
</person-group> (<year>1960</year>). <article-title>Lateralization of high-frequency tones</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>32</volume>, <fpage>132</fpage>&#x2013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1121/1.1907864</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mossop</surname>
<given-names>J. E.</given-names>
</name>
<name>
<surname>Culling</surname>
<given-names>J. F.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Lateralization for large interaural delays</article-title>. <source>Br. J. Audiology</source> <volume>31</volume>, <fpage>99</fpage>. <pub-id pub-id-type="doi">10.1121/1.424369</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nagel</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Haupt</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jax</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Coherence-adaptive binaural cue adaptation</article-title>,&#x201d; in <conf-name>AES International Conference on Audio for Virtual and Augmented Reality (AVAR) (Audio Engineering Society (AES))</conf-name>, <fpage>1</fpage>&#x2013;<lpage>3</lpage>. </citation>
</ref>
<ref id="B42">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nagel</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jax</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>On the use of additional microphones in binaural cue adaptation 2 signal model 1 introduction speech communication</article-title>,&#x201d; in <conf-name>14th ITG Conference</conf-name>, <fpage>54</fpage>&#x2013;<lpage>58</lpage>. </citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Donovan</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Zotkin</surname>
<given-names>D. N.</given-names>
</name>
<name>
<surname>Duraiswami</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>Spherical microphone array based immersive audio scene rendering</article-title>,&#x201d; in <conf-name>Proceedings of the 14th International Conference on Auditory Display</conf-name>, <conf-loc>Paris, France</conf-loc>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pulkki</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Spatial sound reproduction with directional audio coding</article-title>. <source>J. Audio Eng. Soc.</source> <volume>55</volume>, <fpage>503</fpage>&#x2013;<lpage>516</lpage>. </citation>
</ref>
<ref id="B45">
<citation citation-type="book">
<collab>R Core Team</collab> (<year>2021</year>). <source>R: A language and environment for statistical computing</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>R Foundation for Statistical Computing</publisher-name>. </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2008</year>). <source>IEEE Trans. Audio Speech Lang. Process.</source> <volume>16</volume>, <fpage>740</fpage>&#x2013;<lpage>747</lpage>. <pub-id pub-id-type="doi">10.1109/tasl.2008.920059</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Fundamentals of spherical array processing</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>. <pub-id pub-id-type="doi">10.1007/978-3-642-11130-3</pub-id> </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rasumow</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Blau</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Doclo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Van De Par</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hansen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Puschel</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Perceptual Evaluation of individualized binaural reproduction using a virtual artificial head</article-title>. <source>J. Audio Eng. Soc.</source> <volume>65</volume>, <fpage>448</fpage>&#x2013;<lpage>459</lpage>. <pub-id pub-id-type="doi">10.17743/jaes.2017.0012</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rasumow</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Blau</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hansen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Doclo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Van De Par</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mellert</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Robustness of virtual artificial head topologies with respect to microphone positioning</article-title>. <source>Proc. Forum Acusticum</source> <volume>2011</volume>, <fpage>2251</fpage>&#x2013;<lpage>2256</lpage>. </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rasumow</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hansen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Van De Par</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>P&#xfc;schel</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mellert</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Doclo</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Regularization approaches for synthesizing HRTF directivity patterns</article-title>. <source>IEEE/ACM Trans. Audio Speech Lang. Process.</source> <volume>24</volume>, <fpage>215</fpage>&#x2013;<lpage>225</lpage>. <pub-id pub-id-type="doi">10.1109/TASLP.2015.2504874</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Reddy</surname>
<given-names>C. S.</given-names>
</name>
<name>
<surname>Hegde</surname>
<given-names>R. M.</given-names>
</name>
</person-group> (<year>2017</year>). <source>On the conditioning of the spherical harmonic matrix for spatial audio applications</source>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sch&#xf6;rkhuber</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zaunschirm</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Holdrich</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Binaural rendering of Ambisonic signals via magnitude least squares</article-title>. <source>Proc. 44th DAGA (M&#xfc;nchen)</source> <volume>4</volume>, <fpage>339</fpage>&#x2013;<lpage>342</lpage>. </citation>
</ref>
<ref id="B53">
<citation citation-type="web">
<comment>[Dataset]</comment> <collab>Sennheiser electronic GmbH &#x26; Co. KG</collab> (<year>2022</year>). <article-title>Sennheiser Ambeo VR mic</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://en-de.sennheiser.com/microphone-3d-audio-ambeo-vr-mic">https://en-de.sennheiser.com/microphone-3d-audio-ambeo-vr-mic</ext-link>
</comment>. </citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ellermeier</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Hald</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Psychoacoustic evaluation of multichannel reproduced sounds using binaural synthesis and spherical beamforming</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>130</volume>, <fpage>2063</fpage>&#x2013;<lpage>2075</lpage>. <pub-id pub-id-type="doi">10.1121/1.3628323</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Stade</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bernsch&#xfc;tz</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>R&#xfc;hl</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>A spatial audio impulse response compilation captured at the WDR broadcast studios</article-title>,&#x201d; in <conf-name>Proceedings of the 27th Tonmeistertagung - VDT International Convention</conf-name>, <fpage>551</fpage>&#x2013;<lpage>567</lpage>. </citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tourbabin</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Rafaely</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Direction of arrival estimation using microphone array processing for moving humanoid robots</article-title>. <source>IEEE/ACM Trans. Audio Speech Lang. Process.</source> <volume>23</volume>, <fpage>2046</fpage>&#x2013;<lpage>2058</lpage>. <pub-id pub-id-type="doi">10.1109/TASLP.2015.2464671</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trahiotis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bernstein</surname>
<given-names>L. R.</given-names>
</name>
<name>
<surname>Akeroyd</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Manipulating the &#x201c;straightness&#x201d; and &#x201c;curvature&#x201d; of patterns of interaural cross correlation affects listeners&#x2019; sensitivity to changes in interaural delay</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>109</volume>, <fpage>321</fpage>&#x2013;<lpage>330</lpage>. <pub-id pub-id-type="doi">10.1121/1.1327579</pub-id> </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yost</surname>
<given-names>W. A.</given-names>
</name>
<name>
<surname>Dye</surname>
<given-names>R. H.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>Discrimination of interaural differences of level as a function of frequency</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>83</volume>, <fpage>1846</fpage>&#x2013;<lpage>1851</lpage>. <pub-id pub-id-type="doi">10.1121/1.396520</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zotter</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Frank</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Ambisonics A practical 3D audio theory for recording, studio production, sound reinforcement, and virtual reality</source>. <publisher-loc>Berlin Heidelberg</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>. <pub-id pub-id-type="doi">10.1007/978-3-030-17207-7</pub-id> </citation>
</ref>
<ref id="B60">
<citation citation-type="web">
<comment>[Dataset]</comment> <collab>Zylia Sp. z o.o.</collab> (<year>2022</year>). <article-title>ZYLIA ZM-1 Microphone</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.zylia.co/zylia-zm-1-microphone.html">https://www.zylia.co/zylia-zm-1-microphone.html</ext-link>
</comment>. </citation>
</ref>
</ref-list>
</back>
</article>