<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2026.1756386</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Attention to speech modulates distortion product otoacoustic emissions evoked by speech-derived stimuli in humans</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Steinebach</surname> <given-names>Janna</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<uri xlink:href="https://loop.frontiersin.org/people/3323305"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Reichenbach</surname> <given-names>Tobias</given-names></name>
<xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/219548"/>
</contrib>
</contrib-group>
<aff id="aff1"><institution>Department Artificial Intelligence in Biomedical Engineering, Friedrich-Alexander-Universit&#x000E4;t Erlangen-N&#x000FC;rnberg</institution>, <city>Erlangen</city>, <country country="DE">Germany</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Tobias Reichenbach, <email xlink:href="mailto:tobias.j.reichenbach@fau.de">tobias.j.reichenbach@fau.de</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-02">
<day>02</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>20</volume>
<elocation-id>1756386</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Steinebach and Reichenbach.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Steinebach and Reichenbach</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Humans are remarkably skilled at understanding speech in noisy environments. While segregation of different audio streams is mostly accomplished in the auditory cortex, neural feedback connections run from the cortex to the brainstem and to the cochlea. The latter organ not only houses the mechanosensitive hair cells, but also possesses an active process enabling it to amplify sound in a frequency-dependent manner. A physiological correlate of the active process are distortion-product otoacoustic emissions (DPOAEs) that can be measured non-invasively from the ear canal. Here we employed speech-like DPOAEs, measured in response to stimuli derived from natural human speech and thus reflecting the harmonic spectral structure of voiced speech. We show that these emissions are modulated by selective attention to one of two competing voices, as well as by intermodal attention. Specifically, speech-like DPOAEs evoked by stimuli related to resolved harmonics of a voice were significantly reduced when that voice was attended compared to when it was ignored. No such effect was observed for stimuli related to unresolved harmonics of the target voice when the competing voice&#x00027;s harmonics in that range were unresolved as well, indicating that attentional modulation is specific to those components of voiced speech that are spectrally resolved. Our findings support the hypothesis that the cochlea&#x00027;s active process already shapes selective attention to speech in noise. Moreover, the speech-like DPOAEs that we developed open up further possibilities for investigating the contribution of the cochlear active process to auditory scene analysis in naturalistic settings.</p></abstract>
<kwd-group>
<kwd>auditory attention</kwd>
<kwd>distortion products</kwd>
<kwd>efferent feedback</kwd>
<kwd>inner ear biology</kwd>
<kwd>MOC system</kwd>
<kwd>otoacoustic emissions</kwd>
<kwd>speech processing</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>Deutsche Forschungsgemeinschaft</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/501100001659</institution-id>
</institution-wrap>
</funding-source>
</award-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was funded by the Deutsche Forschungsgemeinschaft (German Research Foundation) through grant 514955521 (to TR).</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="1"/>
<equation-count count="3"/>
<ref-count count="66"/>
<page-count count="13"/>
<word-count count="10322"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Auditory Cognitive Neuroscience</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Understanding speech in noisy environments is an important yet highly complex human ability. In crowded settings such as restaurants or family gatherings, selective auditory attention enables listeners to focus on a single speaker while filtering out competing voices&#x02014;also known as the cocktail party effect (<xref ref-type="bibr" rid="B31">McDermott, 2009</xref>; <xref ref-type="bibr" rid="B8">Cherry, 1953</xref>). This ability is essential for many aspects of social participation, but is vulnerable to hearing damage: many people with hearing impairment complain of difficulty understanding speech in background noise, even when using hearing aids (<xref ref-type="bibr" rid="B42">Plomp, 1978</xref>).</p>
<p>The neural machinery behind selective auditory attention, including attention to speech, has been extensively studied at the level of the cerebral cortex (<xref ref-type="bibr" rid="B44">Pugh et al., 1996</xref>; <xref ref-type="bibr" rid="B27">Lakatos et al., 2013</xref>; <xref ref-type="bibr" rid="B34">Mesgarani and Chang, 2012</xref>; <xref ref-type="bibr" rid="B12">Ding and Simon, 2012</xref>; <xref ref-type="bibr" rid="B22">Horton et al., 2013</xref>). However, anatomical and physiological evidence points to substantial descending feedback from the auditory cortex to the auditory brainstem and to the cochlea (<xref ref-type="bibr" rid="B24">Huffman and Henson, 1990</xref>; <xref ref-type="bibr" rid="B40">Pickles, 1988</xref>; <xref ref-type="bibr" rid="B64">Winer et al., 1998</xref>). Through these neural feedback loops, subcortical processing centers may contribute to accomplishing the cocktail party effect. Several studies have indeed found that neural responses from the brainstem, in particular frequency-following responses, can be modulated by selective auditory attention, although others did not find an attentional effect (<xref ref-type="bibr" rid="B18">Galbraith et al., 2003</xref>; <xref ref-type="bibr" rid="B16">Forte et al., 2017</xref>; <xref ref-type="bibr" rid="B14">Etard et al., 2019</xref>; <xref ref-type="bibr" rid="B59">Strauss et al., 2025</xref>; <xref ref-type="bibr" rid="B58">Stoll et al., 2025</xref>; <xref ref-type="bibr" rid="B66">Xie, 2025</xref>).</p>
<p>The cochlea&#x02014;the sensory organ of hearing in which sound vibrations are converted into electrical signals&#x02014;may already contribute to selective auditory attention as well. This fascinating organ spatially decomposes a complex sound such as speech into its individual frequency components, following a tonotopic map in which high frequencies are detected near the organ&#x00027;s base, and lower frequencies progressively further toward the apex (<xref ref-type="bibr" rid="B50">Robles and Ruggero, 2001</xref>; <xref ref-type="bibr" rid="B47">Reichenbach and Hudspeth, 2014</xref>).</p>
<p>In addition, the cochlea possesses an active process through which it amplifies weak sounds (<xref ref-type="bibr" rid="B10">Dallos, 1992</xref>; <xref ref-type="bibr" rid="B23">Hudspeth, 2014</xref>). This mechanical amplification is provided by outer hair cells and can be reduced through activation of the medial olivocochlear (MOC) fibers&#x02014;efferent connections that innervate the outer hair cells (<xref ref-type="bibr" rid="B20">Guinan, 2006</xref>; <xref ref-type="bibr" rid="B29">Lopez-Poveda, 2018</xref>). Because each MOC fiber is tuned to a narrow frequency band, and because the innervation of the cochlea by these fibers displays a tonotopic arrangement, the gain of the active process can potentially be regulated by the brain in a frequency-dependent manner.</p>
<p>The active process is accompanied by a nonlinear response that gives rise to otoacoustic emissions (OAEs). These can be recorded from the ear canal and serve as a non-invasive measure of the amplification gain. OAEs have indeed been used to assess the contribution of the cochlea to selective attention, although with inconclusive results (<xref ref-type="bibr" rid="B33">Meric and Collet, 1992</xref>; <xref ref-type="bibr" rid="B38">Michie et al., 1996</xref>; <xref ref-type="bibr" rid="B63">Walsh et al., 2015</xref>; <xref ref-type="bibr" rid="B55">Smith et al., 2012</xref>; <xref ref-type="bibr" rid="B4">Beim et al., 2018</xref>; <xref ref-type="bibr" rid="B17">Francis et al., 2018</xref>; <xref ref-type="bibr" rid="B65">Wittekindt et al., 2014</xref>). A limitation of these studies was that they either did not involve naturalistic sounds such as speech that facilitate attention, or that they elicited OAEs in a manner that was not directly related to the auditory signal that the participants were asked to attend.</p>
<p>Computational models have shown that selective attention to a speech signal in noise may be supported by frequency-specific modulation of the cochlear active process (<xref ref-type="bibr" rid="B35">Messing et al., 2009</xref>; <xref ref-type="bibr" rid="B9">Clark et al., 2012</xref>). Most parts of speech are voiced, with the energy carried by the fundamental frequency and its many higher harmonics. The lower harmonics, up to the 10th, can be spatially resolved in the cochlea, that is, they cause peaks at significantly distinct locations (<xref ref-type="bibr" rid="B6">Bernstein and Oxenham, 2003</xref>; <xref ref-type="bibr" rid="B41">Plack, 2005</xref>; <xref ref-type="bibr" rid="B37">Micheyl and Oxenham, 2007</xref>; <xref ref-type="bibr" rid="B7">Carcagno and Plack, 2011</xref>).</p>
<p>A compelling hypothesis is that the cochlear amplifier selectively enhances the resolved harmonics of a target speech signal and suppresses the spectral bands that lie in between. This mechanism would thus reduce background noise already at the level of cochlear activity. Because unresolved harmonics cannot be spatially differentiated in the cochlea, this mechanism should not be able to operate for these.</p>
<p>Here, we set out to test this hypothesis. We employed distortion product otoacoustic emissions that were evoked by certain higher harmonics of the voiced parts of speech (speech-like DPOAEs). As the fundamental frequency of natural speech varies over time, the stimuli used to generate speech-like DPOAEs, as well as the DPOAEs themselves, were not pure tones, but instead had instantaneous frequencies that varied over time in proportion to the fundamental frequency of the source signal. The amplitude of the stimuli varied as well, in particular, it was zero during voiceless parts of the speech signal or during silences. We recently developed and corroborated this approach (<xref ref-type="bibr" rid="B51">Saiz-Al&#x000ED;a et al., 2021</xref>).</p>
<p>We elicited and recorded speech-like DPOAEs from one ear while two competing talkers were presented to the contralateral ear. Subjects were instructed to attend either one of the two talkers or to read a text in front of them (visual attention). We then evaluated how the speech-like DPOAEs, in particular those related to resolved and unresolved harmonics, were affected by the attentional focus.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec>
<label>2.1</label>
<title>Experimental design</title>
<p>We utilized a single- and a competing-speaker paradigm. In the single-speaker recordings, an audiobook either spoken by a female or by a male voice was presented to the right ear of the participant. Subjects were asked to focus their attention on the single voice.</p>
<p>In the competing-speaker scenario, two audiobooks, one spoken by a woman and the other by a man, were added together and presented to each subject&#x00027;s right ear (<xref ref-type="fig" rid="F1">Figure 1</xref>). Subjects were then instructed to attend either the female or the male voice. In the following, we refer to these two attentional conditions as attended female voice (Att. F) and attended male voice (Att. M). To test effects of intermodal attention, a visual distractor was introduced as well and attended when prompted; participants then read a story displayed in segments on a screen while ignoring both audio streams. This attentional condition will be referred to as attended visual distractor (Att. V).</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Experimental setup. Two audiobooks, one spoken by a woman with fundamental frequency <inline-formula><mml:math id="M1"><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and one narrated by a man with fundamental frequency <inline-formula><mml:math id="M2"><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, were simultaneously presented to the right ear. Speech-like DPOAEs were recorded from the left ear. They were elicited by four pairs of waveforms. 
The first stimulus pair, <italic>M</italic><sub>res</sub>, consisted of two waveforms <inline-formula><mml:math id="M3"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>6</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M4"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>8</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> based on resolved harmonics of the male voice [frequencies <inline-formula><mml:math id="M5"><mml:mn>6</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M6"><mml:mn>8</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>]. 
The second stimulus pair, <italic>F</italic><sub>res</sub>, comprised two waveforms <inline-formula><mml:math id="M7"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>7</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M8"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>9</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> following resolved harmonics of the female voice [frequencies <inline-formula><mml:math id="M9"><mml:mn>7</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M10"><mml:mn>9</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>]. 
The third and fourth stimulus pair, <italic>M</italic><sub>unres</sub> and <italic>F</italic><sub>unres</sub>, were waveforms <inline-formula><mml:math id="M11"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M12"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>18</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> resp. <inline-formula><mml:math id="M13"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M14"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>18</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> derived from unresolved harmonics of the male resp. 
female voice [frequencies <inline-formula><mml:math id="M15"><mml:mn>15</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M16"><mml:mn>18</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> resp. <inline-formula><mml:math id="M17"><mml:mn>15</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M18"><mml:mn>18</mml:mn><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>]. 
Participants directed their attention either to one of the two talkers or visually to a continuously presented story on a screen.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1756386-g0001.tif">
<alt-text content-type="machine-generated">Diagram illustrating the experimental setup. It shows a head directed at a screen and the inputs to the left and right ear. Exemplary waveforms for the stimuli derived from the male and female voice and a waveform representing the recording are shown on the left, waveforms representing the audiobooks read by the male and female voice are shown on the right</alt-text>
</graphic>
</fig>
<p>Speech-like DPOAEs were elicited by waveforms derived from the speech signals. To enable clean simultaneous measurement of four different speech-like DPOAEs, these were evoked and measured from the contralateral, i.e., the left, ear.</p>
<p>The presentation of the audiobooks together with the recordings of the speech-like DPOAEs was segmented into two-minute trials, each followed by three comprehension questions and a rating of perceived mental effort to ensure task engagement. For the auditory conditions (attending the female or the male voice), mental effort was equivalent to the listening effort. In order to include assessment of the effort for the third, i.e., the visual, condition, the term mental effort was chosen.</p>
</sec>
<sec>
<label>2.2</label>
<title>Participants</title>
<p>Speech-like DPOAE measurements were conducted with <italic>N</italic> = 40 participants (21 female, 19 male), aged 18&#x02013;31 years (mean age &#x000B1; SD: 25 &#x000B1; 3 years). Inclusion criteria were right-handedness, native German proficiency, and the absence of neurological or hearing impairments. One subject was excluded since their comprehension score was below chance level, and another due to a faulty microphone recording. Thus, 38 participants were included in the final analysis.</p>
<p>All procedures were approved by the ethics board of the University Hospital Erlangen (registration 133-12B) and were conducted in accordance with institutional regulations. Informed consent was obtained from all participants.</p>
</sec>
<sec>
<label>2.3</label>
<title>Speech signals</title>
<p>Two audiobooks were synthesized with a text-to-speech engine (ElevenLabs, U.S.A.) using the voice &#x0201C;Matilda&#x0201D; for the female speaker and the voice &#x0201C;Brian&#x0201D; for the male one (<xref ref-type="bibr" rid="B13">ElevenLabs, n.d.</xref>). We chose voice parameters that yielded a large separation between the fundamental frequencies of the two voices, which resulted in an average fundamental frequency of <inline-formula><mml:math id="M19"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>195</mml:mn><mml:mo>&#x000B1;</mml:mo><mml:mn>40</mml:mn></mml:math></inline-formula> Hz for the female voice and <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>90</mml:mn><mml:mo>&#x000B1;</mml:mo><mml:mn>20</mml:mn></mml:math></inline-formula> Hz for the male voice (mean &#x000B1; SD).</p>
<p>The texts for the audiobooks were taken from &#x0201C;Eine Frau erlebt die Polarnacht&#x0201D; (<xref ref-type="bibr" rid="B49">Ritter, 2016</xref>) (Book A) and from &#x0201C;Darum&#x0201D; (<xref ref-type="bibr" rid="B19">Glattauer, 2018</xref>) (Book B). Each text was synthesized with both the female and the male voice.</p>
<p>The amplitudes of the digital waveforms representing the speech signals were normalized and scaled such that the root-mean-square amplitude was (2.0 &#x000B1; 0.1) &#x000D7; 10<sup>&#x02212;3</sup> for both the male- and female-spoken texts. Using PsychoPy (<xref ref-type="bibr" rid="B39">Peirce et al., 2019</xref>), the presentation level of the speech stimuli was adjusted such that it reached an average sound pressure level of 37 dB SPL over the two-minute segments.</p>
</sec>
<sec>
<label>2.4</label>
<title>Visual distractor</title>
<p>The visual distractor consisted of text excerpts from a third book, &#x0201C;Frau Ella&#x0201D; (<xref ref-type="bibr" rid="B3">Beckerhoff, 2022</xref>) (Book C). The text was rendered as a video in which short paragraphs appeared word by word at a comfortable reading pace and were displayed centrally on a computer screen.</p>
</sec>
<sec>
<label>2.5</label>
<title>Stimuli for eliciting speech-like DPOAEs</title>
<p>Pure-tone DPOAEs were elicited by two primary frequencies, <italic>f</italic><sub>1</sub> and <italic>f</italic><sub>2</sub>. The lower-sideband cubic distortion product 2<italic>f</italic><sub>1</sub>&#x02212;<italic>f</italic><sub>2</sub> is the strongest, and is maximal at a ratio of <italic>f</italic><sub>2</sub>/<italic>f</italic><sub>1</sub>&#x02248;1.2. For its measurement we employed <italic>f</italic><sub>1</sub> &#x0003D; 1 kHz and <italic>f</italic><sub>2</sub> &#x0003D; 1.2 kHz.</p>
<p>The stimuli to elicit speech-like DPOAEs were computed using an approach that we developed recently (<xref ref-type="bibr" rid="B51">Saiz-Al&#x000ED;a et al., 2021</xref>).</p>
<p>For the voiced segments of each speech signal, we first computed the fundamental waveform <italic>w</italic><sub>0</sub>(<italic>t</italic>), which follows the time-varying fundamental frequency <italic>f</italic><sub>0</sub>(<italic>t</italic>) of the source signal. This was achieved by applying a zero-phase, sixth-order IIR bandpass filter centered on the mean fundamental frequency <inline-formula><mml:math id="M21"><mml:mover accent="true"><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>, with corner frequencies of &#x000B1;0.5 standard deviations around <inline-formula><mml:math id="M22"><mml:mover accent="true"><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>. The mean fundamental frequency was estimated using the probabilistic YIN algorithm implemented in the librosa library (<xref ref-type="bibr" rid="B32">McFee et al., 2025</xref>). The fundamental waveform <italic>w</italic><sub>0</sub>(<italic>t</italic>) was normalized by z-scoring.</p>
<p>Based on <italic>w</italic><sub>0</sub>(<italic>t</italic>), waveforms for the harmonic overtones <italic>n</italic> and <italic>m</italic> (<italic>n</italic>&#x0003C;<italic>m</italic>) were constructed such that their instantaneous frequencies equaled <italic>nf</italic><sub>0</sub>(<italic>t</italic>) and <italic>mf</italic><sub>0</sub>(<italic>t</italic>), respectively. To do so, we computed the analytic representation of the fundamental waveform using the Hilbert transform <italic>H</italic>[<italic>w</italic><sub>0</sub>(<italic>t</italic>)]:</p>
<disp-formula id="EQ1"><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>i</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>The fundamental waveform can then be expressed as the real part of the complex signal:</p>
<disp-formula id="EQ2"><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x0211C;</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x003A6;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>in which <inline-formula><mml:math id="M25"><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>|</mml:mo></mml:math></inline-formula> denotes the signal amplitude, and <inline-formula><mml:math id="M26"><mml:mo>&#x003A6;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mo class="qopname">arg</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></inline-formula> specifies its instantaneous phase.</p>
<p>Harmonic waveforms <italic>w</italic><sub><italic>n</italic></sub>(<italic>t</italic>) and <italic>w</italic><sub><italic>m</italic></sub>(<italic>t</italic>) were obtained by multiplying the phase &#x003A6;(<italic>t</italic>) by the desired harmonic number and taking the real part:</p>
<disp-formula id="EQ3"><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x0211C;</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x003A6;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mtext class="textrm" mathvariant="normal">and&#x000A0;</mml:mtext><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x0211C;</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x003A6;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>The instantaneous frequencies of the two elicitor waveforms are thus <italic>nf</italic><sub>0</sub>(<italic>t</italic>) and <italic>mf</italic><sub>0</sub>(<italic>t</italic>). Consequently, the lower-sideband cubic distortion product they generate exhibits an instantaneous frequency of (2<italic>n</italic>&#x02212;<italic>m</italic>)<italic>f</italic><sub>0</sub>(<italic>t</italic>), corresponding to the waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>). The resulting speech-like DPOAE was identified by cross-correlating <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>) with the microphone recording.</p>
<p>To assess attentional modulation of cochlear activity at both resolved and unresolved harmonics, we designed four pairs of stimulus waveforms: (1) the stimulus <italic>F</italic><sub>res</sub>: two waveforms <inline-formula><mml:math id="M28"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>7</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M29"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>9</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> derived from resolved harmonics of the female voice, (2) the stimulus <italic>F</italic><sub>unres</sub>: two waveforms <inline-formula><mml:math id="M30"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M31"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>18</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> derived from unresolved harmonics of the female voice, (3) the stimulus <italic>M</italic><sub>res</sub>: two waveforms <inline-formula><mml:math id="M32"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>6</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M33"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>8</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> derived from resolved harmonics of the male voice, and (4) the stimulus <italic>M</italic><sub>unres</sub>: two waveforms <inline-formula><mml:math id="M34"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M35"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>18</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> derived from unresolved harmonics of the male voice (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>The harmonic numbers <italic>n</italic> and <italic>m</italic> for the different stimuli, with the harmonic index 2<italic>n</italic>&#x02212;<italic>m</italic> of the resulting lower-sideband cubic distortion product and the ratio <italic>m</italic>/<italic>n</italic>. <inline-formula><mml:math id="M38"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="M39"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M40"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>n</mml:mi><mml:mo>-</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula> denote the average frequencies of the waveforms <italic>w</italic><sub><italic>n</italic></sub>(<italic>t</italic>), <italic>w</italic><sub><italic>m</italic></sub>(<italic>t</italic>) and <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>) as (mean &#x000B1; SD) across all 2 min segments.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Stimulus type</bold></th>
<th valign="top" align="center"><bold><italic>n</italic></bold></th>
<th valign="top" align="center"><bold><italic>m</italic></bold></th>
<th valign="top" align="center"><bold>2<italic>n</italic>&#x02212;<italic>m</italic></bold></th>
<th valign="top" align="center"><bold><italic>m</italic>/<italic>n</italic></bold></th>
<th valign="top" align="center"><bold><inline-formula><mml:math id="M41"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula> (Hz)</bold></th>
<th valign="top" align="center"><bold><inline-formula><mml:math id="M42"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula> (Hz)</bold></th>
<th valign="top" align="center"><bold><inline-formula><mml:math id="M43"><mml:msub><mml:mrow><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>n</mml:mi><mml:mo>-</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:math></inline-formula> (Hz)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>F</italic><sub>res</sub></td>
<td valign="top" align="center">7</td>
<td valign="top" align="center">9</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">1.29</td>
<td valign="top" align="center">1,360 &#x000B1; 73</td>
<td valign="top" align="center">1,750 &#x000B1; 94</td>
<td valign="top" align="center">970 &#x000B1; 50</td>
</tr>
<tr>
<td valign="top" align="left"><italic>F</italic><sub>unres</sub></td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">18</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.2</td>
<td valign="top" align="center">2,910 &#x000B1; 158</td>
<td valign="top" align="center">3,500 &#x000B1; 190</td>
<td valign="top" align="center">2,320 &#x000B1; 120</td>
</tr>
<tr>
<td valign="top" align="left"><italic>M</italic><sub>res</sub></td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">8</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">1.33</td>
<td valign="top" align="center">530 &#x000B1; 37</td>
<td valign="top" align="center">710 &#x000B1; 49</td>
<td valign="top" align="center">350 &#x000B1; 22</td>
</tr>
<tr>
<td valign="top" align="left"><italic>M</italic><sub>unres</sub></td>
<td valign="top" align="center">15</td>
<td valign="top" align="center">18</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">1.2</td>
<td valign="top" align="center">1,330 &#x000B1; 91</td>
<td valign="top" align="center">1,600 &#x000B1; 109</td>
<td valign="top" align="center">1,050 &#x000B1; 67</td>
</tr></tbody>
</table>
</table-wrap>
<p>Importantly, the terms &#x0201C;resolved&#x0201D; and &#x0201C;unresolved&#x0201D; refer here to the harmonic structure of the speech signal presented to the right ear. In contrast, in the left, contralateral, ear used for speech-like DPOAE recording, the two waveforms constituting each stimulus pair [e.g., <inline-formula><mml:math id="M36"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M37"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>18</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> representing the 15th and 18th harmonic overtones of the speech signal, respectively] are separated by a similar frequency ratio as the lower-order pairs and are therefore comparably resolved by cochlear filtering. Thus, the distinction between resolved and unresolved stimuli does not pertain to the separability of the harmonic numbers of the stimulus waveforms themselves, but to whether the surrounding harmonic components in the speech signal are resolved or not. For the unresolved harmonics, multiple adjacent harmonics fall within a single cochlear filter, resulting in broad excitation of the cochlear region corresponding to the DPOAE generation site and, consequently, a different pattern of MOC-mediated modulation.</p>
<p>We employed different harmonics <italic>n</italic> and <italic>m</italic> for the resolved harmonics of the female and the male voice in order to avoid unwanted correlations between the stimulus waveforms and the DPOAE waveforms. Moreover, we tried to achieve a ratio of <italic>m</italic>/<italic>n</italic>&#x02248;1.2 for optimal distortion product generation. For unresolved harmonics, the frequency spacing was sufficiently large to allow the same harmonic pairs to be used for male- and female-related stimuli without inducing unwanted correlations between the speech-like DPOAE and stimulus waveforms.</p>
<p>As a consequence of these design choices, the frequency ranges of the resolved harmonics of the female voice partially overlap with those of the unresolved harmonics of the male voice (<xref ref-type="table" rid="T1">Table 1</xref>). This overlap arises from the different fundamental frequencies of the male and female voices and the need to balance harmonic resolvability with the constraints mentioned above. The implications of this spectral overlap for the interpretation of the results are addressed in the Discussion.</p>
<p>For the simultaneous presentation of all four harmonic pairs, all waveforms corresponding to the lower harmonic number <italic>n</italic> were summed to form the waveform <italic>W</italic><sup>(1)</sup>(<italic>t</italic>), and all waveforms corresponding to the higher harmonic number <italic>m</italic> were summed separately to form the waveform <italic>W</italic><sup>(2)</sup>(<italic>t</italic>). These two waveforms were delivered to the ear canal via two independent loudspeakers.</p>
</sec>
<sec>
<label>2.6</label>
<title>Experimental setup</title>
<p>Experiments were conducted in a sound-proof, semi-anechoic chamber. Stimulus presentation and data acquisition were automated through PsychoPy (<xref ref-type="bibr" rid="B39">Peirce et al., 2019</xref>). Instructions were displayed on a screen; responses were given via mouse click.</p>
<p>The sound stimuli were presented at 44.1 kHz using a high-performance sound card (RME Fireface 802) and delivered through an extended-bandwidth otoacoustic measurement system (ER10X, Etymotic Research, U.S.A.), equipped with one microphone and three speakers per ear. Custom ear tips ensured optimal probe fit. Audiobooks were presented to the right ear while stimulus presentation and speech-like DPOAE recordings were conducted in the left ear. For each stimulus, the two waveforms <italic>W</italic><sup>(1)</sup>(<italic>t</italic>) and <italic>W</italic><sup>(2)</sup>(<italic>t</italic>), each consisting of the sum of the four harmonic waveforms corresponding to the lower and higher harmonic numbers <italic>n</italic> and <italic>m</italic> respectively, were played through different speakers to avoid hardware-induced distortion.</p>
<p>Stimuli were delivered directly into the ear canal. The presentation level was adjusted so that, when averaged across each trial of approximately two minutes, the resulting mean sound pressure level in the ear canal was 37 dB SPL. All participants reported this level as comfortable. It was intentionally kept low to avoid eliciting the middle-ear muscle reflex (<xref ref-type="bibr" rid="B25">Jennings, 2021</xref>; <xref ref-type="bibr" rid="B61">Trevino et al., 2023</xref>).</p>
</sec>
<sec>
<label>2.7</label>
<title>Experimental routine</title>
<p>Each story segment lasted about two minutes. After each such two-minute trial, participants answered three comprehension questions and rated the perceived mental effort on a 13-point Likert scale.</p>
<p>The experiment began with a two-minute pure-tone DPOAE measurement to verify DPOAE detectability. DPOAEs could be recorded in all participants.</p>
<p>Next, speech-like DPOAEs were recorded in a single-speaker scenario, using either the male or the female voice in isolation. Speech-like DPOAEs to both the resolved and the unresolved harmonics of the corresponding voice were measured simultaneously to confirm that both speech-like DPOAEs could be measured concurrently.</p>
<p>The main part of the experiment consisted of a competing-speaker scenario in which both the female and the male voice were presented simultaneously. Participants were instructed to direct their attention either to the female speaker (Att. F), to the male speaker (Att. M), or to the visual task (Att. V). The target audio always contained the story of Book A, spoken either by the male or the female voice, allowing participants to follow a continuing story when switching attention between speakers. In the Att. V condition, participants ignored both audio streams and focused on reading Book C, presented word-by-word on a monitor. During the two auditory-attention conditions (Att. F and Att. M), the text from Book C was also shown on the monitor; however, participants were allowed to choose whether to look at the text directly or at another fixed point on the screen, depending on whether they found looking at the text too distracting from the auditory task. During each of the conditions, the waveforms <italic>W</italic><sup>(1)</sup>(<italic>t</italic>) and <italic>W</italic><sup>(2)</sup>(<italic>t</italic>) comprising the four stimulus pairs <italic>F</italic><sub>res</sub>, <italic>F</italic><sub>unres</sub>, <italic>M</italic><sub>res</sub>, and <italic>M</italic><sub>unres</sub> were presented to elicit the four respective speech-like DPOAEs.</p>
<p>To verify that the measurement equipment did not contribute to the speech-like DPOAEs, out-of-ear control measurements were conducted. The probe was placed outside the ear canal in the center of the recording room, with all reflective surfaces avoided to minimize acoustic feedback. No speech-like DPOAEs emerged in that case.</p>
</sec>
<sec>
<label>2.8</label>
<title>Analysis of speech-like DPOAEs</title>
<p>Hardware-induced delays were estimated per trial by cross-correlating the stimulus waveforms with the microphone recording. The recordings were corrected for the delays before further analysis.</p>
<p>Speech-like DPOAEs were computed by cross-correlating each of the four speech-like DPOAE waveforms <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>) with the microphone recording. To compensate for potential phase shifts between the otoacoustic emission and the waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>), we computed the complex cross-correlation. Its real part corresponds to the correlation between the real component of the analytic representation of <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>) and the microphone signal, whereas the imaginary part corresponds to the correlation with the imaginary component of the analytic representation. The envelope of the complex cross-correlation was then obtained as the absolute value of the resulting complex-valued correlation.</p>
<p>Grand averages were computed by averaging the envelopes of the cross-correlations across all two-minute trials, distinguishing between the three attentional conditions and the four stimulus types. A peak in the grand average was considered significant if it exceeded the maximum of the noise. The noise level was determined from the values of the envelope of the correlation coefficients at time lags of &#x02212;750 to &#x02212;70 ms and from 70 to 750 ms, that is, at delays at which no speech-like DPOAEs should occur.</p>
<p>To detect speech-like DPOAEs at the level of individual two-minute trials, the expected window for peak delays was defined as 1 &#x000B1; 3 ms (mean &#x000B1; SD), based on the grand averages of the stimuli <italic>F</italic><sub>res</sub> and <italic>F</italic><sub>unres</sub>. Stimuli corresponding to the male voice were excluded from the determination of this window: the grand average for the <italic>M</italic><sub>res</sub> stimulus did not yield reliable results, and we wanted to prevent an imbalance between resolved and unresolved harmonics, so we disregarded the stimulus <italic>M</italic><sub>unres</sub> as well. A peak from an individual trial within this window was considered significant if it exceeded the 97th percentile of the noise. The percentage of significant trials was then computed for each stimulus type.</p>
<p>To compare speech-like DPOAEs across attentional conditions, the cross-correlation coefficients at the grand average peak delay were extracted for each two-minute trial, matched per stimulus type and attentional condition, and averaged per participant. Single-speaker comparisons used unpaired <italic>t</italic>- or Mann&#x02013;Whitney <italic>U</italic> tests. For the data obtained from the competing-speaker scenario, paired <italic>t</italic>-tests or Wilcoxon signed-rank tests were used, with outlier exclusion when justified.</p>
<p>To evaluate differences between speech-like DPOAEs evoked by resolved and unresolved harmonics, peak delays per individual two-minute trials were extracted for all significant peaks, matched per stimulus type and condition and averaged per participant. Statistical comparisons used unpaired <italic>t</italic>- or Mann&#x02013;Whitney <italic>U</italic> tests.</p>
<p>All <italic>p</italic>-values were corrected for multiple comparisons using the False Discovery Rate correction.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec>
<label>3.1</label>
<title>Comprehension scores and mental effort</title>
<p>Speech comprehension was quantified as the percentage of correct answers, and mental effort was rated using mean values on a Likert scale from 1 to 13 (low to high). Values are given as (mean &#x000B1; SD). Statistical significance was assessed using Wilcoxon signed-rank tests.</p>
<p>In the single-speaker measurements, comprehension scores were high: 97 &#x000B1; 13% for the female voice and 96 &#x000B1; 14% for the male voice, with no significant difference between them.</p>
<p>The comprehension scores were slightly lower in the competing-speaker condition: 95 &#x000B1; 6% when attending the female voice, 91 &#x000B1; 8% for the male voice, and 94 &#x000B1; 7% when reading the text. Again, no significant differences were found, suggesting that all conditions were similarly comprehensible.</p>
<p>Regarding mental effort, the female and male voices were perceived as similarly demanding in the single-speaker condition, with ratings of 4.0 &#x000B1; 2.3 vs. 4.4 &#x000B1; 2.4 and no significant difference.</p>
<p>In the competing-speaker condition, perceived effort varied significantly depending on the attentional focus. Attending the visual distractor was rated easiest at 5.7 &#x000B1; 2.2; lower than when attending the male voice (<italic>p</italic> &#x0003C; 0.001) and lower than when attending the female voice (<italic>p</italic> &#x0003C; 0.05). In contrast, attending the male voice was rated hardest at 7.3 &#x000B1; 1.7; with <italic>p</italic> &#x0003C; 0.001 when compared to other conditions. Attending the female voice fell in between at a value of 6.5 &#x000B1; 1.8.</p>
</sec>
<sec>
<label>3.2</label>
<title>Measurement of speech-like DPOAEs</title>
<p>To measure a particular speech-like DPOAE, we computed a waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>) that corresponded to the lower-sideband cubic distortion product of the pair of harmonics that was used for stimulation. As an example, for the stimulus <italic>F</italic><sub>res</sub> we utilized waveforms <inline-formula><mml:math id="M44"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>7</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M45"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>9</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, yielding <inline-formula><mml:math id="M46"><mml:msubsup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>5</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> as the lower sideband cubic distortion product.</p>
<p>The speech-like DPOAE waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub> was then cross-correlated with the microphone recording (<xref ref-type="fig" rid="F2">Figures 2A</xref>&#x02013;<xref ref-type="fig" rid="F2">D</xref>). To obtain a sense of the expected shape of the cross-correlation, we first computed the auto-correlation of a waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub> (<xref ref-type="fig" rid="F2">Figure 2A</xref>). As expected, this showed a single peak at zero latency, which was particularly apparent in the autocorrelation&#x00027;s envelope.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Recording of speech-like DPOAEs. Speech-like DPOAEs were measured through cross-correlating the expected DPOAE waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>(<italic>t</italic>) with the microphone signal. <bold>(A)</bold> The auto-correlation of the speech-like DPOAE waveform <italic>w</italic><sub>5</sub>(<italic>t</italic>) of the <italic>F</italic><sub>res</sub> stimulus gives an estimate for the morphology of the expected peak. The envelope (purple) of the auto-correlation (black) peaks at 0 ms, as required for an autocorrelation. <bold>(B, C)</bold> The correlation of the waveform <italic>w</italic><sub>5</sub>(<italic>t</italic>) of the <italic>F</italic><sub>res</sub> stimulus with the microphone recording, for two individual trials, exhibited a peak at a short delay (dashed line) that showed the presence of the speech-like DPOAE at this delay. <bold>(D)</bold> In contrast, if no peak emerged, the speech-like DPOAE could not be measured for that particular trial. The exemplary recordings show results from a single two-minute trial in the single-speaker scenario, for the <italic>F</italic><sub>res</sub> stimulus and for different subjects. <bold>(E, F)</bold> Speech-like DPOAEs could be detected in most trials both in the single-speaker and in the competing-speaker scenario, except for the stimulus <italic>M</italic><sub>res</sub>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1756386-g0002.tif">
<alt-text content-type="machine-generated">Figure containing six panels: Panels A to D are line graphs showing correlation versus delay in milliseconds, with A as an auto-correlation, B to D as cross-correlations labeled with delay times of 2.6 and 1.3 milliseconds for B and C, and no marked value for D. Panels E and F are bar charts comparing percentages for F_res, F_unres, M_res, and M_unres groups under single speaker and competing speaker conditions, with higher values for M_unres in both conditions and generally lower values for M_res, especially in panel F.</alt-text>
</graphic>
</fig>
<p>We then computed the envelope of the cross-correlation of the waveform <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub> with the microphone recording to obtain, in most subjects, a single peak at a delay between 0 and 3 ms. A peak can be interpreted as a successful measurement of a speech-like DPOAE, with a delay that corresponds to that of the peak. Examples of individual trials with large and moderate peak amplitudes, as well as a trial without a significant peak, are shown in <xref ref-type="fig" rid="F2">Figures 2B</xref>&#x02013;<xref ref-type="fig" rid="F2">D</xref>, illustrating the variability in speech-like DPOAE morphology. These examples further show that the width of the cross-correlation parallels that of the autocorrelation, indicating that the temporal resolution is limited by the autocorrelation, not by additional temporal jitter in the emissions.</p>
<p>To assess how well the speech-like DPOAEs for the different stimuli could be measured, we computed the percentage of trials per stimulus type that yielded significant peaks in the cross-correlation. In the single-speaker scenario, the three stimuli <italic>F</italic><sub>res</sub>, <italic>F</italic><sub>unres</sub>, and <italic>M</italic><sub>unres</sub> all yielded significant speech-like DPOAEs in over 85% of single-speaker trials (<xref ref-type="fig" rid="F2">Figure 2E</xref>). However, the stimulus <italic>M</italic><sub>res</sub> performed notably worse, with only 35% of trials showing a significant speech-like DPOAE.</p>
<p>A similar pattern emerged in the competing-speaker scenario (<xref ref-type="fig" rid="F2">Figure 2F</xref>): the stimuli <italic>F</italic><sub>unres</sub> and <italic>M</italic><sub>unres</sub> performed well with speech-like DPOAEs detectable in about 70% of the trials, the stimulus <italic>F</italic><sub>res</sub> slightly lower, with about 50% of trials yielding significant speech-like DPOAEs, and <italic>M</italic><sub>res</sub> remained poor with only about 12% of trials producing a significant measurement.</p>
<p>To further compare the speech-like DPOAEs evoked by the different stimuli, we averaged the envelopes of the complex cross-correlations across trials and subjects, yielding grand averages.</p>
<p>In the single-speaker scenario, all stimulus types produced significant peaks, but with considerable variation in the amplitudes, that is, the values of the envelopes of the cross-correlations at the peak (<xref ref-type="fig" rid="F3">Figure 3E</xref>). The speech-like DPOAE for the stimulus <italic>M</italic><sub>res</sub>, with an average frequency of 350 Hz, had the smallest amplitude. The highest amplitudes emerged for the stimuli <italic>F</italic><sub>res</sub> and <italic>M</italic><sub>unres</sub>, both of which produced speech-like DPOAEs with average frequencies around 1 kHz.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Speech-like DPOAEs in the single-speaker scenario. <bold>(A&#x02013;D)</bold> Grand averages of the envelopes of the complex cross-correlations for the different stimuli. All peaks except the one for the <italic>M</italic><sub>res</sub> stimulus were clearly visible. The delays of the peaks are marked by dashed lines. <bold>(E)</bold> Speech-like DPOAE amplitudes varied significantly across stimulus types. They displayed an inverted U when plotted against the average frequency <inline-formula><mml:math id="M47"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>D</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> of the expected speech-like DPOAE waveforms <italic>w</italic><sub>2<italic>n</italic>&#x02212;<italic>m</italic></sub>, with the largest amplitudes emerging for speech-like DPOAEs with average frequencies around 1 kHz. <bold>(F)</bold> Peak delays of individual trials differed significantly across stimulus types. The variation in delays for the <italic>M</italic><sub>res</sub> stimulus was high due to the low number of significant peaks. Statistical significance is indicated as <sup>&#x0002A;</sup> (<italic>p</italic> &#x0003C; 0.05), <sup>&#x0002A;&#x0002A;</sup> (<italic>p</italic> &#x0003C; 0.01), <sup>&#x0002A;&#x0002A;&#x0002A;</sup> (<italic>p</italic> &#x0003C; 0.001).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1756386-g0003.tif">
<alt-text content-type="machine-generated">Scientific figure with four line graphs labeled A to D showing envelope cross-correlation versus delay for four conditions, each with a peak marked by a horizontal dashed line and corresponding delay in milliseconds. Panels E and F display boxplots comparing amplitude and delay, respectively, across four groups distinguished by color (purple, blue, green) and frequency in Hertz. Statistical significance is marked with asterisks. Color legend differentiates groups by F_res, F_unres, M_res, and M_unres.</alt-text>
</graphic>
</fig>
<p>The delays of the speech-like DPOAEs also varied across the four stimulus types (<xref ref-type="fig" rid="F3">Figure 3F</xref>). The shortest delay was observed for the <italic>F</italic><sub>unres</sub> stimulus at 0 ms, while the longest delay occurred for the <italic>M</italic><sub>res</sub> stimulus at 3.1 ms. For the <italic>M</italic><sub>res</sub> stimulus, peak delays showed substantial variability across participants. This variability was likely due to the low number of significant peaks for this stimulus type (<xref ref-type="fig" rid="F2">Figures 2E</xref>, <xref ref-type="fig" rid="F2">F</xref>), further underscoring the limited interpretability of the corresponding results.</p>
<p>Due to the small amplitude and the low rate of significant peaks observed, the stimulus <italic>M</italic><sub>res</sub> was excluded from further analysis.</p>
<p>For competing-speaker trials, the remaining stimuli (<italic>F</italic><sub>res</sub>, <italic>F</italic><sub>unres</sub>, <italic>M</italic><sub>unres</sub>) produced significant peaks in all three attentional conditions (<xref ref-type="fig" rid="F4">Figures 4A</xref>&#x02013;<xref ref-type="fig" rid="F4">I</xref>). While the delays of the peaks remained stable across attentional conditions, they continued to vary between stimulus types.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Attentional modulation of speech-like DPOAEs. <bold>(A&#x02013;I)</bold> Envelopes of the complex cross-correlations of speech-like DPOAEs evoked by the three different stimuli <italic>F</italic><sub>res</sub>, <italic>F</italic><sub>unres</sub>, and <italic>M</italic><sub>unres</sub>, in the three different attentional conditions attended female voice (Att. F), attended male voice (Att. M), and attended visual distractor (Att. V.). All showed clear peaks at short delays (dashed lines). <bold>(J&#x02013;L)</bold> Comparison of the peak amplitudes yielded attentional effects for the <italic>F</italic><sub>res</sub> stimulus and the <italic>M</italic><sub>unres</sub> stimulus, but not for the stimulus <italic>F</italic><sub>unres</sub>. <bold>(M&#x02013;O)</bold> The peak delays were not affected by the attentional focus, for any of the three stimulus types. Statistical significance is indicated as <sup>&#x0002A;</sup> (<italic>p</italic> &#x0003C; 0.05), <sup>&#x0002A;&#x0002A;</sup> (<italic>p</italic> &#x0003C; 0.01), <sup>&#x0002A;&#x0002A;&#x0002A;</sup> (<italic>p</italic> &#x0003C; 0.001).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1756386-g0004.tif">
<alt-text content-type="machine-generated">Nine line graphs labeled A to I in three columns show envelope cross-correlation versus delay for three attentional conditions (Att. F, Att. M, Att. V) and three stimulus types (F_res, F_unres, M_unres), distinguished by purple, blue, and green respectively. Boxplots J to O below compare amplitude and delay distributions for each stimulus type across attentional conditions, with statistical significance annotated.</alt-text>
</graphic>
</fig>
<p>Out-of-ear control measurements did not yield significant peaks in the grand average for any stimulus type in single- or competing-speaker trials, confirming the absence of measurable distortion products when the probe was placed outside the ear canal.</p>
</sec>
<sec>
<label>3.3</label>
<title>Attentional modulation of speech-like DPOAEs</title>
<p>To assess effects of attentional focus on the speech-like DPOAEs in the competing-speaker scenario, we characterized the latter through both the delay of the peak in the complex cross-correlation and the amplitude, that is, the value of the envelope of the complex cross-correlation at the grand average peak delay.</p>
<p>We first quantified the influence of attention on the amplitude of the emissions. We started with the stimulus <italic>F</italic><sub>res</sub>, that is, by assessing the speech-like DPOAEs evoked by resolved harmonics of the female voice (<xref ref-type="fig" rid="F4">Figure 4J</xref>). We found a significantly lower amplitude when the female speaker was attended (Att. F) than when the male speaker was attended (Att. M). The difference was highly statistically significant, with a <italic>p</italic>-value below 0.001 (<italic>p</italic> &#x0003D; 0.0003). The ratio of the amplitudes in the two conditions, Att. F. vs. Att. M., was 0.8, or &#x02013;2.2 dB.</p>
<p>The amplitude when attending the female speaker was also significantly lower than when reading the text, that is, when attention was focused on the visual modality, with a <italic>p</italic>-value below 0.001 (<italic>p</italic> &#x0003D; 0.0003, <xref ref-type="fig" rid="F4">Figure 4J</xref>). In this case, the amplitude ratio was 0.7, or &#x02013;2.6 dB. In contrast, no significant difference emerged when comparing the attended male voice (Att. M) condition to the attend visual condition (Att. V).</p>
<p>For the speech-like DPOAEs elicited by unresolved harmonics of the female voice, the stimulus <italic>F</italic><sub>unres</sub>, we did not observe any difference in amplitudes across the three attentional conditions (<xref ref-type="fig" rid="F4">Figure 4K</xref>).</p>
<p>For the male speaker, because the stimulus <italic>M</italic><sub>res</sub> did not give reliable speech-like DPOAEs, we could only assess the stimulus <italic>M</italic><sub>unres</sub>, which utilized unresolved harmonics of the male voice. Significant amplitude differences emerged between all three conditions (<xref ref-type="fig" rid="F4">Figure 4L</xref>). The amplitude when attending the male voice was significantly higher than in the other two conditions. The ratio between the amplitudes when attending the male voice and when attending the female one was 1.5, or 3.8 dB (<italic>p</italic> &#x0003D; 7 &#x000D7; 10<sup>&#x02212;7</sup>). In addition, the amplitude when ignoring the male voice, i.e., attending the female voice, was smaller than when reading the text (ratio of 0.8, or &#x02013;2 dB; <italic>p</italic> &#x0003D; 0.002). The amplitude difference between attending the male voice and reading the text was slightly less pronounced (<italic>p</italic> &#x0003D; 0.03) with a ratio of 1.2, or 1.7 dB.</p>
<p>For the delays of the speech-like DPOAEs, we did not find any significant differences between the three attentional conditions, for any of the three stimulus types (<xref ref-type="fig" rid="F4">Figures 4M</xref>&#x02013;<xref ref-type="fig" rid="F4">O</xref>).</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>This study examined the feasibility of simultaneously eliciting and measuring multiple speech-like DPOAEs in response to four pairs of stimuli composed of resolved and unresolved harmonics that were derived from two distinct voices. We further investigated whether the speech-like DPOAEs were modulated by selective auditory attention as well as by intermodal attention, and whether such attentional modulation differed between resolved and unresolved harmonics.</p>
<p>Our study shows that four speech-like DPOAEs in response to multiple stimuli pairs can be measured successfully. However, we observed considerable variability in the quality of the recorded responses between the different speech-like DPOAEs (cf. <xref ref-type="fig" rid="F2">Figure 2</xref>), which may reflect either inherent variability in the generation mechanisms of these otoacoustic emissions or differences in noise levels due to varying frequencies of the eliciting stimulus waveforms. In addition, not all significant speech-like DPOAE peaks shared the same morphology; variations were observed in peak width, SNR, peak height, and peak latency.</p>
<p>While a previous study from our group already demonstrated the feasibility of recording speech-like DPOAEs (<xref ref-type="bibr" rid="B51">Saiz-Al&#x000ED;a et al., 2021</xref>), here, we extend this approach by eliciting speech-like DPOAEs with four simultaneously presented harmonic pairs instead of only one, thereby increasing both the speech-related information in the stimulus and the resulting DPOAE signal, and more closely approximating natural speech.</p>
<p>Further, we selected the employed harmonics such that they were either clearly resolved or clearly unresolved, strengthening the interpretability of our conditions. Indeed, our previous work reported indications of attentional effects in speech-like DPOAEs, but also noted inconsistencies between the male and female voices (<xref ref-type="bibr" rid="B51">Saiz-Al&#x000ED;a et al., 2021</xref>). These may be attributed to differences in the harmonic structures employed, with the harmonics of the male voice in the previous study occupying the transition zone between resolved and unresolved harmonics.</p>
<p>Importantly, the auditory and visual stimuli remained statistically the same throughout the different attentional conditions. In particular, both audio streams and the visual text were presented concurrently with all four stimulus pairs for eliciting the speech-like DPOAEs. As a result, the only variable that differed between conditions was the participant&#x00027;s attentional focus. This design minimized the likelihood of systematic confounds such as differences in movement or task structure. Furthermore, even if the middle-ear muscle reflex should have been activated despite the low stimulus volumes, it would have affected all attentional conditions equally.</p>
<p>Regarding the behavioral results, we found that the comprehension scores did not differ significantly across the attentional conditions, confirming that task difficulty was well-matched. Small differences in self-reported mental effort were likely attributable to speaker-specific factors such as intonation.</p>
<sec>
<label>4.1</label>
<title>Frequency-dependency of speech-like DPOAEs</title>
<p>Our evaluation of the amplitude of the speech-like DPOAEs in the single-speaker scenario revealed a pronounced dependency on the emission frequency (<xref ref-type="fig" rid="F3">Figure 3E</xref>). Consistent with previous findings on pure-tone DPOAEs, amplitudes were strongest for stimulus frequencies around 1 kHz (<xref ref-type="bibr" rid="B43">Probst et al., 1991</xref>). Both lower and higher emission frequencies resulted in lower amplitudes. Because noise in electronics as well as in mechanical and acoustic systems increases at lower frequencies, the signal at the lowest emission, around 350 Hz for the stimulus <italic>M</italic><sub>res</sub>, could not be detected in most trials. This stimulus was therefore excluded from further analysis.</p>
</sec>
<sec>
<label>4.2</label>
<title>Attentional effects</title>
<p>We observed attentional modulation of the amplitudes of the speech-like DPOAEs for the stimuli <italic>F</italic><sub>res</sub> and <italic>M</italic><sub>unres</sub>, but not for <italic>F</italic><sub>unres</sub>.</p>
<p>The hypothesized differences between resolved and unresolved harmonics emerged clearly for the speech-like DPOAEs related to the female voice. Resolved harmonics, such as those employed for the stimulus <italic>F</italic><sub>res</sub>, produce spatially distinct excitation peaks along the basilar membrane (<xref ref-type="bibr" rid="B41">Pit, 2005</xref>; <xref ref-type="bibr" rid="B6">Bernstein and Oxenham, 2003</xref>). Vibrations at these locations can be selectively enhanced through higher gain of the active process, which can help to enhance the neural representation of target speech. In contrast, unresolved harmonics generate overlapping peaks along the basilar membrane, making such a spatial filter unfeasible. These considerations likely explain the presence of attentional modulation for the stimulus <italic>F</italic><sub>res</sub> together with its absence for the stimulus <italic>F</italic><sub>unres</sub>.</p>
<p>The direction of the observed attentional effect for speech-like DPOAEs elicited by the stimulus <italic>F</italic><sub>res</sub>, however, was unexpected: the resolved harmonics of the female voice counterintuitively caused <italic>lower</italic> amplitudes when attention was directed at the female voice versus when the male voice or the visual task was attended. Taken at face value, this result means that speech-like DPOAEs are weaker when a signal is attended than when it is ignored. This could suggest that the cochlea amplifies the harmonic structure of the target voice <italic>less</italic> than the background noise. Although unexpected, similar decreases have been reported in previous DPOAE studies (<xref ref-type="bibr" rid="B55">Smith et al., 2012</xref>; <xref ref-type="bibr" rid="B57">Srinivasan et al., 2012</xref>, <xref ref-type="bibr" rid="B56">2014</xref>) and occasionally elsewhere in the auditory system [e.g., decreases in neural tracking when intelligibility is high (<xref ref-type="bibr" rid="B21">Hauswald et al., 2022</xref>)].</p>
<p>One plausible explanation for the observed reduction in speech-like DPOAE amplitudes during selective attention is rooted in the known operating principles of the medial olivocochlear (MOC) efferent system. If we suppose that attending to a target signal enhances MOC activity, it consequently suppresses outer hair cell&#x02013;mediated cochlear amplification and thereby partially linearizes the basilar membrane input&#x02013;output function by reducing its compressive nonlinearity (<xref ref-type="bibr" rid="B30">Maison et al., 2001</xref>; <xref ref-type="bibr" rid="B20">Guinan, 2006</xref>). Such partial linearization has been proposed to improve speech intelligibility in noisy environments by suppressing background energy more uniformly, while preserving the salience of structured speech components (<xref ref-type="bibr" rid="B30">Maison et al., 2001</xref>; <xref ref-type="bibr" rid="B36">Micheyl and Collet, 1993</xref>). Importantly, however, distortion product otoacoustic emissions rely on strong local nonlinearities of the BM response. Consequently, MOC-induced linearization may lead to reduced DPOAE generation, even in conditions where perceptual performance is enhanced.</p>
<p>Still, it should be noted that the present paradigm does not involve classical speech-in-noise, but rather a speech-in-speech scenario in which both the target and the competing signal exhibit similar spectral structure and density. It therefore remains unclear to what extent mechanisms proposed for broadband noise suppression generalize to this more specialized listening situation.</p>
<p>In this context, it is instructive to consider previous studies that reported heterogeneous outcomes. <xref ref-type="bibr" rid="B65">Wittekindt et al. (2014)</xref> found reduced DPOAE levels during visual attention when comparing levels to a baseline value of inattention, but no change during auditory attention, whereas <xref ref-type="bibr" rid="B63">Walsh et al. (2015)</xref> observed attentional differences whose directions varied from subject to subject. These findings were challenged by <xref ref-type="bibr" rid="B17">Francis et al. (2018)</xref>, who observed that switching between states of attention and inattention produced changes in ear-canal noise that could mimic modulation of otoacoustic emissions. Our design avoids this confound: both auditory streams and the visual text were presented in all three attentional conditions, and only the focus of attention varied. We did not measure states of inattention. Thus, ear-canal noise and participant movement were expected to be equal across trials.</p>
<p>Other findings add to the mixed picture. <xref ref-type="bibr" rid="B4">Beim et al. (2018</xref>, <xref ref-type="bibr" rid="B5">2019)</xref> reported higher SFOAEs during auditory attention in one study but failed to replicate the effect in a second cohort. Earlier work by <xref ref-type="bibr" rid="B38">Michie et al. (1996)</xref> also found no attentional effect on tone-pip evoked OAEs. On the other hand, evidence for efferent modulation of otoacoustic emissions comes from a recent work showing that the predictability of tone sequences modulates DPOAE amplitudes depending on behavioral relevance (<xref ref-type="bibr" rid="B48">Riecke et al., 2020</xref>). Differences across study designs, types of OAEs, and susceptibility to noise likely contribute to these discrepancies.</p>
<p>The study most comparable to this is our earlier one on speech-like DPOAEs (<xref ref-type="bibr" rid="B51">Saiz-Al&#x000ED;a et al., 2021</xref>). It found a positive attentional modulation coefficient for the female voice, indicating larger DPOAEs when the corresponding voice was attended. However, the reported effect was modest (<italic>p</italic> &#x0003D; 0.02) compared with the clearer differences observed here (<italic>p</italic> &#x0003D; 0.0003), and the actual DPOAE amplitudes did not differ significantly. Key methodological differences to this study include the previous use of only one harmonic pair per voice, partial placement of male harmonics in the resolved&#x02013;unresolved transition region, and separate measurement blocks for attended and ignored states.</p>
<p>A possible origin of the weaker speech-like DPOAE when attending the corresponding voice may lie in the peculiar mechanics of the cochlea at low frequencies. Classical descriptions based on critical-layer absorption accurately capture basal, high-frequency processing (<xref ref-type="bibr" rid="B28">Lighthill, 1981</xref>; <xref ref-type="bibr" rid="B50">Robles and Ruggero, 2001</xref>; <xref ref-type="bibr" rid="B47">Reichenbach and Hudspeth, 2014</xref>), but several studies indicate that this framework may not apply straightforwardly at frequencies lower than 4 kHz (<xref ref-type="bibr" rid="B53">Shera et al., 2010</xref>; <xref ref-type="bibr" rid="B54">Siegel et al., 2005</xref>; <xref ref-type="bibr" rid="B60">Temchin et al., 2008</xref>; <xref ref-type="bibr" rid="B1">Ashmore, 2008</xref>).</p>
<p>In this low-frequency regime, previous studies have proposed an alternative mode of operation, including independent resonance of the active process and unidirectional coupling between the basilar membrane and outer hair cells (<xref ref-type="bibr" rid="B45">Reichenbach and Hudspeth, 2010</xref>, <xref ref-type="bibr" rid="B46">2011</xref>). Such mechanisms could suppress backward-propagating distortion products and may therefore provide an explanation for the direction of the attentional effects observed in our data.</p>
<p>Alternatively, the observed reduction of speech-like DPOAE amplitudes for the attended voice may result from phase-dependent interference between different DPOAE generation mechanisms, such as distortion and coherent reflection sources, whose relative phase relationships may be altered by attentional state or stimulus context. Changes in these relationships could lead to partial phase cancellation at the recording site, resulting in an apparent suppression of the measured emission without a corresponding reduction in local cochlear nonlinearity.</p>
<p>Another potential contributor is activation of the middle ear muscle reflex (MEMR). Sensitive wideband measures indicate that the MEMR can be elicited by acoustic stimuli at levels substantially lower than clinical thresholds, with detectable effects as low as 60 dB SPL in some individuals and measurement paradigms, and strength that varies with stimulus level and prior acoustic context (<xref ref-type="bibr" rid="B2">Baricevich et al., 2025</xref>). However, there is no clear evidence that MEMR activation occurs at the moderate stimulus levels used here (&#x0007E;37 dB SPL). Still, a contribution of weak or transient MEMR activity to the measured emission amplitudes cannot be fully excluded. However, as all attentional conditions in the present study shared the same acoustic stimuli, any MEMR activation driven by overall stimulus context or task engagement would be expected to occur in a statistically similar manner across conditions. Under this assumption, MEMR-related attenuation would primarily contribute to an overall modification of speech-like DPOAE amplitude rather than to the systematic, condition-specific differences observed here.</p>
<p>The speech-like DPOAEs evoked by the <italic>F</italic><sub>res</sub> stimuli did not differ between the Att. M condition (equivalent to ignoring the female voice) and the Att. V condition. This suggests that cochlear activity at the resolved harmonics of an ignored speaker&#x02014;in this case, the female voice&#x02014;remains the same in these two conditions. One possibility, in line with the above considerations regarding apical cochlear mechanics, is that the resolved harmonics of a target speaker are enhanced through the active process, but yield lower speech-like DPOAEs, e.g., due to the peculiarities of low-frequency cochlear mechanics. When this voice is not attended, the harmonics are less enhanced, independent of whether attention is directed toward another auditory stream or the visual signal.</p>
<p>Because unresolved harmonics do not produce distinct peaks along the basilar membrane, the attentional modulation observed for the <italic>M</italic><sub>unres</sub> stimulus was unexpected. However, it is important to note that the <italic>M</italic><sub>unres</sub> stimulus was presented against a background of resolved harmonics of the female voice (<xref ref-type="fig" rid="F5">Figure 5</xref>). We hypothesize that attention to the female voice enhances cochlear activity in the regions corresponding to its resolved harmonics (purple shading in <xref ref-type="fig" rid="F5">Figure 5</xref>). This enhanced activity will also affect the responses to certain unresolved harmonics of the male voice, namely those that peak in the same section of the basilar membrane. These included the waveforms used in the <italic>M</italic><sub>unres</sub> stimulus. When the attentional focus changed from the female to the male voice, we hypothesize that cochlear activity in these regions became smaller, again affecting the corresponding unresolved harmonics of the male voice, including the <italic>M</italic><sub>unres</sub> stimulus. The attentional modulation for the <italic>M</italic><sub>unres</sub> stimulus was thus likely caused by changes in cochlear activity for the resolved harmonics of the female voice.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Resolved and unresolved harmonics. In the part of the cochlea where the <italic>F</italic><sub>res</sub> and the <italic>M</italic><sub>unres</sub> stimuli have their characteristic places, the harmonics of the female voice are resolved <bold>(top)</bold> while those of the male voice are unresolved <bold>(bottom)</bold>. The cochlear active process may enhance the response at the locations where the resolved harmonics of the female voice peak (purple shading). This enhancement will then also affect the responses to the unresolved harmonics of the male voice at these locations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-20-1756386-g0005.tif">
<alt-text content-type="machine-generated">Two graphs compare responses along the cochlea&#x02019;s longitudinal position from base to apex. The top graph uses purple to show peaks of resolved harmonics based on the F_res stimulus with peaks labeled by frequency, including 1760 Hertz and 1370 Hertz. The bottom graph uses green to show peaks of unresolved harmonics based on the M_unres stimulus, with peaks labeled 1500 Hertz and 1250 Hertz. Both graphs display overlapping curves representing spatial and temporal patterns, with highlighted regions corresponding to specific waveforms. Axes denote longitudinal position x.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>4.3</label>
<title>Delay of the speech-like DPOAEs</title>
<p>The delays reported here do not represent classical DPOAE group delays derived from phase&#x02013;frequency slopes. Instead, they reflect latency estimates obtained from the peak position of the cross-correlation between the expected speech-like DPOAE waveform and the recorded ear-canal signal. This time-domain measure can be interpreted as an approximation of the effective DPOAE latency, and thus allows qualitative comparison with known DPOAE delay behavior. As expected, these latency estimates did not depend on the focus of selective attention (<xref ref-type="fig" rid="F4">Figures 4M</xref>&#x02013;<xref ref-type="fig" rid="F4">O</xref>). However, they differed systematically between stimulus types in the single-speaker condition due to frequency dependencies.</p>
<p>For the <italic>F</italic><sub>unres</sub> stimulus, speech-like DPOAEs peaked at approximately 0 ms, consistent with reports of near-zero DPOAE group delays at frequencies above &#x0007E;1&#x02013;1.5 kHz, where cochlear scaling symmetry and wave-fixed distortion generation are expected to hold (<xref ref-type="bibr" rid="B15">Faulstich and K&#x000F6;ssl, 2000</xref>; <xref ref-type="bibr" rid="B11">Dhar et al., 2011</xref>). Similarly, the <italic>M</italic><sub>unres</sub> stimulus yielded short latencies of about 0.7 ms (approximately 0.7 cycles). Given that the average stimulus and DPOAE frequencies for this condition lie in the range of 1&#x02013;1.5 kHz, this delay is consistent with a partial breakdown of scaling symmetry in this frequency region.</p>
<p>In contrast, speech-like DPOAEs evoked by the <italic>F</italic><sub>res</sub> stimulus peaked at approximately 2 ms, corresponding to about 2 stimulus cycles. Even though the frequency regime largely overlaps with that of the stimulus <italic>M</italic><sub>unres</sub>, scaling symmetry seems to break down more fully, and the observed delay is compatible with distortion generation near the peak of the basilar-membrane traveling wave, followed by backward propagation through a slow basilar-membrane wave (<xref ref-type="bibr" rid="B52">Shera and Guinan, 1999</xref>; <xref ref-type="bibr" rid="B62">Tubis et al., 2000</xref>; <xref ref-type="bibr" rid="B43">Probst et al., 1991</xref>; <xref ref-type="bibr" rid="B26">Knight and Kemp, 2001</xref>; <xref ref-type="bibr" rid="B50">Robles and Ruggero, 2001</xref>).</p>
<p>The <italic>M</italic><sub>res</sub> stimulus did not produce reliable responses and was therefore excluded from further analysis.</p>
</sec>
<sec>
<label>4.4</label>
<title>Limitations</title>
<p>Comparability of the stimuli derived from the male voice was limited. In the competing speaker scenario, the stimulus <italic>M</italic><sub>res</sub> failed, in most trials, to produce significant speech-like DPOAEs for statistical evaluation. Additionally, the stimulus <italic>M</italic><sub>unres</sub> overlapped in frequency with the stimulus <italic>F</italic><sub>res</sub>, introducing potential confounds regarding the origin of the observed effects. Indeed, this overlap precludes a clear dissociation between modulation driven by harmonic resolvability and modulation driven by attention to a shared frequency region, thereby limiting the interpretability of the observed effects.</p>
<p>The conclusions drawn in the present study are specific to the frequency regions investigated by means of the <italic>F</italic><sub>res</sub> stimuli for resolved harmonics and of the <italic>F</italic><sub>unres</sub> stimuli for unresolved harmonics, spanning approximately 1&#x02013;1.8 kHz and 2.3&#x02013;3.5 kHz, respectively. While these two frequency ranges yield robust speech-like DPOAEs, it remains unclear to what degree the obtained results can be generalized to different frequency regions of the cochlea. In particular, the failure to obtain speech-like DPOAEs for the <italic>M</italic><sub>res</sub> stimulus at lower frequencies highlights the vulnerability of speech-like DPOAE measurements to low-frequency noise and reduced emission strength.</p>
<p>Future work is thus required to extend the present paradigm to additional frequency bands, with stimulus designs optimized for higher and lower cochlear regions and special focus on avoiding spectral overlap, in order to assess the frequency dependence and generalizability of attentional effects on speech-like DPOAEs.</p>
</sec>
<sec>
<label>4.5</label>
<title>Conclusion</title>
<p>Our study demonstrates that selective attention modulates the morphology of speech-like DPOAEs elicited by multiple, simultaneously presented harmonic pairs derived from natural speech signals. These findings are consistent with the notion that DPOAEs can also be evoked by natural, running speech, although extracting such responses remains challenging due to the high noise floor and spectral complexity of real speech. Within these constraints, the experimental paradigm employed here provides a tractable approximation for probing cochlear responses to ecologically relevant stimuli.</p>
<p>Attentional effects were observed for stimuli derived from both resolved and unresolved harmonic regions. Given the partial overlap in frequency content between conditions, we argue that the effects seen for the unresolved harmonics most likely resulted from attentional modulation of the resolved harmonics of the competing speaker. Notably, the direction and pattern of attentional modulation were consistent, whether attention was shifted from the target speech to a competing voice or from the target to a visual task. However, further work is required to unequivocally establish the attentional effects on resolved and unresolved harmonics.</p>
<p>Unexpectedly, attention reduced&#x02014;rather than enhanced&#x02014;the speech-like DPOAE associated with the attended speaker. In light of heterogeneous findings in the existing literature, replication and systematic extension of this effect across frequency regions and stimulus configurations will be necessary to establish its robustness and generality.</p>
<p>More broadly, we hope that the speech-like DPOAE paradigm introduced here will contribute to a deeper understanding of how the active cochlea participates in the perceptual and cognitive processing of complex naturalistic signals such as speech.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>All custom code used for stimulus generation, speech-like DPOAE analysis via cross-correlation, and statistical evaluation is openly available on GitHub at <ext-link ext-link-type="uri" xlink:href="https://github.com/janna-stb/dpoae_attention_study.git">https://github.com/janna-stb/dpoae_attention_study.git</ext-link> under the MIT License.</p>
</sec>
<sec sec-type="ethics-statement" id="s6">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the ethics board of the University Hospital Erlangen. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>JS: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Validation, Writing &#x02013; original draft. TR: Conceptualization, Funding acquisition, Methodology, Project administration, Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. Generative AI tools were used to improve the clarity and readability of the manuscript. The scientific content, analyses, and conclusions were developed by the author(s).</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ashmore</surname> <given-names>J.</given-names></name></person-group> (<year>2008</year>). <article-title>Cochlear outer hair cell motility</article-title>. <source>Physiol. Rev</source>. <volume>88</volume>, <fpage>173</fpage>&#x02013;<lpage>210</lpage>. doi: <pub-id pub-id-type="doi">10.1152/physrev.00044.2006</pub-id><pub-id pub-id-type="pmid">18195086</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Baricevich</surname> <given-names>A.</given-names></name> <name><surname>Bassett</surname> <given-names>D.</given-names></name> <name><surname>Chan</surname> <given-names>S.</given-names></name> <name><surname>Lavi</surname> <given-names>S.</given-names></name> <name><surname>Siegel</surname> <given-names>J.</given-names></name></person-group> (<year>2025</year>). <article-title>Frequency and level dependence of the middle ear acoustic reflex and its decay measured in wideband absorbance with contralateral narrowband noise elicitors</article-title>. <source>Hear. Res</source>. <volume>459</volume>:<fpage>109225</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.heares.2025.109225</pub-id><pub-id pub-id-type="pmid">40024093</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Beckerhoff</surname> <given-names>F.</given-names></name></person-group> (<year>2022</year>). <source>Frau Ella</source>. <publisher-loc>Munich</publisher-loc>: <publisher-name>Dotbooks</publisher-name>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Beim</surname> <given-names>J. A.</given-names></name> <name><surname>Oxenham</surname> <given-names>A. J.</given-names></name> <name><surname>Wojtczak</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Examining replicability of an otoacoustic measure of cochlear function during selective attention</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>144</volume>, <fpage>2882</fpage>&#x02013;<lpage>2895</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.5079311</pub-id><pub-id pub-id-type="pmid">30522315</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Beim</surname> <given-names>J. A.</given-names></name> <name><surname>Oxenham</surname> <given-names>A. J.</given-names></name> <name><surname>Wojtczak</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>No effects of attention or visual perceptual load on cochlear function, as measured with stimulus-frequency otoacoustic emissions</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>146</volume>, <fpage>1475</fpage>&#x02013;<lpage>1491</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.5123391</pub-id><pub-id pub-id-type="pmid">31472524</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bernstein</surname> <given-names>J. G.</given-names></name> <name><surname>Oxenham</surname> <given-names>A. J.</given-names></name></person-group> (<year>2003</year>). <article-title>Pitch discrimination of diotic and dichotic tone complexes: harmonic resolvability or harmonic number?</article-title> <source>J. Acoust. Soc. Am</source>. <volume>113</volume>, <fpage>3323</fpage>&#x02013;<lpage>3334</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.1572146</pub-id><pub-id pub-id-type="pmid">12822804</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Carcagno</surname> <given-names>S.</given-names></name> <name><surname>Plack</surname> <given-names>C. J.</given-names></name></person-group> (<year>2011</year>). <article-title>Pitch discrimination learning: specificity for pitch and harmonic resolvability, and electrophysiological correlates</article-title>. <source>J. Assoc. Res. Otolaryngol</source>. <volume>12</volume>, <fpage>503</fpage>&#x02013;<lpage>517</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10162-011-0266-3</pub-id><pub-id pub-id-type="pmid">21484466</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cherry</surname> <given-names>E. C.</given-names></name></person-group> (<year>1953</year>). <article-title>Some experiments on the recognition of speech, with one and with two ears</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>25</volume>, <fpage>975</fpage>&#x02013;<lpage>979</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.1907229</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Clark</surname> <given-names>N. R.</given-names></name> <name><surname>Brown</surname> <given-names>G. J.</given-names></name> <name><surname>J&#x000FC;rgens</surname> <given-names>T.</given-names></name> <name><surname>Meddis</surname> <given-names>R.</given-names></name></person-group> (<year>2012</year>). <article-title>A frequency-selective feedback model of auditory efferent suppression and its implications for the recognition of speech in noise</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>132</volume>, <fpage>1535</fpage>&#x02013;<lpage>1541</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.4742745</pub-id><pub-id pub-id-type="pmid">22978882</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dallos</surname> <given-names>P.</given-names></name></person-group> (<year>1992</year>). <article-title>The active cochlea</article-title>. <source>J. Neurosci</source>. <volume>12</volume>, <fpage>4575</fpage>&#x02013;<lpage>4585</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.12-12-04575.1992</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dhar</surname> <given-names>S.</given-names></name> <name><surname>Rogers</surname> <given-names>A.</given-names></name> <name><surname>Abdala</surname> <given-names>C.</given-names></name></person-group> (<year>2011</year>). <article-title>Breaking away: violation of distortion emission phase-frequency invariance at low frequencies</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>129</volume>, <fpage>3115</fpage>&#x02013;<lpage>3122</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.3569732</pub-id><pub-id pub-id-type="pmid">21568414</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>N.</given-names></name> <name><surname>Simon</surname> <given-names>J. Z.</given-names></name></person-group> (<year>2012</year>). <article-title>Emergence of neural encoding of auditory objects while listening to competing speakers</article-title>. <source>Proc. Natl. Acad. Sci. USA</source>. <volume>109</volume>, <fpage>11854</fpage>&#x02013;<lpage>11859</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1205381109</pub-id><pub-id pub-id-type="pmid">22753470</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="web"><collab>ElevenLabs</collab> (<year>n.d.</year>). <source>ElevenLabs text-to-speech engine.</source> Available online at: <ext-link ext-link-type="uri" xlink:href="https://elevenlabs.io">https://elevenlabs.io</ext-link></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Etard</surname> <given-names>O.</given-names></name> <name><surname>Kegler</surname> <given-names>M.</given-names></name> <name><surname>Braiman</surname> <given-names>C.</given-names></name> <name><surname>Forte</surname> <given-names>A. E.</given-names></name> <name><surname>Reichenbach</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>Decoding of selective attention to continuous speech from the human auditory brainstem response</article-title>. <source>Neuroimage</source> <volume>200</volume>, <fpage>1</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuroimage.2019.06.029</pub-id><pub-id pub-id-type="pmid">31212098</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Faulstich</surname> <given-names>M.</given-names></name> <name><surname>K&#x000F6;ssl</surname> <given-names>M.</given-names></name></person-group> (<year>2000</year>). <article-title>Evidence for multiple DPOAE components based upon group delay of the 2f1-f2 distortion in the gerbil</article-title>. <source>Hear. Res</source>. <volume>140</volume>, <fpage>99</fpage>&#x02013;<lpage>110</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0378-5955(99)00189-6</pub-id><pub-id pub-id-type="pmid">10675638</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Forte</surname> <given-names>A. E.</given-names></name> <name><surname>Etard</surname> <given-names>O.</given-names></name> <name><surname>Reichenbach</surname> <given-names>T.</given-names></name></person-group> (<year>2017</year>). <article-title>The human auditory brainstem response to running speech reveals a subcortical mechanism for selective attention</article-title>. <source>Elife</source> <volume>6</volume>:<fpage>e27203</fpage>. doi: <pub-id pub-id-type="doi">10.7554/eLife.27203</pub-id><pub-id pub-id-type="pmid">28992445</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Francis</surname> <given-names>N. A.</given-names></name> <name><surname>Zhao</surname> <given-names>W.</given-names></name> <name><surname>Guinan</surname> <given-names>J. J.</given-names> <suffix>Jr.</suffix></name></person-group> (<year>2018</year>). <article-title>Auditory attention reduced ear-canal noise in humans by reducing subject motion, not by medial olivocochlear efferent inhibition: implications for measuring otoacoustic emissions during a behavioral task</article-title>. <source>Front. Syst. Neurosci</source>. <volume>12</volume>:<fpage>42</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnsys.2018.00042</pub-id><pub-id pub-id-type="pmid">30271329</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Galbraith</surname> <given-names>G. C.</given-names></name> <name><surname>Olfman</surname> <given-names>D. M.</given-names></name> <name><surname>Huffman</surname> <given-names>T. M.</given-names></name></person-group> (<year>2003</year>). <article-title>Selective attention affects human brain stem frequency-following response</article-title>. <source>Neuroreport</source> <volume>14</volume>:<fpage>735</fpage>. doi: <pub-id pub-id-type="doi">10.1097/00001756-200304150-00015</pub-id><pub-id pub-id-type="pmid">12692473</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Glattauer</surname> <given-names>D.</given-names></name></person-group> (<year>2018</year>). <source>Darum</source>. <publisher-loc>Vienna</publisher-loc>: <publisher-name>Paul Zsolnay Verlag</publisher-name>.</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guinan</surname> <given-names>J. J. J.</given-names></name></person-group> (<year>2006</year>). <article-title>Olivocochlear efferents: anatomy, physiology, function, and the measurement of efferent effects in humans</article-title>. <source>Ear Hear</source>. <volume>27</volume>:<fpage>589</fpage>. doi: <pub-id pub-id-type="doi">10.1097/01.aud.0000240507.83072.e7</pub-id><pub-id pub-id-type="pmid">17086072</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hauswald</surname> <given-names>A.</given-names></name> <name><surname>Keitel</surname> <given-names>A.</given-names></name> <name><surname>Chen</surname> <given-names>Y.-P.</given-names></name> <name><surname>R&#x000F6;sch</surname> <given-names>S.</given-names></name> <name><surname>Weisz</surname> <given-names>N.</given-names></name></person-group> (<year>2022</year>). <article-title>Degradation levels of continuous speech affect neural speech tracking and alpha power differently</article-title>. <source>Eur. J. Neurosci</source>. <volume>55</volume>, <fpage>3288</fpage>&#x02013;<lpage>3302</lpage>. doi: <pub-id pub-id-type="doi">10.1111/ejn.14912</pub-id><pub-id pub-id-type="pmid">32687616</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Horton</surname> <given-names>C.</given-names></name> <name><surname>D&#x00027;Zmura</surname> <given-names>M.</given-names></name> <name><surname>Srinivasan</surname> <given-names>R.</given-names></name></person-group> (<year>2013</year>). <article-title>Suppression of competing speech through entrainment of cortical oscillations</article-title>. <source>J. Neurophysiol</source>. <volume>109</volume>, <fpage>3082</fpage>&#x02013;<lpage>3093</lpage>. doi: <pub-id pub-id-type="doi">10.1152/jn.01026.2012</pub-id><pub-id pub-id-type="pmid">23515789</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hudspeth</surname> <given-names>A. J.</given-names></name></person-group> (<year>2014</year>). <article-title>Integrating the active process of hair cells with cochlear function</article-title>. <source>Nat. Rev. Neurosci</source>. <volume>15</volume>, <fpage>600</fpage>&#x02013;<lpage>614</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrn3786</pub-id><pub-id pub-id-type="pmid">25096182</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huffman</surname> <given-names>R. F.</given-names></name> <name><surname>Henson</surname> <given-names>O. W.</given-names></name></person-group> (<year>1990</year>). <article-title>The descending auditory pathway and acousticomotor systems: connections with the inferior colliculus</article-title>. <source>Brain Res. Rev</source>. <volume>15</volume>, <fpage>295</fpage>&#x02013;<lpage>323</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0165-0173(90)90005-9</pub-id><pub-id pub-id-type="pmid">2289088</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jennings</surname> <given-names>S. G.</given-names></name></person-group> (<year>2021</year>). <article-title>The role of the medial olivocochlear reflex in psychophysical masking and intensity resolution in humans: a review</article-title>. <source>J. Neurophysiol</source>. <volume>125</volume>, <fpage>2279</fpage>&#x02013;<lpage>2308</lpage>. doi: <pub-id pub-id-type="doi">10.1152/jn.00672.2020</pub-id><pub-id pub-id-type="pmid">33909513</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Knight</surname> <given-names>R. D.</given-names></name> <name><surname>Kemp</surname> <given-names>D. T.</given-names></name></person-group> (<year>2001</year>). <article-title>Wave and place fixed DPOAE maps of the human ear</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>109</volume>, <fpage>1513</fpage>&#x02013;<lpage>1525</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.1354197</pub-id><pub-id pub-id-type="pmid">11325123</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lakatos</surname> <given-names>P.</given-names></name> <name><surname>Musacchia</surname> <given-names>G.</given-names></name> <name><surname>O&#x00027;Connel</surname> <given-names>M. N.</given-names></name> <name><surname>Falchier</surname> <given-names>A. Y.</given-names></name> <name><surname>Javitt</surname> <given-names>D. C.</given-names></name> <name><surname>Schroeder</surname> <given-names>C. E.</given-names></name></person-group> (<year>2013</year>). <article-title>The spectrotemporal filter mechanism of auditory selective attention</article-title>. <source>Neuron</source> <volume>77</volume>, <fpage>750</fpage>&#x02013;<lpage>761</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuron.2012.11.034</pub-id><pub-id pub-id-type="pmid">23439126</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lighthill</surname> <given-names>J.</given-names></name></person-group> (<year>1981</year>). <article-title>Energy flow in the cochlea</article-title>. <source>J. Fluid Mech</source>. <volume>106</volume>, <fpage>149</fpage>&#x02013;<lpage>213</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S0022112081001560</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lopez-Poveda</surname> <given-names>E. A.</given-names></name></person-group> (<year>2018</year>). <article-title>Olivocochlear efferents in animals and humans: from anatomy to clinical relevance</article-title>. <source>Front. Neurol</source>. <volume>9</volume>:<fpage>197</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fneur.2018.00197</pub-id><pub-id pub-id-type="pmid">29632514</pub-id></mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Maison</surname> <given-names>S.</given-names></name> <name><surname>Micheyl</surname> <given-names>C.</given-names></name> <name><surname>Collet</surname> <given-names>L.</given-names></name></person-group> (<year>2001</year>). <article-title>Influence of focused auditory attention on cochlear activity in humans</article-title>. <source>Psychophysiology</source> <volume>38</volume>, <fpage>35</fpage>&#x02013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1469-8986.3810035</pub-id><pub-id pub-id-type="pmid">11321619</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McDermott</surname> <given-names>J. H.</given-names></name></person-group> (<year>2009</year>). <article-title>The cocktail party problem</article-title>. <source>Curr. Biol</source>. <volume>19</volume>, <fpage>R1024</fpage>&#x02013;<lpage>R1027</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cub.2009.09.005</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>McFee</surname> <given-names>B.</given-names></name> <name><surname>McVicar</surname> <given-names>M.</given-names></name> <name><surname>Faronbi</surname> <given-names>D.</given-names></name> <name><surname>Roman</surname> <given-names>I.</given-names></name> <name><surname>Gover</surname> <given-names>M.</given-names></name> <name><surname>Balke</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2025</year>). <source>Librosa/librosa: 0.11.0</source>. <publisher-loc>Geneva</publisher-loc>: <publisher-name>Zenodo</publisher-name>.</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meric</surname> <given-names>C.</given-names></name> <name><surname>Collet</surname> <given-names>L.</given-names></name></person-group> (<year>1992</year>). <article-title>Visual attention and evoked otoacoustic emissions: a slight but real effect</article-title>. <source>Int. J. Psychophysiol</source>. <volume>12</volume>, <fpage>233</fpage>&#x02013;<lpage>235</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0167-8760(92)90061-F</pub-id><pub-id pub-id-type="pmid">1639669</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mesgarani</surname> <given-names>N.</given-names></name> <name><surname>Chang</surname> <given-names>E. F.</given-names></name></person-group> (<year>2012</year>). <article-title>Selective cortical representation of attended speaker in multi-talker speech perception</article-title>. <source>Nature</source> <volume>485</volume>, <fpage>233</fpage>&#x02013;<lpage>236</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nature11020</pub-id><pub-id pub-id-type="pmid">22522927</pub-id></mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Messing</surname> <given-names>D. P.</given-names></name> <name><surname>Delhorne</surname> <given-names>L.</given-names></name> <name><surname>Bruckert</surname> <given-names>E.</given-names></name> <name><surname>Braida</surname> <given-names>L. D.</given-names></name> <name><surname>Ghitza</surname> <given-names>O.</given-names></name></person-group> (<year>2009</year>). <article-title>A non-linear efferent-inspired model of the auditory system; matching human confusions in stationary noise</article-title>. <source>Speech Commun</source>. <volume>51</volume>, <fpage>668</fpage>&#x02013;<lpage>683</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.specom.2009.02.002</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Micheyl</surname> <given-names>C.</given-names></name> <name><surname>Collet</surname> <given-names>L.</given-names></name></person-group> (<year>1993</year>). <article-title>Involvement of medial olivocochlear system in detection in noise</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>93</volume>, <fpage>2314</fpage>&#x02013;<lpage>2314</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.406383</pub-id></mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Micheyl</surname> <given-names>C.</given-names></name> <name><surname>Oxenham</surname> <given-names>A. J.</given-names></name></person-group> (<year>2007</year>). <article-title>Across-frequency pitch discrimination interference between complex tones containing resolved harmonics</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>121</volume>, <fpage>1621</fpage>&#x02013;<lpage>1631</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.2431334</pub-id><pub-id pub-id-type="pmid">17407899</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Michie</surname> <given-names>P. T.</given-names></name> <name><surname>LePage</surname> <given-names>E. L.</given-names></name> <name><surname>Solowij</surname> <given-names>N.</given-names></name> <name><surname>Haller</surname> <given-names>M.</given-names></name> <name><surname>Terry</surname> <given-names>L.</given-names></name></person-group> (<year>1996</year>). <article-title>Evoked otoacoustic emissions and auditory selective attention</article-title>. <source>Hear. Res</source>. <volume>98</volume>, <fpage>54</fpage>&#x02013;<lpage>67</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0378-5955(96)00059-7</pub-id><pub-id pub-id-type="pmid">8880181</pub-id></mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Peirce</surname> <given-names>J.</given-names></name> <name><surname>Gray</surname> <given-names>J. R.</given-names></name> <name><surname>Simpson</surname> <given-names>S.</given-names></name> <name><surname>MacAskill</surname> <given-names>M.</given-names></name> <name><surname>H&#x000F6;chenberger</surname> <given-names>R.</given-names></name> <name><surname>Sogo</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>PsychoPy2: Experiments in behavior made easy</article-title>. <source>Behav. Res</source>. <volume>51</volume>, <fpage>195</fpage>&#x02013;<lpage>203</lpage>. doi: <pub-id pub-id-type="doi">10.3758/s13428-018-01193-y</pub-id><pub-id pub-id-type="pmid">30734206</pub-id></mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Pickles</surname> <given-names>J. O.</given-names></name></person-group> (<year>1988</year>). <source>An Introduction to the Physiology of Hearing</source>. <publisher-loc>London</publisher-loc>: <publisher-name>Academic Press</publisher-name>.</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Plack</surname> <given-names>C. J.</given-names></name> <name><surname>Oxenham</surname> <given-names>A. J.</given-names></name> <name><surname>Fay</surname> <given-names>R. R.</given-names></name></person-group> (<year>2005</year>). <source>Pitch: Neural Coding and Perception</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>Springer</publisher-name>. doi: <pub-id pub-id-type="doi">10.1007/0-387-28958-5</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Plomp</surname> <given-names>R.</given-names></name></person-group> (<year>1978</year>). <article-title>Auditory handicap of hearing impairment and the limited benefit of hearing aids</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>63</volume>, <fpage>533</fpage>&#x02013;<lpage>549</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.381753</pub-id><pub-id pub-id-type="pmid">670550</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Probst</surname> <given-names>R.</given-names></name> <name><surname>Lonsbury-Martin</surname> <given-names>B. L.</given-names></name> <name><surname>Martin</surname> <given-names>G. K.</given-names></name></person-group> (<year>1991</year>). <article-title>A review of otoacoustic emissions</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>89</volume>, <fpage>2027</fpage>&#x02013;<lpage>2067</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.400897</pub-id></mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pugh</surname> <given-names>K. R.</given-names></name> <name><surname>Shaywitz</surname> <given-names>B. A.</given-names></name> <name><surname>Shaywitz</surname> <given-names>S. E.</given-names></name> <name><surname>Fulbright</surname> <given-names>R. K.</given-names></name> <name><surname>Byrd</surname> <given-names>D.</given-names></name> <name><surname>Skudlarski</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>1996</year>). <article-title>Auditory selective attention: an fMRI investigation</article-title>. <source>Neuroimage</source> <volume>4</volume>, <fpage>159</fpage>&#x02013;<lpage>173</lpage>. doi: <pub-id pub-id-type="doi">10.1006/nimg.1996.0067</pub-id><pub-id pub-id-type="pmid">9345506</pub-id></mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Reichenbach</surname> <given-names>T.</given-names></name> <name><surname>Hudspeth</surname> <given-names>A. J.</given-names></name></person-group> (<year>2010</year>). <article-title>A ratchet mechanism for amplification in low-frequency mammalian hearing</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A</source>. <volume>107</volume>, <fpage>4973</fpage>&#x02013;<lpage>4978</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.0914345107</pub-id><pub-id pub-id-type="pmid">20194771</pub-id></mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Reichenbach</surname> <given-names>T.</given-names></name> <name><surname>Hudspeth</surname> <given-names>A. J.</given-names></name></person-group> (<year>2011</year>). <article-title>Unidirectional amplification as a mechanism for low-frequency hearing in mammals</article-title>. <source>AIP Conf. Proc</source>. <volume>1403</volume>, <fpage>507</fpage>&#x02013;<lpage>512</lpage>. doi: <pub-id pub-id-type="doi">10.1063/1.3658139</pub-id></mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Reichenbach</surname> <given-names>T.</given-names></name> <name><surname>Hudspeth</surname> <given-names>A. J.</given-names></name></person-group> (<year>2014</year>). <article-title>The physics of hearing: Fluid mechanics and the active process of the inner ear</article-title>. <source>Rep. Prog. Phys</source>. <volume>77</volume>:<fpage>076601</fpage>. doi: <pub-id pub-id-type="doi">10.1088/0034-4885/77/7/076601</pub-id><pub-id pub-id-type="pmid">25006839</pub-id></mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Riecke</surname> <given-names>L.</given-names></name> <name><surname>Marianu</surname> <given-names>I.-A.</given-names></name> <name><surname>De Martino</surname> <given-names>F.</given-names></name></person-group> (<year>2020</year>). <article-title>Effect of auditory predictability on the human peripheral auditory system</article-title>. <source>Front. Neurosci</source>. <volume>14</volume>:<fpage>362</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2020.00362</pub-id><pub-id pub-id-type="pmid">32351361</pub-id></mixed-citation>
</ref>
<ref id="B49">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ritter</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). <source>Eine Frau erlebt die Polarnacht</source>. <publisher-loc>Berlin</publisher-loc>: <publisher-name>Ullstein Taschenbuch</publisher-name>.</mixed-citation>
</ref>
<ref id="B50">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Robles</surname> <given-names>L.</given-names></name> <name><surname>Ruggero</surname> <given-names>M. A.</given-names></name></person-group> (<year>2001</year>). <article-title>Mechanics of the mammalian cochlea</article-title>. <source>Physiol. Rev</source>. <volume>81</volume>, <fpage>1305</fpage>&#x02013;<lpage>1352</lpage>. doi: <pub-id pub-id-type="doi">10.1152/physrev.2001.81.3.1305</pub-id><pub-id pub-id-type="pmid">11427697</pub-id></mixed-citation>
</ref>
<ref id="B51">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saiz-Al&#x000ED;a</surname> <given-names>M.</given-names></name> <name><surname>Miller</surname> <given-names>P.</given-names></name> <name><surname>Reichenbach</surname> <given-names>T.</given-names></name></person-group> (<year>2021</year>). <article-title>Otoacoustic emissions evoked by the time-varying harmonic structure of speech</article-title>. <source>eNeuro</source> <volume>8</volume>:<fpage>428</fpage>. doi: <pub-id pub-id-type="doi">10.1523/ENEURO.0428-20.2021</pub-id><pub-id pub-id-type="pmid">33632811</pub-id></mixed-citation>
</ref>
<ref id="B52">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shera</surname> <given-names>C. A.</given-names></name> <name><surname>Guinan</surname> <given-names>J. J.</given-names></name></person-group> (<year>1999</year>). <article-title>Evoked otoacoustic emissions arise by two fundamentally different mechanisms: a taxonomy for mammalian OAEs</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>105</volume>, <fpage>782</fpage>&#x02013;<lpage>798</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.426948</pub-id><pub-id pub-id-type="pmid">9972564</pub-id></mixed-citation>
</ref>
<ref id="B53">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shera</surname> <given-names>C. A.</given-names></name> <name><surname>Guinan</surname> <given-names>J. J.</given-names></name> <name><surname>Oxenham</surname> <given-names>A. J.</given-names></name></person-group> (<year>2010</year>). <article-title>Otoacoustic estimation of cochlear tuning: validation in the chinchilla</article-title>. <source>J. Assoc. Res. Otolaryngol</source>. <volume>11</volume>, <fpage>343</fpage>&#x02013;<lpage>365</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10162-010-0217-4</pub-id><pub-id pub-id-type="pmid">20440634</pub-id></mixed-citation>
</ref>
<ref id="B54">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Siegel</surname> <given-names>J. H.</given-names></name> <name><surname>Cerka</surname> <given-names>A. J.</given-names></name> <name><surname>Recio-Spinoso</surname> <given-names>A.</given-names></name> <name><surname>Temchin</surname> <given-names>A. N.</given-names></name> <name><surname>Van Dijk</surname> <given-names>P.</given-names></name> <name><surname>Ruggero</surname> <given-names>M. A.</given-names></name></person-group> (<year>2005</year>). <article-title>Delays of stimulus-frequency otoacoustic emissions and cochlear vibrations contradict the theory of coherent reflection filtering</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>118</volume>, <fpage>2434</fpage>&#x02013;<lpage>2443</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.2005867</pub-id><pub-id pub-id-type="pmid">16266165</pub-id></mixed-citation>
</ref>
<ref id="B55">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Smith</surname> <given-names>D. W.</given-names></name> <name><surname>Aouad</surname> <given-names>R. K.</given-names></name> <name><surname>Keil</surname> <given-names>A.</given-names></name></person-group> (<year>2012</year>). <article-title>Cognitive task demands modulate the sensitivity of the human cochlea</article-title>. <source>Front. Psychol</source>. <volume>3</volume>:<fpage>30</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2012.00030</pub-id><pub-id pub-id-type="pmid">22347870</pub-id></mixed-citation>
</ref>
<ref id="B56">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Srinivasan</surname> <given-names>S.</given-names></name> <name><surname>Keil</surname> <given-names>A.</given-names></name> <name><surname>Stratis</surname> <given-names>K.</given-names></name> <name><surname>Osborne</surname> <given-names>A. F.</given-names></name> <name><surname>Cerwonka</surname> <given-names>C.</given-names></name> <name><surname>Wong</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Interaural attention modulates outer hair cell function</article-title>. <source>Eur. J. Neurosci</source>. <volume>40</volume>, <fpage>3785</fpage>&#x02013;<lpage>3792</lpage>. doi: <pub-id pub-id-type="doi">10.1111/ejn.12746</pub-id><pub-id pub-id-type="pmid">25302959</pub-id></mixed-citation>
</ref>
<ref id="B57">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Srinivasan</surname> <given-names>S.</given-names></name> <name><surname>Keil</surname> <given-names>A.</given-names></name> <name><surname>Stratis</surname> <given-names>K.</given-names></name> <name><surname>Woodruff Carr</surname> <given-names>K.</given-names></name> <name><surname>Smith</surname> <given-names>D.</given-names></name></person-group> (<year>2012</year>). <article-title>Effects of cross-modal selective attention on the sensory periphery: cochlear sensitivity is altered by selective attention</article-title>. <source>Neuroscience</source> <volume>223</volume>, <fpage>325</fpage>&#x02013;<lpage>332</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuroscience.2012.07.062</pub-id><pub-id pub-id-type="pmid">22871520</pub-id></mixed-citation>
</ref>
<ref id="B58">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stoll</surname> <given-names>T. J.</given-names></name> <name><surname>Vandjelovic</surname> <given-names>N. D.</given-names></name> <name><surname>Polonenko</surname> <given-names>M. J.</given-names></name> <name><surname>Li</surname> <given-names>N. R.</given-names></name> <name><surname>Lee</surname> <given-names>A. K.</given-names></name> <name><surname>Maddox</surname> <given-names>R. K.</given-names></name></person-group> (<year>2025</year>). <article-title>The auditory brainstem response to natural speech is not affected by selective attention</article-title>. <source>PLoS Biol</source>. <volume>23</volume>:<fpage>e3003407</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pbio.3003407</pub-id><pub-id pub-id-type="pmid">41052102</pub-id></mixed-citation>
</ref>
<ref id="B59">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Strauss</surname> <given-names>D. J.</given-names></name> <name><surname>Corona-Strauss</surname> <given-names>F. I.</given-names></name> <name><surname>Mai</surname> <given-names>A.</given-names></name> <name><surname>Hillyard</surname> <given-names>S. A.</given-names></name></person-group> (<year>2025</year>). <article-title>Unraveling the effects of selective auditory attention in ERPs: from the brainstem to the cortex</article-title>. <source>Neuroimage</source> <volume>316</volume>:<fpage>121295</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuroimage.2025.121295</pub-id><pub-id pub-id-type="pmid">40490091</pub-id></mixed-citation>
</ref>
<ref id="B60">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Temchin</surname> <given-names>A. N.</given-names></name> <name><surname>Rich</surname> <given-names>N. C.</given-names></name> <name><surname>Ruggero</surname> <given-names>M. A.</given-names></name></person-group> (<year>2008</year>). <article-title>Threshold tuning curves of chinchilla auditory-nerve fibers. I. Dependence on characteristic frequency and relation to the magnitudes of cochlear vibrations</article-title>. <source>J. Neurophysiol</source>. <volume>100</volume>, <fpage>2889</fpage>&#x02013;<lpage>2898</lpage>. doi: <pub-id pub-id-type="doi">10.1152/jn.90637.2008</pub-id><pub-id pub-id-type="pmid">18701751</pub-id></mixed-citation>
</ref>
<ref id="B61">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Trevino</surname> <given-names>M.</given-names></name> <name><surname>Zang</surname> <given-names>A.</given-names></name> <name><surname>Lobarinas</surname> <given-names>E.</given-names></name></person-group> (<year>2023</year>). <article-title>The middle ear muscle reflex: current and future role in assessing noise-induced cochlear damage</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>153</volume>, <fpage>436</fpage>&#x02013;<lpage>445</lpage>. doi: <pub-id pub-id-type="doi">10.1121/10.0016853</pub-id><pub-id pub-id-type="pmid">36732247</pub-id></mixed-citation>
</ref>
<ref id="B62">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tubis</surname> <given-names>A.</given-names></name> <name><surname>Talmadge</surname> <given-names>C. L.</given-names></name> <name><surname>Tong</surname> <given-names>C.</given-names></name></person-group> (<year>2000</year>). <article-title>Modeling the temporal behavior of distortion product otoacoustic emissions</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>107</volume>, <fpage>2112</fpage>&#x02013;<lpage>2127</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.428493</pub-id><pub-id pub-id-type="pmid">10790037</pub-id></mixed-citation>
</ref>
<ref id="B63">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Walsh</surname> <given-names>K. P.</given-names></name> <name><surname>Pasanen</surname> <given-names>E. G.</given-names></name> <name><surname>McFadden</surname> <given-names>D.</given-names></name></person-group> (<year>2015</year>). <article-title>Changes in otoacoustic emissions during selective auditory and visual attention</article-title>. <source>J. Acoust. Soc. Am</source>. <volume>137</volume>, <fpage>2737</fpage>&#x02013;<lpage>2757</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.4919350</pub-id><pub-id pub-id-type="pmid">25994703</pub-id></mixed-citation>
</ref>
<ref id="B64">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Winer</surname> <given-names>J. A.</given-names></name> <name><surname>Larue</surname> <given-names>D. T.</given-names></name> <name><surname>Diehl</surname> <given-names>J. J.</given-names></name> <name><surname>Hefti</surname> <given-names>B. J.</given-names></name></person-group> (<year>1998</year>). <article-title>Auditory cortical projections to the cat inferior colliculus</article-title>. <source>J. Compar. Neurol</source>. <volume>400</volume>, <fpage>147</fpage>&#x02013;<lpage>174</lpage>. doi: <pub-id pub-id-type="doi">10.1002/(SICI)1096-9861(19981019)400:2&#x0003C;147::AID-CNE1&#x0003E;3.0.CO;2-9</pub-id><pub-id pub-id-type="pmid">9766397</pub-id></mixed-citation>
</ref>
<ref id="B65">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wittekindt</surname> <given-names>A.</given-names></name> <name><surname>Kaiser</surname> <given-names>J.</given-names></name> <name><surname>Abel</surname> <given-names>C.</given-names></name></person-group> (<year>2014</year>). <article-title>Attentional modulation of the inner ear: a combined otoacoustic emission and EEG study</article-title>. <source>J. Neurosci</source>. <volume>34</volume>, <fpage>9995</fpage>&#x02013;<lpage>10002</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.4861-13.2014</pub-id><pub-id pub-id-type="pmid">25057201</pub-id></mixed-citation>
</ref>
<ref id="B66">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>Z.</given-names></name></person-group> (<year>2025</year>). <article-title>Subcortical responses to continuous speech under bimodal divided attention</article-title>. <source>J. Neurophysiol</source>. <volume>133</volume>, <fpage>1216</fpage>&#x02013;<lpage>1221</lpage>. doi: <pub-id pub-id-type="doi">10.1152/jn.00039.2025</pub-id><pub-id pub-id-type="pmid">40098452</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/29632/overview">Fatima T. Husain</ext-link>, University of Illinois at Urbana-Champaign, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/820821/overview">Mareike Daeglau</ext-link>, University of Oldenburg, Germany</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/974409/overview">Renata Sisto</ext-link>, National Institute for Insurance against Accidents at Work (INAIL), Italy</p>
</fn>
</fn-group>
</back>
</article>