<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Commun.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Communication</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Commun.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2297-900X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcomm.2025.1620465</article-id><article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading"><subject>Original Research</subject></subj-group>
</article-categories>
<title-group>
<article-title>Differential effects of hand and mouth gesture training on L2 English pronunciation: targeting suprasegmental and segmental features</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yamane</surname><given-names>Noriko</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2226752"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Shinya</surname><given-names>Masahiro</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/228578"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Tan</surname><given-names>Xiaofeng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3251743"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Chiya</surname><given-names>Amos</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3241524"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Graduate School of Humanities and Social Sciences, Hiroshima University</institution>, <city>Higashihiroshima</city>, <country country="JP">Japan</country></aff>
<aff id="aff2"><label>2</label><institution>Nagoya University of Commerce and Business</institution>, <city>Aichi</city>, <country country="JP">Japan</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Noriko Yamane, <email xlink:href="mailto:yamanen@hiroshima-u.ac.jp">yamanen@hiroshima-u.ac.jp</email>; Masahiro Shinya, <email xlink:href="mailto:mshinya@hiroshima-u.ac.jp">mshinya@hiroshima-u.ac.jp</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-05">
<day>05</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>10</volume>
<elocation-id>1620465</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>04</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Yamane, Shinya, Tan and Chiya.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Yamane, Shinya, Tan and Chiya</copyright-holder>
<license><ali:license_ref start_date="2026-01-05">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Human communication inherently integrates speech and gesture. Acquiring second language (L2) pronunciation, encompassing both segmental (e.g., vowels) and suprasegmental features (e.g., rhythm, fluency), remains a major challenge. This study investigated how two types of gesture training&#x2014;manual (hand gesture training) versus articulatory (mouth gesture training)&#x2014;influence these features in Japanese EFL learners. Forty university students participated in a four-week counterbalanced design, receiving hand gesture training (rhythmic circular motions) and mouth gesture training (bio-visual feedback for /&#x00E6;/ vs. /&#x028C;/ distinction). Speech rate (as a suprasegmental proxy) and second formant (F2) values of target vowels (as a segmental proxy) were measured at pre-, mid-, and post-training. Results revealed distinct effects: hand gesture training significantly improved speech rate across both groups, enhancing suprasegmental fluency, while mouth gesture training significantly improved F2 distinction for /&#x00E6;/. These findings suggest that hand and mouth gestures target complementary aspects of L2 pronunciation. Taken together, the results support an embodied, multimodal approach to pronunciation instruction, highlighting the pedagogical value of integrating suprasegmental fluency practice with segmental refinement.</p>
</abstract>
<kwd-group>
<kwd>multimodal communication</kwd>
<kwd>embodied cognition</kwd>
<kwd>speech&#x2013;gesture integration</kwd>
<kwd>suprasegmental fluency</kwd>
<kwd>segmental accuracy</kwd>
<kwd>vowel production</kwd>
<kwd>biovisual feedback</kwd>
<kwd>Japanese EFL learners</kwd>
</kwd-group>
<funding-group><funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by JSPS KAKENHI Grant Numbers 22K00621 and 25K04168, and the Hiroshima University Promotion grant of Integrated Arts and Sciences project.</funding-statement></funding-group>
<counts>
<fig-count count="9"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="85"/>
<page-count count="14"/>
<word-count count="9915"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Multimodality of Communication</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<title>Introduction</title>
<p>Human communication is inherently multimodal, with speech and gesture tightly intertwined in the construction of meaning. Gestures are not merely ancillary to speech; rather, they are deeply integrated with cognitive and interactive processes, shaping and being shaped by the dynamics of real-time interaction. Research on gesture-speech coupling has highlighted how gestures facilitate comprehension, structure discourse, and serve cognitive functions such as disambiguation and conceptual organization.</p>
<sec id="sec2">
<title>Theoretical background</title>
<p>Research increasingly shows that speech and gesture form an integrated cognitive and interactional system, rather than parallel channels. Within embodied cognition, gestures ground linguistic meaning in sensorimotor experience (<xref ref-type="bibr" rid="ref37">Johnson and Lakoff, 2002</xref>; <xref ref-type="bibr" rid="ref4">Barsalou, 2008</xref>), with gestures often indicating metaphorical mappings (<xref ref-type="bibr" rid="ref50">Lakoff and Johnson, 1980</xref>). Thus gestures help activate, manipulate, and package information for speech, reflecting their role in the integrated cognitive system that underlies both thinking and speaking (<xref ref-type="bibr" rid="ref43">Kita et al., 2017</xref>). Interactionist accounts emphasize that gesture&#x2013;speech timing is socially organized through multiple semiotic resources, with gesture, talk, gaze, and other modalities functioning as coordinated parts of interaction (<xref ref-type="bibr" rid="ref26">Goodwin, 2007</xref>; <xref ref-type="bibr" rid="ref41">Kendon, 2004</xref>; <xref ref-type="bibr" rid="ref67">Parisse et al., 2022</xref>). Both cognitive and interactional approaches converge in viewing fluency as an integrative outcome, realized through the smooth expression of intrapersonal (mind&#x2013;body) and interpersonal (speaker&#x2013;interlocutor) coordination. Fluency is thus multidimensional, encompassing speech, interaction, and gesture, a perspective that <xref ref-type="bibr" rid="ref45">Kosmala (2024)</xref> dubs &#x2018;inter-fluency.&#x2019;</p>
<p>In this study, gestures include both hand movements and vocal tract actions such as the tongue and lips. Articulatory Phonology (AP; <xref ref-type="bibr" rid="ref8">Browman and Goldstein, 1986</xref>, <xref ref-type="bibr" rid="ref9">1989</xref>) defines gestures as vocal-tract actions whose spatiotemporal coordination underlies phonological structure and phonetic implementation. Although AP presents the elaborate system of vocal tract gestures rather than the general body gestures, this perspective dissolves the boundary between speech and gesture, showing that linguistic segments and prosodic patterns emerge from the temporal coordination of bodily actions in an integrated communicative system (<xref ref-type="bibr" rid="ref62">McNeill, 1992</xref>; <xref ref-type="bibr" rid="ref24">Goldin-Meadow and Alibali, 2013</xref>). This kind of view is supported by evolutionary and experimental research, which suggests that manual gestures shape speech development and performance (<xref ref-type="bibr" rid="ref75">Shattuck-Hufnagel and Ren, 2018</xref>; <xref ref-type="bibr" rid="ref71">Pouw et al., 2021</xref>; <xref ref-type="bibr" rid="ref84">Vainio, 2019</xref>; <xref ref-type="bibr" rid="ref19">Gentilucci and Volta, 2008</xref>), with disfluencies often mirrored across modalities (<xref ref-type="bibr" rid="ref1001">Kosmala et al., 2023</xref>).</p>
<p>Phonology is a cognitive representation of physical actions. From this type of perspective, AP&#x2019;s gestural score specifies the embodied primitives of vocal tract actions. These primitives are hierarchically built up into syllables, feet, and prosodic words (see <xref ref-type="bibr" rid="ref73">Selkirk, 1980</xref> et seq., for &#x2018;Prosodic Hierarchy&#x2019;). Prosodic words are the basis for phonological phrases and other larger categories, which function as a domain of various phonological rules. For example, rhythm rules were explained in Metrical Theory (<xref ref-type="bibr" rid="ref57">Liberman and Prince, 1977</xref>; <xref ref-type="bibr" rid="ref30">Hayes, 1995</xref>), which provides the cognitive scaffold that structures their temporal organization. The metrical grid encodes hierarchies of strong and weak beats that act as attractors for attention and timing, thereby licensing gestures at prominent positions such as stressed syllables, prosodic word boundaries, and phrasal edges. Recent models of oscillatory entrainment (<xref ref-type="bibr" rid="ref13">Cummins and Port, 1998</xref>; <xref ref-type="bibr" rid="ref16">Doelling et al., 2014</xref>) reinforce this interpretation by showing that prosodic rhythm reflects timing mechanisms, which align the execution of oral and manual gestures with rhythmic beats.</p>
<p>Although research on multimodality has grown steadily, systematic investigations linking gestures overall to phonological forms remain limited. While many gestures synchronize with pitch accents (<xref ref-type="bibr" rid="ref85">Wagner et al., 2014</xref>), other articulators&#x2014;the lips, tongue, cheeks, eyes, eyebrows, and head&#x2014;appear to coordinate with different aspects of linguistic structure. Cross-linguistic studies illustrate this complexity: eyebrow raising, for instance, follows distinct temporal patterns in English and Japanese (<xref ref-type="bibr" rid="ref14">de La Cruz-Pav&#x00ED;a et al., 2020</xref>), and the tongue and lips help establish language-specific articulatory settings across utterances (<xref ref-type="bibr" rid="ref21">Gick et al., 2004</xref>; <xref ref-type="bibr" rid="ref87">Wilson et al., 2025</xref>). For EFL learners, the lack of explicit guidance on how such articulatory gestures should be timed and integrated risks reinforcing unnatural rhythm and persistent accentedness. What emerges, then, is a clear pedagogical imperative: gesture-informed teaching practices&#x2014;drawing on both articulatory and manual cues&#x2014;must be incorporated into pronunciation instruction, not as an optional supplement, but as an essential means of fostering naturalistic fluency and prosody.</p>
</sec>
<sec id="sec3">
<title>L2 based work</title>
<p>L2-based work has incorporated gestures into pronunciation instruction to boost learners&#x2019; understanding of English suprasegmental traits. For prosody training, &#x2018;beat&#x2019; gestures&#x2014;cyclic up and down movements of a hand&#x2014;when aligned with stressed syllables of English, have been found to help regulate speech rhythm (<xref ref-type="bibr" rid="ref61">McCafferty, 2002</xref>), and to facilitate the students&#x2019; identification and production of syllables, word stress, and the rhythm of speech (<xref ref-type="bibr" rid="ref78">Smotrova, 2017</xref>), since the beat gestures synchronize with prosodic peaks in English (<xref ref-type="bibr" rid="ref53">Leonard and Cummins, 2011</xref>). Empirical studies report benefits for learners, such as reduced perceived accentedness (<xref ref-type="bibr" rid="ref23">Gluhareva and Prieto, 2017</xref>), improved memory for pitch accents (<xref ref-type="bibr" rid="ref47">Kushch et al., 2018</xref>), wider pitch range and durational contrast (<xref ref-type="bibr" rid="ref90">Yamane et al., 2019</xref>), and enhanced pitch control and fluency (<xref ref-type="bibr" rid="ref10">Cavicchio and Bus&#x00E0;, 2023</xref>). Learner-produced beat gestures also show improvements of L2 English pronunciation, particularly among Catalan learners, where training with beat gestures yielded significantly lower accentedness than training without them (<xref ref-type="bibr" rid="ref58">Llanes-Coromina et al., 2018</xref>; <xref ref-type="bibr" rid="ref72">Prieto et al., 2025</xref>).</p>
<p>Compared to suprasegmental training, the benefits of hand gestures for segmental improvement seem to be more limited. <xref ref-type="bibr" rid="ref88">Xi et al. (2024)</xref> found that learners using hand gestures mimicking lip aperture (wide for /&#x00E6;/, narrow for /&#x028C;/) outperformed those mimicking tongue position or those using no gestures, suggesting that lip-focused cues are particularly effective. Hand gestures have been applied to vowel length contrasts as well (<xref ref-type="bibr" rid="ref33">Hirata and Kelly, 2010</xref>; <xref ref-type="bibr" rid="ref34">Hirata et al., 2014</xref>; <xref ref-type="bibr" rid="ref55">Li et al., 2020</xref>, <xref ref-type="bibr" rid="ref56">2021</xref>), which we classify as a suprasegmental (i.e., prosodic) feature. Within a framework of Autosegmental Phonology (e.g., <xref ref-type="bibr" rid="ref25">Goldsmith, 1976</xref>; <xref ref-type="bibr" rid="ref30">Hayes, 1995</xref>; <xref ref-type="bibr" rid="ref46">Kubozono, 2017</xref>), vowel length is a property of its association to the prosodic (moraic) tier, where the length contrast is characterized in the number of morae nested by syllable unit (i.e., short vowel has one mora, while long vowel consists of two moras). This interpretation aligns with previous studies showing that manual gestures are particularly effective for suprasegmental features such as rhythm and fluency, whereas segmental accuracy is more directly supported by articulatory feedback. These findings suggest that lower-level segmental gestures, such as consonants and vowels, may benefit less from hand gestures than higher-level prosodic units. 
Instead, visual feedback on learners&#x2019; own oral articulatory gestures may provide a more effective pathway for improving segmental accuracy (<xref ref-type="bibr" rid="ref79">Suemitsu et al., 2015</xref>; <xref ref-type="bibr" rid="ref3">Antol&#x00ED;k et al., 2019</xref>; <xref ref-type="bibr" rid="ref44">Kocjan&#x010D;i&#x010D; et al., 2024</xref>; <xref ref-type="bibr" rid="ref91">Yamane et al., 2025</xref>), a possibility that warrants further investigation in future research.</p>
<p>Although gestures have been examined at both segmental and suprasegmental levels, systematic comparisons of objective outcomes across these domains remain underexplored, highlighting the need for studies that directly evaluate their relative effectiveness. Furthermore, though some gesture-based pedagogies have been shown to benefit learners in other Asian EFL contexts (<xref ref-type="bibr" rid="ref59">Ma and Jin, 2022</xref>; <xref ref-type="bibr" rid="ref86">Wang et al., 2023</xref>), their specific impact on Japanese learners&#x2019; fluency development has yet to be systematically examined.</p>
<p>The present experiment is designed to address this gap by testing training effects at both levels of phonology, targeting Japanese learners of English. Our focus is not to capture effects at all levels claimed under the prosodic hierarchy, but to explore two domains&#x2014;suprasegmental and segmental levels&#x2014;as an initial step in understanding gesture&#x2013;speech integration. Neurocognitive research further supports this perspective, as delta- (0.5&#x2013;3&#x202F;Hz) and theta-band (3&#x2013;9&#x202F;Hz) rhythms have been shown to align with prosodic and syllabic cycles (<xref ref-type="bibr" rid="ref22">Giraud and Poeppel, 2012</xref>; <xref ref-type="bibr" rid="ref16">Doelling et al., 2014</xref>), providing a biological bridge between abstract prosodic structure and gesture&#x2013;speech integration. Gestures appear to pattern with this same rhythmic system. For example, beat gestures frequently precede word onsets by approximately 100&#x202F;ms, effectively resetting listeners&#x2019; neural oscillations to sharpen temporal prediction and facilitate speech segmentation (<xref ref-type="bibr" rid="ref5">Biau and Soto-Faraco, 2015</xref>; <xref ref-type="bibr" rid="ref6">Biau et al., 2015</xref>). Together, these findings indicate that speech and gesture are not independent channels but coordinated expressions of a shared timing mechanism that underlies both perception and communication. Importantly, the present study integrates both suprasegmental and segmental targets within a single experimental design. By contrasting gesture types&#x2014;hand gestures associated with suprasegmental development and mouth gestures with segmental refinement&#x2014;it seeks to advance theoretical understanding of the rhythm&#x2013;articulation interface while also offering pedagogical guidance for optimizing gesture-based L2 pronunciation training.</p>
</sec>
<sec id="sec4">
<title>Purpose</title>
<p>The purpose of this study is to compare the effects of two gesture-based training methods&#x2014;manual gestures and articulatory gestures&#x2014;on distinct linguistic features of Japanese EFL learners&#x2019; pronunciation. We also consider how the integration of these methods may provide complementary benefits for suprasegmental and segmental development.</p>
<p>Japanese learners, whose first language is based on a mora-timed rhythm (<xref ref-type="bibr" rid="ref70">Port et al., 1987</xref>), tend to produce English with less durational variability across all vowels in words, mirroring the more regular rhythm of Japanese. This produces English that sounds overly even and less natural to native listeners, often giving the impression of a slowed overall speech pace. The unnaturalness arises from the absence of vowel reduction in unstressed syllables and cliticization, processes through which the stress-timed rhythm of English facilitates phrasing and accelerates speech tempo. Thus, if gestures are carefully designed to guide learners toward temporal alignment with the prosodic peaks of English, they may come to chunk phrases, accelerate speech tempo, and thereby facilitate the development of &#x2018;speed fluency&#x2019; (<xref ref-type="bibr" rid="ref52">Lennon, 1990</xref>; <xref ref-type="bibr" rid="ref15">de Jong, 2023</xref>), an area where Japanese speakers often face persistent difficulties (<xref ref-type="bibr" rid="ref80">Tajima and Port, 2004</xref>; <xref ref-type="bibr" rid="ref40">Kawase et al., 2024</xref>).</p>
<p>As for segmental skills, Japanese learners consistently struggle with the vowel /&#x00E6;/ (&#x2018;ash&#x2019;; low front vowel), which is absent from their native five-vowel system /a, i, u, e, o/, and is often substituted with /a/ (&#x2018;lower-case a&#x2019;; low central/back vowel) (<xref ref-type="bibr" rid="ref51">Lambacher et al., 2005</xref>). This substitution arises because these two vowels share tongue height and show overlap in F1, although they differ in tongue backness. English /&#x00E6;/ typically has F2 values around 1700&#x2013;2050&#x202F;Hz (<xref ref-type="bibr" rid="ref68">Peterson and Barney, 1952</xref>; <xref ref-type="bibr" rid="ref31">Hillenbrand et al., 1995</xref>), whereas Japanese /a/ F2 averages only 1,283&#x202F;Hz for males and 1,530&#x202F;Hz for females (<xref ref-type="bibr" rid="ref92">Yazawa and Kondo, 2019</xref>). These values place Japanese /a/ much closer to English /&#x028C;/ (&#x2018;wedge&#x2019;; mid back vowel) than to /&#x00E6;/. Orthographic conventions in the Japanese kana loanword system, clearly reflecting this merger (e.g., lab / love &#x2192; &#x30E9;&#x30D6;), seem to be a contributing factor to both perceptual and articulatory confusions. Given these challenges, vowel contrasts such as /&#x00E6;/ versus /a/ emerge as ideal targets for gesture-based training interventions, on par with the importance of speed fluency training.</p>
<p>This study investigates how manual (hand) and articulatory (mouth) gestures can facilitate the acquisition of specific phonological features in L2 learners, advancing an embodied, multimodal approach to pronunciation instruction. The research addresses two primary questions:</p><list list-type="roman-lower">
<list-item>
<p>How do different types of gesture training differentially influence segmental and suprasegmental aspects of L2 speech?</p>
</list-item>
<list-item>
<p>How does the timing of gesture training (hand-first vs. mouth-first) influence the trajectory of improvement across training phases?</p>
</list-item>
</list>
<p>We predict level-specific outcomes: learners trained with hand gestures will show greater improvement in suprasegmental fluency measured by speech rate, whereas learners trained with mouth gestures will demonstrate greater gains in segmental (vowel) accuracy measured by F2. Furthermore, we expect the timing of training to shape the trajectory of improvement: introducing hand training earlier will yield earlier fluency gains, while introducing mouth training earlier will yield earlier segmental gains.</p>
</sec>
</sec>
<sec sec-type="methods" id="sec5">
<title>Method</title>
<p>To test these predictions, we implemented a counterbalanced training design. Learners were divided into two groups that differed in the order of training: one group received hand training followed by mouth training (Hand&#x2013;Mouth, HM), and the other group received mouth training followed by hand training (Mouth&#x2013;Hand, MH). Training effects were assessed across three test phases: Pre, Mid, and Post. This design allowed us to examine not only the overall benefits of each type of gesture training (hand vs. mouth), but also whether the timing of training (earlier vs. later in the sequence) influenced the trajectory of improvement across phases.</p>
<sec id="sec6">
<title>Participants</title>
<p>Fifty Japanese university students from two classes of the English communication course participated in this study. Ten participants were excluded from the analysis because they failed to complete the entire experimental procedure. As a result, data from the remaining 40 participants (aged 18&#x2013;19) were included in the analysis. All participants reported no history of hearing or speech impairments, were informed about the experimental guidelines, and provided written informed consent prior to participation. This study received ethical approval from the institutional review board of Hiroshima University.</p>
<p>Before the pretest session, participants were assigned to two distinct experimental groups according to their class affiliations. Two training methods were implemented over a four-week period. Because students were taught in intact classes determined by the institution, we assigned the two training methods in reverse order across classes to counterbalance potential order effects, ensuring that any improvements could not be attributed solely to the method presented first (Note: While assigning intact classes to different training orders helped mitigate potential order effects, we acknowledge that this quasi-experimental design does not provide the same level of control as full randomization, and we note this as a limitation of the study).</p>
<p>Specifically, the group that first received Hand Gesture Training (HGT) (<xref ref-type="fig" rid="fig1">Figure 1</xref>) followed by Mouth Gesture Training (MGT) (<xref ref-type="fig" rid="fig2">Figure 2</xref>) was designated as the Hand-Mouth group (HM group, <italic>n</italic>&#x202F;=&#x202F;19), while the group that followed the reverse order was labeled as the Mouth-Hand group (MH group, <italic>n</italic>&#x202F;=&#x202F;21) (see <xref ref-type="table" rid="tab1">Table 1</xref>).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Hand-gesture-based training (HGT). A hand moves up and down in a circular motion, with the maximum downward extension synchronized with underlined words: &#x201C;Betty Botter bought some butter, but she said this butter&#x2019;s bitter, if I put it in my batter, &#x2026;&#x201D; One movement cycle roughly corresponds to a phonological phrase (see <xref ref-type="fig" rid="fig5">Figures 5</xref>, <xref ref-type="fig" rid="fig6">6</xref> for the details).</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four images showing the same person gesturing with one hand. The first and third images show a downward circular motion, indicated by an arrow, labeled "Downward." The second and fourth images show an upward circular motion, indicated by an arrow, labeled "Upward." The person wears a black shirt, and the background is plain.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Mouth-gesture-based training (MGT). When the start button is pressed on the web-based face mesh program (<xref ref-type="bibr" rid="ref77">Shitara et al., 2023</xref>), blue vertical and horizontal bars appear over the participant&#x2019;s mouth opening area to indicate whether the mouth corners are appropriately raised. Learners then practice the sentence displayed at the top of the screen. A green bar shows the progress of the task.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Computer interface showing a tongue twister, "Betty Botter bought some butter," with a progress bar labeled "1/5" in green. Below, a partially obscured woman's face with highlighted mouth and lines indicating facial analysis.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>TOEIC (L&#x0026;R) scores by group and sex.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Group</th>
<th align="left" valign="top">Sex</th>
<th align="center" valign="top">N</th>
<th align="center" valign="top">M</th>
<th align="center" valign="top">SD</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="3">HM group</td>
<td align="left" valign="top">Female</td>
<td align="center" valign="top">9</td>
<td align="center" valign="top">507.8</td>
<td align="center" valign="top">76.9</td>
</tr>
<tr>
<td align="left" valign="top">Male</td>
<td align="center" valign="top">10</td>
<td align="center" valign="top">496.5</td>
<td align="center" valign="top">86.7</td>
</tr>
<tr>
<td align="left" valign="top">All</td>
<td align="center" valign="top">19</td>
<td align="center" valign="top">501.8&#x002A;&#x002A;</td>
<td align="center" valign="top">80.1</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="3">MH group</td>
<td align="left" valign="top">Female</td>
<td align="center" valign="top">15</td>
<td align="center" valign="top">455.3</td>
<td align="center" valign="top">9.3</td>
</tr>
<tr>
<td align="left" valign="top">Male</td>
<td align="center" valign="top">6</td>
<td align="center" valign="top">305.8</td>
<td align="center" valign="top">70.0</td>
</tr>
<tr>
<td align="left" valign="top">All</td>
<td align="center" valign="top">21</td>
<td align="center" valign="top">412.6&#x002A;&#x002A;</td>
<td align="center" valign="top">110.0</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>M, mean; SD, standard deviation. An independent-samples <italic>t</italic>-test revealed a significant difference between HM group and MH Group in TOEIC scores, <italic>p</italic>&#x202F;=&#x202F;0.006 (&#x002A;&#x002A;).</p>
</table-wrap-foot>
</table-wrap>
<p>Regarding their English proficiency, all participants reported having learned English as a second language through school-based instruction and indicated no experience of long-term residence in an English-speaking country. As first-year students, none had received formal training in English pronunciation or taken courses in linguistics or phonetics. Although the HM and MH groups differed significantly in their TOEIC scores (<italic>p</italic>&#x202F;=&#x202F;0.006), the overall proficiency of the participants was relatively low (<italic>M</italic>&#x202F;=&#x202F;455.0&#x202F;&#x00B1;&#x202F;105.9), approximately corresponding to CEFR levels A2&#x2013;B1 and typical of Japanese first-year university students educated primarily through school-based instruction. Moreover, because the TOEIC (L&#x0026;R) primarily assesses receptive skills (listening and reading) and does not fully represent overall English proficiency&#x2014;particularly productive skills&#x2014;we did not consider it appropriate to classify participants into high- and low-proficiency groups on this basis. For practical reasons, they were instead assigned to two groups based on their institution-assigned class affiliations rather than their TOEIC scores.</p>
</sec>
<sec id="sec7">
<title>Speech materials</title>
<p>In both training methods, a tongue twister titled &#x201C;Betty Botter&#x201D; was employed as the speech material. This tongue twister was selected because it includes the target vowels /&#x00E6;/ in &#x201C;batter&#x201D; and /&#x028C;/ in &#x201C;butter,&#x201D; which consistently appear in the same phonetic environment, surrounded by two consonants /b/ and /t/. Furthermore, it consisted of 63 syllables, with each word containing no more than two syllables. Such a design ensured that the participants, who were EFL learners, would not be overwhelmed by the potential complexity. The complete text content of &#x201C;Betty Botter&#x201D; is as follows.</p>
<p>
<disp-quote>
<p>Betty Botter bought some butter;</p>
</disp-quote>
<disp-quote>
<p>&#x201C;But&#x201D; she said &#x201C;This butter&#x2019;s bitter!&#x201D;</p>
</disp-quote>
<disp-quote>
<p>If I put it in my batter,</p>
</disp-quote>
<disp-quote>
<p>it would make my batter bitter.</p>
</disp-quote>
<disp-quote>
<p>But a bit of better butter will make my batter better.</p>
</disp-quote>
<disp-quote>
<p>So &#x2019;twas better Betty Botter bought a bit of better butter.</p>
</disp-quote>
</p>
</sec>
<sec id="sec8">
<title>Gesture training procedures</title>
<p>To examine how different gesture modalities contribute to L2 pronunciation development, we implemented two training conditions that target distinct phonological levels. Hand Gesture Training (HGT) was designed to support suprasegmental development by aligning manual movements with rhythmic and stress patterns, thereby reinforcing learners&#x2019; awareness of timing and fluency. In contrast, Mouth Gesture Training (MGT) focused on segmental refinement by drawing learners&#x2019; attention to tongue and lip configurations that differentiate the difficult vowel contrasts /&#x00E6;/ and /&#x028C;/. The HGT, aimed at improving the fluency of English oral reading, and the MGT, aimed at enhancing pronunciation accuracy of the target sounds, were assigned to participants in two groups (the HM group and the MH group) with a reversed training sequence to ensure counterbalancing. Both conditions used the <italic>Betty Botter</italic> passage as practice material, enabling a direct comparison of how suprasegmental versus segmental gesture-based instruction facilitates L2 pronunciation learning.</p>
<p>For HGT, we used &#x2018;circular&#x2019; gestures. Circular gestures occur naturally in everyday speech to emphasize rhythm and prosody, and are particularly used in music performances such as choral music to enhance expression and the quality of the overall performance (<xref ref-type="bibr" rid="ref35">Jansson et al., 2021</xref>; <xref ref-type="bibr" rid="ref42">Kilpatrick, 2020</xref>). In the field of conducting, circular motions are a type of beat gesture, and form a foundational part of gestural vocabulary. These circular and rounded motions are commonly found in almost all types of beat patterns, such as a 4/4 beat pattern, and 2/4 beat pattern in conducting <italic>legato</italic>, or melodious, smooth and continuous melodic lines (<xref ref-type="fig" rid="fig3">Figure 3</xref>). Most notably within the Ilya Musin method, a highly respected school of conducting that emphasizes clarity and expressiveness through wrist-led movement (<xref ref-type="bibr" rid="ref65">Musin, 1967</xref>; <xref ref-type="bibr" rid="ref66">Ogrizovic-Ciric, 2009</xref>), the circular motions are also part of the beat gestures used in conducting single beats, compared to other common traditions of only beating up and down (<xref ref-type="fig" rid="fig4">Figure 4</xref>). In all circular gestures, consistent beat points were positioned at the onset of the hand&#x2019;s upward motion, reflecting conductors&#x2019; metaphorical mapping of spatial rise onto musical crescendo (<xref ref-type="bibr" rid="ref63">Meissl et al., 2022</xref>) or pitch rise (<xref ref-type="bibr" rid="ref64">Morett et al., 2022</xref>). Drawing from this rationale and tradition, we integrated the circular motions from the Musin method into the training protocol, as they provide a controlled yet naturalistic means of coordinating physical gestures with rhythmic patterns in speech.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Circular motions in legato music conducting. Circular motions can be found in the conducting patterns. 2-beat patterns are simplified from the 4-beat pattern. The dots represent the rhythmic point, which systematically corresponds to where the hand starts to rise.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating music conducting patterns. The 4-beat pattern at the top is divided into four sections with a central loop. Below, it splits into two separate 2-beat patterns with distinct shapes: one vertical with loops and one horizontal, resembling a figure-eight.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Circular motions for 1-beat pattern in the Musin-method. 1-beat patterns are decoupled from the 2-beat patterns in Musin-method. The dot representing the beat point remains at the same place in the gesture, which corresponds to where the hand starts to rise. The circular motion provides a continuous movement, compared to the universal pattern of beating up and down.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram showing music conducting patterns. The top left illustrates a 2-beat pattern in an oval loop, the top right shows a Musin-method 1-beat pattern in a figure-eight loop, and the center displays a circular 1-beat pattern. All converge to a downward arc labeled "Music Conducting (Universal) 1-beat pattern."</alt-text>
</graphic>
</fig>
<p>The details of HGT and MGT are given below.</p>
</sec>
<sec id="sec9">
<title>Hand gesture training (HGT)</title>
<p>In Week 1, learners received hand-gesture training to enhance awareness of stressed syllables, followed in Week 2 by training focused on phonological phrases (typically noun phrases and verb phrases). This progression from lower- to higher-level suprasegmental units is aligned with the principles of prosodic hierarchy.</p>
<p>WEEK 1 (Strokes at stressed syllable level):</p><list list-type="bullet">
<list-item>
<p>An instructor showed circular strokes at every stressed syllable; &#x2018;raising&#x2019; phase (x) aligned with every stressed syllable (e.g., <underline>Be</underline>tty <underline>Bo</underline>tter <underline>bought</underline> some <underline>bu</underline>tter), reinforcing their awareness of cycles of stressed syllables (<xref ref-type="fig" rid="fig5">Figure 5</xref>).</p>
</list-item>
<list-item>
<p>Students stood along walls and read aloud in unison while imitating the instructor&#x2019;s gestures. The instructor approached each student, and checked their hand shape, orientation and tension, and gave them verbal and haptic feedback.</p>
</list-item>
<list-item>
<p>Students and the instructor read aloud while performing circular motions in unison, about 5 times in total.</p>
</list-item>
</list>
<p>WEEK 2 (Strokes at phonological phrase level):</p><list list-type="bullet">
<list-item>
<p>An instructor showed circular strokes at every phonological phrase: &#x2018;raising&#x2019; phase (x) aligned with the first stressed syllables within phonological phrases (e.g., [<underline>Be</underline>tty Botter] [<underline>bought</underline> some butter]), reinforcing their awareness of cycles of phrases (<xref ref-type="fig" rid="fig6">Figure 6</xref>).</p>
</list-item>
<list-item>
<p>Students and the instructor read aloud while performing circular motions in unison, about 5 times in total. When the instructor noticed students&#x2019; erroneous hand motions, they were given verbal and haptic feedback.</p>
</list-item>
</list>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Stressed syllable alignment. The figure illustrates a sequence for a spoken phrase with smooth and continuous circular manual motion, with strokes placed on each stressed syllable, similarly to the raising phase of the 1-beat pattern in <xref ref-type="fig" rid="fig3">Figures 3</xref>, <xref ref-type="fig" rid="fig4">4</xref>. The <italic>raising</italic> phase (x) of the circular gesture coincides with stress (e.g., <italic><underline>Be</underline>tty <underline>Bo</underline>tter <underline>bought</underline> some <underline>bu</underline>tter</italic>), highlighting the cyclic rhythm of English.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Illustration depicting a sequence for a spoken phrase with smooth and continuous circular motion. The top sequence shows the phrase "Betty Botter bought", with circles and arrows illustrating the motion. The bottom sequence continues with "some butter; 'But,' she said", also with circles and arrows indicating flow. Each segment aligns with syllables in the phrases.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Phonological phrase alignment. The figure illustrates a sequence for a spoken phrase with smooth and continuous circular manual motion, with strokes placed on each phonological phrase, similarly to the raising phase of the 1-beat pattern in <xref ref-type="fig" rid="fig3">Figures 3</xref>, <xref ref-type="fig" rid="fig4">4</xref>. The <italic>raising</italic> phase (x) of the circular gesture coincides with the first stressed syllable within a phrase (e.g., <italic>[<underline>Be</underline>tty Botter] [<underline>bought</underline> some butter]</italic>), highlighting the cyclic rhythm of English.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating smooth and continuous circular motions with arrows, along with text from a tongue twister: "Betty Botter bought some butter;&#x201D; and &#x201C;But, she said 'This butter&#x2019;s bitter!'" Circular arrows accompany each word in rows.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec10">
<title>Mouth gesture training (MGT)</title>
<p>In Week 1, learners engaged in listening and imitation tasks to build awareness of articulatory differences between two vowels. In Week 2, they progressed to lingual and lip shaping drills with real-time visual feedback, moving from awareness-raising to self-modulated practice to support changes in articulatory behavior.</p>
<p>WEEK 1 (Listening and imitation):<list list-type="bullet">
<list-item>
<p>An instructor conducted a listening quiz contrasting /&#x00E6;/ and /&#x028C;/ using <italic>English Accent Coach</italic> (<xref ref-type="bibr" rid="ref83">Thomson, 2012</xref>).</p>
</list-item>
<list-item>
<p>An instructor showed <italic>Jolly Phonics</italic> videos (<xref ref-type="bibr" rid="ref38">Jolly Learning, 2013</xref>) illustrating <italic>ant</italic> (/&#x00E6;/) vs. <italic>umbrella</italic> (/&#x028C;/), and explained the articulatory differences between the two vowels:</p>
</list-item>
</list><list list-type="simple">
<list-item>
<p>-/&#x00E6;/: front tongue is visible from the front, and lip shape is reverse triangle.</p>
</list-item>
<list-item>
<p>-/&#x028C;/: tongue is positioned like Japanese /o/, but lip shape is similar to Japanese /a/.</p>
</list-item>
<list-item>
<p>Students imitated vowels, practiced in pairs, and checked each other&#x2019;s pronunciation, tongue and lip positions.</p>
</list-item>
</list></p>
<p>WEEK 2 (Face-mesh software training):<list list-type="bullet">
<list-item>
<p>An instructor introduced a web-based face mesh program (<xref ref-type="bibr" rid="ref77">Shitara et al., 2023</xref>).</p>
</list-item>
<list-item>
<p>Training emphasized:</p>
</list-item>
</list><list list-type="simple">
<list-item>
<p>-Open the mouth wider than Japanese /a/, and raise mouth corners for /&#x00E6;/.</p>
</list-item>
<list-item>
<p>-Advance the front of the tongue for /&#x00E6;/, and confirm the movement via visual feedback.</p>
</list-item>
<list-item>
<p>Students practiced individually with real-time webcam feedback and scoring. The instructor observed students&#x2019; activities and gave them oral feedback.</p>
</list-item>
</list></p>
</sec>
<sec id="sec11">
<title>Production testing session</title>
<p>The pronunciation testing session was conducted in a soundproof booth. During the session, participants were seated at a table equipped with a condenser microphone (Audio Technica AT2020) for recording and a monitor for displaying prompt words. The pronunciation data captured by the microphone were transmitted to a laboratory computer via an audio interface (Focusrite Scarlett Solo 2nd Gen) and recorded using Praat (<xref ref-type="bibr" rid="ref7">Boersma and Weenink, 1992&#x2013;2024</xref>) with a sampling frequency of 44,100 Hz.</p>
<p>In the paragraph-reading task, the tongue twister &#x201C;Betty Botter,&#x201D; which was used in the training session, was also employed in the pronunciation testing. During the testing, the tongue twister was displayed on the monitor, and participants were instructed to read it aloud one time. They were not instructed to use any hand gestures, allowing us to assess their performance independently of gesture use and thereby isolate the effects of the training. In the picture-naming task, three pictures for &#x201C;batter&#x201D; and three for &#x201C;butter&#x201D; were selected. These pictures were presented on the monitor once each in random order. Participants were required to name the item depicted in each picture, thereby determining whether it was &#x201C;batter&#x201D; or &#x201C;butter.&#x201D;</p>
</sec>
<sec id="sec12">
<title>Experimental procedure</title>
<p>The entire experimental procedure consisted of two training phases and three test sessions over a four-week period. The schedule of the experiment is given in <xref ref-type="fig" rid="fig7">Figure 7</xref>. Before the first training phase, participants completed a pre-test, which included a paragraph-reading task and a picture-naming task, lasting approximately 20&#x202F;min in total. Following the pre-test, the HM group underwent hand gesture training (HGT), while the MH group received mouth gesture training (MGT). Training sessions were conducted twice weekly in a classroom setting under instructor supervision, with each session lasting 20&#x202F;min. After completing four training sessions (totalling 80&#x202F;min), participants took a mid-test, which was identical to the pre-test. Participants then entered a second two-week training phase, in which the other type of training was implemented: the HM group received MGT, and the MH group underwent HGT. After completing another four training sessions, participants took a post-test, which was consistent with the pre-test and mid-test procedures.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Flowchart detailing an experimental design. After completing the pretest (paragraph-reading task and picture-naming task), participants were assigned to either the Hand-first (HM) group or the Mouth-first (MH) group. Each group received their initial gesture training during Weeks 1 and 2, followed by a mid-test. In Weeks 3 and 4, the HM group received mouth gesture training and the MH group received hand gesture training. The posttest was administered after the second phase of training.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart detailing an experimental design. "PRETEST" involves paragraph-reading and picture-naming tasks for twenty minutes. Participants are split into HM and MH groups. Weeks 1-2 involve HGT training for HM group and MGT training for MH group, both lasting twenty minutes, four times. "MIDTEST" repeats the initial tasks. In weeks 3-4, groups switch training types. "POSTTEST" again involves the original tasks for twenty minutes.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec13">
<title>Measurements and analyses</title>
<p>In the reading task, 120 tokens (40 participants &#x00D7; 3 training phases &#x00D7; 1 repetition) were collected. The total reading duration for each participant of the &#x201C;Betty Botter&#x201D; text was calculated, including all types of pauses and repetitions. Regarding pause types, only silent pauses were observed. An examination for filled pauses yielded none, likely because the speakers had already become familiar with the texts. In the picture-naming task, a total of 720 tokens (40 participants &#x00D7; 2 vowel stimuli &#x00D7; 3 training phases &#x00D7; 3 repetitions) were collected. In the words &#x201C;batter&#x201D; and &#x201C;butter,&#x201D; the second formant (F2) of the vowels /&#x00E6;/ and /&#x028C;/ was manually annotated and measured using Praat.</p>
</sec>
<sec id="sec14">
<title>Speech rate</title>
<p>The speech rate was calculated by dividing the fixed total of 63 syllables in the &#x201C;Betty Botter&#x201D; text by each participant&#x2019;s total reading duration (in seconds). The result is expressed as syllables per second (SPS).</p>
</sec>
<sec id="sec15">
<title>Second formant</title>
<p>When using Praat for F2 measurements, parameter settings were as follows: the number of formants was set to 5, and the window length was configured at 40 milliseconds. Furthermore, according to the frequency characteristics of participants&#x2019; voices, the formant ceiling value was fine-tuned within the range of 5,000&#x2013;6,000&#x202F;Hz to achieve optimal formant tracking.</p>
<p>The F2 values were measured at the midpoint of the intervals annotated for the vowels /&#x00E6;/ and /&#x028C;/ in the words &#x201C;batter&#x201D; and &#x201C;butter.&#x201D; The onset of the intervals was defined as the first appearance of a periodic waveform following the consonant /b/, and the offset was marked as the last point of the periodic waveform prior to the consonant /t/. The raw F2 values were normalized using the Lobanov method, as implemented in NORM (<xref ref-type="bibr" rid="ref82">Thomas and Kendall, 2007</xref>), based on each participant&#x2019;s F2 values measured three times under all conditions, to reduce individual differences due to physiological structure. Subsequently, the normalized F2 values obtained from the three measurements under all conditions were averaged and utilized for statistical analysis.</p>
</sec>
<sec id="sec16">
<title>Statistical analyses</title>
<p>For the speech rate measurements, a two-way mixed-design ANOVA was conducted to examine the effects of training phase (pre, mid, post) as one within-subjects factor, and training sequence (HM group vs. MH group) as one between-subjects factor. For the normalized F2 measurements, a three-way mixed-design ANOVA was conducted to examine the effects of vowel type (/&#x00E6;/, /&#x028C;/) and training phase (pre, mid, post) as two within-subjects factors, and training sequence (HM group vs. MH group) as one between-subjects factor.</p>
<p>Both statistical analyses were conducted under the assumption of sphericity, as confirmed by Mauchly&#x2019;s test (<italic>p</italic>&#x202F;&#x003E;&#x202F;0.05). Bonferroni correction was applied in <italic>post hoc</italic> pairwise comparisons to control the family-wise error rate. The significance level (<italic>&#x03B1;</italic>) was set to 0.05. All statistical analyses were performed using JASP (version 0.19.2).</p>
</sec>
</sec>
<sec sec-type="results" id="sec17">
<title>Result</title>
<p>As hypothesized, hand training facilitates suprasegmental improvement, as speech rate increased only following hand training in both groups (<xref ref-type="fig" rid="fig8">Figure 8</xref>). A two-way mixed ANOVA revealed a significant Group &#x00D7; Time interaction on speech rate (<italic>F</italic>(2,76)&#x202F;=&#x202F;9.28, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, &#x03B7;<sup>2</sup>&#x202F;=&#x202F;0.044). The main effect of Time and that of Group were also significant. Post-hoc tests revealed training- and Group-specific effects on the speech rate. For MH group, speech rate was higher in Post test than in Pre test (<italic>t</italic>&#x202F;=&#x202F;3.48, <italic>p</italic>&#x202F;=&#x202F;0.004, Cohen&#x2019;s d&#x202F;=&#x202F;0.69) or in Mid test (<italic>t</italic>&#x202F;=&#x202F;3.45, <italic>p</italic>&#x202F;=&#x202F;0.004, d&#x202F;=&#x202F;0.56), whereas it was not different between Pre and Mid tests (<italic>t</italic>&#x202F;=&#x202F;0.79, <italic>p</italic>&#x202F;=&#x202F;0.105, d&#x202F;=&#x202F;0.14). For HM group, speech rate was higher in Mid test than in Pre test (<italic>t</italic>&#x202F;=&#x202F;3.64, <italic>p</italic>&#x202F;=&#x202F;0.002, d&#x202F;=&#x202F;0.66) or in Post test (<italic>t</italic>&#x202F;=&#x202F;3.27, <italic>p</italic>&#x202F;=&#x202F;0.007, d&#x202F;=&#x202F;0.56). The difference between the Pre and Post tests was not significant (<italic>t</italic>&#x202F;=&#x202F;0.446, <italic>p</italic>&#x202F;=&#x202F;0.661, d&#x202F;=&#x202F;0.10). These results suggest that only Hand training improved the speech rate for both Groups.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>Comparison of speech rates of two groups. Speech rate was assessed as the number of syllables per second (SPS). MH group trained with mouth gesture in the first training period (between the Pre and Mid tests), and trained with hand gesture in the second training period (between the Mid and Post tests). &#x002A;<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g008.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Line graph comparing speech rates of two groups, MH and HM, labeled in blue and red. Speech rate is measured in SPS from zero to six over three time points: Pre, Mid, and Post. HM consistently shows higher rates. Asterisks indicate significant differences between measurements.</alt-text>
</graphic>
</fig>
<p>The mouth training improved F2 value of the vowels /&#x00E6;/ and /&#x028C;/ in the words &#x201C;batter&#x201D; and &#x201C;butter&#x201D; (<xref ref-type="fig" rid="fig9">Figure 9</xref>). The 3-way mixed ANOVA revealed a significant Vowel &#x00D7; Time interaction on Lobanov-normalized F2 value (<italic>F</italic>(2,76)&#x202F;=&#x202F;6.26, <italic>p</italic>&#x202F;=&#x202F;0.003, &#x03B7;<sup>2</sup>&#x202F;=&#x202F;0.038). The main effects of Time (<italic>F</italic>(2,76)&#x202F;=&#x202F;16.38, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, &#x03B7;<sup>2</sup>&#x202F;=&#x202F;0.13) and Vowel (<italic>F</italic>(2,76)&#x202F;=&#x202F;22.33, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, &#x03B7;<sup>2</sup>&#x202F;=&#x202F;0.098) were also statistically significant.</p>
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Comparison of F2 of two groups. Lobanov-normalized F2 values for the vowels /&#x00E6;/ and /&#x028C;/ in the words &#x201C;batter&#x201D; and &#x201C;butter.&#x201D; MH group trained with mouth gesture in the first training period (between the Pre and Mid tests), and trained with hand gesture in the second training period (between the Mid and Post tests). &#x002A;<italic>p</italic>&#x202F;&#x003C;&#x202F;0.05.</p>
</caption>
<graphic xlink:href="fcomm-10-1620465-g009.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Line graphs compare bAtter F2 and bUtter F2 values for MH and HM groups over Pre, Mid, and Post phases. Both graphs show data points with error bars. MH in blue and HM in red. Significant differences are marked with asterisks.</alt-text>
</graphic>
</fig>
<p>Note that we collapsed the training Group since none of the Group-related interactions nor the main effect of Group were significant. For the vowel /&#x028C;/ in &#x201C;butter,&#x201D; no significant differences were revealed by the post-hoc pairwise comparisons. For the vowel /&#x00E6;/ in &#x201C;batter,&#x201D; significant differences were observed between Pre and Post tests (<italic>t</italic>&#x202F;=&#x202F;5.49, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, d&#x202F;=&#x202F;1.53) and Mid and Post tests (<italic>t</italic>&#x202F;=&#x202F;4.09, <italic>p</italic>&#x202F;&#x003C;&#x202F;0.001, d&#x202F;=&#x202F;0.99).</p>
</sec>
<sec sec-type="discussion" id="sec18">
<title>Discussion</title>
<p>This study tested two predictions: first, that hand gesture training would enhance suprasegmental fluency while mouth gesture training would improve segmental accuracy; and second, that the timing of training would shape the trajectory of improvement. The results supported the first prediction: hand training facilitated speech-rate gains, and mouth training contributed to F2 improvements. The second prediction, however, was not supported, as no significant group-specific timing effects were observed. Each research question and its corresponding results are discussed in detail below.</p>
<p>We postulated research question (i) How do different types of gesture training differentially influence segmental and suprasegmental aspects of L2 speech? Regarding this question, we hypothesized that different types of motor training would selectively facilitate distinct aspects of speech production: hand training would enhance suprasegmental features (e.g., speech rate), while mouth training would enhance segmental features (e.g., vowel articulation, measured by F2). The findings suggest that hand- and mouth-gesture training exert selective influences on different levels of the phonological hierarchy. Mouth gestures facilitated segmental improvement, as shown by clearer /&#x00E6;/&#x2212;/&#x028C;/ contrasts, while hand gestures enhanced suprasegmental fluency, reducing disfluencies and promoting smoother prosodic flow. These results align with neurophysiological evidence that delta-band oscillations (supporting phrasal rhythm and fluency) entrain rapidly to external gestures, whereas theta-band oscillations (supporting syllable-level articulation) require finer motor control and more sustained practice. In this sense, the two gesture modalities appear to direct learners&#x2019; attention to different units of speech&#x2014;mouth gestures to vowel articulation within syllables, and hand gestures to phrasing and timing at higher prosodic levels.</p>
<p>Recurrent hand gestures intersect with structural, semantic, and embodied dimensions of our learning targets, providing multiple layers of support: they can highlight prosodic structure, convey metaphorical meaning, and engage sensorimotor systems that reinforce learning. Structurally, cyclic hand motions align with prosodic phrases, embodying the recursive organization of language; one-circle-per-phrase movement, adapted from music conducting, marks phrase boundaries and facilitates prosodic flow (<xref ref-type="bibr" rid="ref74">Selkirk, 1984</xref>; <xref ref-type="bibr" rid="ref29">Hauser et al., 2002</xref>; <xref ref-type="bibr" rid="ref60">Martins et al., 2017</xref>; <xref ref-type="bibr" rid="ref81">Temperley, 2022</xref>). Semantically, circular motion metaphorically represents smoothness and continuity, reinforcing the sense of flow in speech; such gestures also appear spontaneously in conversation, when speakers searching for words often employ cyclic hand movements (<xref ref-type="bibr" rid="ref48">Ladewig, 2011</xref>; <xref ref-type="bibr" rid="ref49">Ladewig and Bressem, 2013</xref>), consistent with conceptual metaphor theory (<xref ref-type="bibr" rid="ref50">Lakoff and Johnson, 1980</xref>; <xref ref-type="bibr" rid="ref37">Johnson and Lakoff, 2002</xref>). In embodied terms, the physical enactment of such gestures provides proprioceptive and visual feedback that integrates manual and oral movements, strengthening the sensorimotor foundations of fluent speech production (<xref ref-type="bibr" rid="ref1">Acton et al., 2013</xref>; <xref ref-type="bibr" rid="ref71">Pouw et al., 2021</xref>; <xref ref-type="bibr" rid="ref93">Yu et al., 2024</xref>). Together, these dimensions illustrate how recurrent gestures serve as a multimodal scaffold for speech learning.</p>
<p>Mouth-gesture training improved segmental accuracy, as learners enhanced the /&#x00E6;/&#x2212;/&#x028C;/ contrast by advancing the tongue and spreading the lips, supported by real-time biovisual feedback. Although such feedback on lingual gestures has rarely been reported, these results align with evidence that visual monitoring and self-correction can effectively refine segmental production (<xref ref-type="bibr" rid="ref20">Gick et al., 2008</xref>; <xref ref-type="bibr" rid="ref79">Suemitsu et al., 2015</xref>; <xref ref-type="bibr" rid="ref39">Katz and Mehta, 2015</xref>; <xref ref-type="bibr" rid="ref91">Yamane et al., 2025</xref>). In this way, mouth- and hand-gesture training yielded selective benefits: mouth training improved vowel contrast, while hand training enhanced fluency. These distinct effects reflect the prosodic hierarchy, where segmental and suprasegmental units are governed by separate rules. By directing learners&#x2019; attention to the relevant level, gesture-based training facilitated targeted gains in L2 pronunciation.</p>
<p>We also posed question (ii): How does the timing of gesture training (hand-first vs. mouth-first) influence the trajectory of improvement across training phases? We predicted that introducing hand training earlier would yield earlier fluency gains, while introducing mouth training earlier would yield earlier segmental gains. The results showed that gesture training overall facilitated improvement in both domains: hand gestures enhanced suprasegmental fluency, and mouth gestures contributed to segmental accuracy. However, no Group-related interactions were significant. This indicates that the order of training did not affect the trajectory of improvement. In other words, although different types of gesture training benefitted different aspects of pronunciation, their effectiveness was not dependent on whether they were introduced first or second.</p>
<p>Although no Group &#x00D7; Time interactions reached significance, two descriptive patterns warrant brief discussion, as they may inform future research on gesture-based training. First, although mouth training was expected to yield immediate F2 gains when introduced early, such improvements did not seem to emerge in the MH group. One possible contributing factor is learner proficiency: as noted in the Method section, the MH group had lower average TOEIC scores than the HM group. Descriptive data of raw subgroup means (<xref ref-type="app" rid="app1">Table A1</xref>) further suggest that subgroup variability, particularly among MH males, may have influenced the trajectory of vowel accuracy gains. Learners with lower proficiency may require more extensive practice and auditory&#x2013;motor feedback before segmental adjustments such as /&#x00E6;/ can be reliably achieved (<xref ref-type="bibr" rid="ref18">Flege et al., 1997</xref>; <xref ref-type="bibr" rid="ref2">Alshangiti and Evans, 2024</xref>). Nonetheless, given the absence of significant Group-related interactions, these observations remain exploratory and should be investigated in future research.</p>
<p>Second, the HM group appeared to show a descriptive decline in speech rate at posttest. One possible explanation is that learners may face attentional limits when balancing fluency and segmental refinement, resulting in a temporary trade-off. Although the Group-related interactions were not significant, these descriptive observations likewise remain exploratory and should be investigated in future research with larger and more homogeneous samples.</p>
<sec id="sec19">
<title>Pedagogical implications</title>
<p>The results point to the potential of gesture-based training as a targeted supplement to pronunciation instruction. Rather than treating pronunciation as a uniform skill, training can be designed to address suprasegmental and segmental development in complementary ways. Hand gestures may provide an accessible entry point for building fluency across proficiency levels, whereas mouth gestures may be more effective for learners who already possess the proficiency needed for fine-grained articulatory adjustments. The HM group&#x2019;s improvement in vowel accuracy may have benefited from the fluency gains fostered by hand gestures, a pattern consistent with the findings of <xref ref-type="bibr" rid="ref54">Li et al. (2023)</xref>, who showed that hand-based gesture training targeting suprasegmental features also led to improvements at the segmental level. Although the specific suprasegmental measures differed&#x2014;intonation in Li et al. and speech rate in the present study&#x2014;both sets of findings converge on the idea that suprasegmental-focused training can create favorable conditions for subsequent segmental improvement. More broadly, these findings echo evidence that suprasegmental-focused training often produces more noticeable gains in listener judgments than segmental drilling alone (<xref ref-type="bibr" rid="ref27">Gordon and Darcy, 2019</xref>).</p>
<p>Sequencing hand and mouth training could further maximize their complementary benefits. At the same time, descriptive patterns observed in this study suggest that learner variability may influence training outcomes, underscoring the need for flexible instructional designs. Taken together, these findings demonstrate how gesture-based training can differentially support segmental and suprasegmental development, offering a basis for more nuanced and effective approaches to L2 pronunciation pedagogy.</p>
</sec>
<sec id="sec20">
<title>Limitations</title>
<p>While the present findings offer important insights into the developmental relationship between suprasegmental and segmental features in adult L2 speech, several limitations should be acknowledged. This study focused on two dependent variables&#x2014;speech rate as a proxy for suprasegmental development and F2 values as a proxy for segmental articulation&#x2014;which, although informative, cannot capture the full range of prosodic and articulatory changes involved in pronunciation learning. Future research should therefore include additional measures such as intonation, pitch range, stress placement, syllable duration, or consonant clarity to provide a more comprehensive picture of learning trajectories. Moreover, the training types (hand versus mouth gestures) were operationalized as broad modalities, yet the cognitive load, motor demands, and degree of linguistic integration likely varied across participants. Further studies could explore how differences in task complexity and attentional demands influence outcomes, helping to clarify the mechanisms that support learning. Finally, the relatively small sample size may have limited statistical power, possibly obscuring interaction effects or moderating influences. Addressing these issues will be essential for advancing our understanding of how gesture-based training supports L2 pronunciation development.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="sec21">
<title>Conclusion</title>
<p>This study provides evidence that gesture-based training can differentially support suprasegmental and segmental aspects of L2 pronunciation. Hand gestures facilitated gains in fluency, while mouth gestures contributed to improvements in vowel articulation, as reflected in F2 values. Although no group-specific timing effects were observed, the overall pattern suggests that hand and mouth gestures provide distinct yet complementary benefits.</p>
<p>From a cognitive perspective, these findings are consistent with the view that speech and gesture form an integrated system in which multiple rhythmic and articulatory processes jointly shape language production. The distinct effects of hand and mouth gestures suggest that prosodic framing and articulatory refinement engage partly independent yet coordinated sensorimotor routines.</p>
<p>Pedagogically, when viewed through an interactional lens, gesture-based instruction can heighten learners&#x2019; awareness of fluency as an integrative outcome&#x2014;emerging from the coordination of intrapersonal (mind&#x2013;body) and interpersonal (speaker&#x2013;interlocutor) processes. This view further reinforces the notion of inter-fluency (<xref ref-type="bibr" rid="ref45">Kosmala, 2024</xref>), which conceptualizes fluency as a multidimensional phenomenon encompassing speech, interaction, and gesture. Through haptic feedback from manual and lingual&#x2013;labial gestures, learners can monitor and adjust their articulatory movements to maximize visible cues that support mutual intelligibility. Such embodied and socially attuned adjustments help synchronize gesture, speech, and facial expression, promoting fluency as a jointly managed, multimodal skill that integrates precision, rhythm, and interactive alignment. Descriptive patterns further suggest that individual differences in learners&#x2019; sensitivity to gesture or articulatory feedback may influence training effectiveness, highlighting a valuable direction for future research.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec22">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="sec23">
<title>Ethics statement</title>
<p>The studies involving humans were approved by the Research Ethics Review Board, Graduate School of Humanities and Social Sciences. The studies were conducted in accordance with the local legislation and institutional requirements. The participants provided their written informed consent to participate in this study. Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec sec-type="author-contributions" id="sec24">
<title>Author contributions</title>
<p>NY: Investigation, Supervision, Project administration, Writing &#x2013; review &#x0026; editing, Funding acquisition, Writing &#x2013; original draft, Methodology, Validation, Visualization, Formal analysis, Conceptualization, Data curation, Resources. MS: Conceptualization, Resources, Validation, Funding acquisition, Writing &#x2013; review &#x0026; editing, Methodology, Visualization, Writing &#x2013; original draft. XT: Supervision, Investigation, Data curation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Visualization. AC: Conceptualization, Resources, Writing &#x2013; original draft, Visualization, Funding acquisition, Methodology, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec26">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec27">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec28">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Acton</surname><given-names>W.</given-names></name> <name><surname>Baker</surname><given-names>A.</given-names></name> <name><surname>Burri</surname><given-names>M</given-names></name></person-group>. and <person-group person-group-type="author"><name><surname>Teaman</surname><given-names>B.</given-names></name></person-group>, (<year>2013</year>). &#x201C;<article-title>Preliminaries to haptic-integrated pronunciation instruction</article-title>.&#x201D; In: <person-group person-group-type="editor"><name><surname>Levis</surname><given-names>J.</given-names></name> <name><surname>LeVelle</surname><given-names>K.</given-names></name></person-group>, (eds.), <conf-name>Proceedings of the 4th Pronunciation in Second Language Learning and Teaching Conference, August 2012</conf-name>, <publisher-loc>Ames, IA</publisher-loc>: <publisher-name>Iowa State University</publisher-name>, pp. <fpage>234</fpage>&#x2013;<lpage>244</lpage>.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alshangiti</surname><given-names>W.</given-names></name> <name><surname>Evans</surname><given-names>B.</given-names></name></person-group> (<year>2024</year>). <article-title>Learning English vowels: the effects of different phonetic training modes on Arabic learners' production and perception</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>156</volume>, <fpage>284</fpage>&#x2013;<lpage>298</lpage>. doi: <pub-id pub-id-type="doi">10.1121/10.0026451</pub-id>, <pub-id pub-id-type="pmid">38984810</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Antol&#x00ED;k</surname><given-names>T. K.</given-names></name> <name><surname>Pillot-Loiseau</surname><given-names>C.</given-names></name> <name><surname>Kamiyama</surname><given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>The effectiveness of real-time ultrasound visual feedback on tongue movements in L2 pronunciation training: Japanese learners&#x2019; progress on the French vowel contrast/y/&#x2212;/u/</article-title>. <source>J. Second Lang. Pronunc.</source> <volume>5</volume>, <fpage>72</fpage>&#x2013;<lpage>97</lpage>. doi: <pub-id pub-id-type="doi">10.1075/jslp.16022.ant</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Barsalou</surname><given-names>L. W.</given-names></name></person-group> (<year>2008</year>). <article-title>Grounded cognition</article-title>. <source>Annu. Rev. Psychol.</source> <volume>59</volume>, <fpage>617</fpage>&#x2013;<lpage>645</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev.psych.59.103006.093639</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Biau</surname><given-names>E.</given-names></name> <name><surname>Soto-Faraco</surname><given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>Synchronization by the hand: the sight of gestures modulates low-frequency activity in brain responses to continuous speech</article-title>. <source>Front. Hum. Neurosci.</source> <volume>9</volume>:<fpage>527</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnhum.2015.00527</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Biau</surname><given-names>E.</given-names></name> <name><surname>Torralba</surname><given-names>M.</given-names></name> <name><surname>Fuentemilla</surname><given-names>L.</given-names></name> <name><surname>de Diego-Balaguer</surname><given-names>R.</given-names></name> <name><surname>Soto-Faraco</surname><given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>Speaker&#x2019;s hand gestures modulate speech perception through phase resetting of ongoing neural oscillations</article-title>. <source>Cereb. Cortex</source> <volume>26</volume>, <fpage>246</fpage>&#x2013;<lpage>256</lpage>. doi: <pub-id pub-id-type="doi">10.1093/cercor/bhu207</pub-id>, <pub-id pub-id-type="pmid">25217468</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Boersma</surname><given-names>P.</given-names></name> <name><surname>Weenink</surname><given-names>D.</given-names></name></person-group>, (<year>1992&#x2013;2024</year>). Praat: doing phonetics by computer [computer program]. Available online at: <ext-link xlink:href="http://www.praat.org" ext-link-type="uri">http://www.praat.org</ext-link>. Version 6.4.22 (Accessed October 5, 2024).</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Browman</surname><given-names>C.</given-names></name> <name><surname>Goldstein</surname><given-names>L.</given-names></name></person-group> (<year>1986</year>). <article-title>Towards an articulatory phonology</article-title>. <source>Phonology</source> <volume>3</volume>, <fpage>219</fpage>&#x2013;<lpage>252</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S0952675700000658</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Browman</surname><given-names>C.</given-names></name> <name><surname>Goldstein</surname><given-names>L.</given-names></name></person-group> (<year>1989</year>). <article-title>Articulatory gestures as phonological units</article-title>. <source>Phonology</source> <volume>6</volume>, <fpage>201</fpage>&#x2013;<lpage>251</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S0952675700001019</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cavicchio</surname><given-names>F.</given-names></name> <name><surname>Bus&#x00E0;</surname><given-names>M. G.</given-names></name></person-group> (<year>2023</year>). <article-title>Lending a hand to speech: gestures help fluency and increase pitch in second language speakers</article-title>. <source>Lang. Interact. Acquis.</source> <volume>14</volume>, <fpage>218</fpage>&#x2013;<lpage>246</lpage>. doi: <pub-id pub-id-type="doi">10.1075/lia.22023.cav</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cummins</surname><given-names>F.</given-names></name> <name><surname>Port</surname><given-names>R.</given-names></name></person-group> (<year>1998</year>). <article-title>Rhythmic constraints on stress timing in English</article-title>. <source>J. Phon.</source> <volume>26</volume>, <fpage>145</fpage>&#x2013;<lpage>171</lpage>. doi: <pub-id pub-id-type="doi">10.1006/jpho.1998.0070</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>de La Cruz-Pav&#x00ED;a</surname><given-names>I.</given-names></name> <name><surname>Gervain</surname><given-names>J.</given-names></name> <name><surname>Vatikiotis-Bateson</surname><given-names>E.</given-names></name> <name><surname>Werker</surname><given-names>J. F.</given-names></name></person-group> (<year>2020</year>). <article-title>Coverbal speech gestures signal phrase boundaries: a production study of Japanese and English infant- and adult-directed speech</article-title>. <source>Lang. Acquis.</source> <volume>27</volume>, <fpage>160</fpage>&#x2013;<lpage>186</lpage>. doi: <pub-id pub-id-type="doi">10.1080/10489223.2019.1659276</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>de Jong</surname><given-names>N. H.</given-names></name></person-group> (<year>2023</year>). <article-title>Fluency in speaking as a dynamic construct</article-title>. <source>Lang. Teach. Res. Q.</source> <volume>37</volume>, <fpage>179</fpage>&#x2013;<lpage>187</lpage>. doi: <pub-id pub-id-type="doi">10.32038/ltrq.2023.37.09</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Doelling</surname><given-names>K. B.</given-names></name> <name><surname>Arnal</surname><given-names>L. H.</given-names></name> <name><surname>Ghitza</surname><given-names>O.</given-names></name> <name><surname>Poeppel</surname><given-names>D.</given-names></name></person-group> (<year>2014</year>). <article-title>Acoustic landmarks drive delta&#x2013;theta oscillations to enable speech comprehension by facilitating perceptual parsing</article-title>. <source>NeuroImage</source> <volume>85</volume>, <fpage>761</fpage>&#x2013;<lpage>768</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neuroimage.2013.06.035</pub-id>, <pub-id pub-id-type="pmid">23791839</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Flege</surname><given-names>J.</given-names></name> <name><surname>Bohn</surname><given-names>O.</given-names></name> <name><surname>Jang</surname><given-names>S.</given-names></name></person-group> (<year>1997</year>). <article-title>Effects of experience on non-native speakers' production and perception of English vowels</article-title>. <source>J. Phon.</source> <volume>25</volume>, <fpage>437</fpage>&#x2013;<lpage>470</lpage>. doi: <pub-id pub-id-type="doi">10.1006/JPHO.1997.0052</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gentilucci</surname><given-names>M.</given-names></name> <name><surname>Volta</surname><given-names>R.</given-names></name></person-group> (<year>2008</year>). <article-title>Spoken language and arm gestures are controlled by the same motor control system</article-title>. <source>Q. J. Exp. Psychol.</source> <volume>61</volume>, <fpage>944</fpage>&#x2013;<lpage>957</lpage>. doi: <pub-id pub-id-type="doi">10.1080/17470210701625683</pub-id>, <pub-id pub-id-type="pmid">18470824</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gick</surname><given-names>B.</given-names></name> <name><surname>Bernhardt</surname><given-names>B. M.</given-names></name> <name><surname>Bacsfalvi</surname><given-names>P.</given-names></name> <name><surname>Wilson</surname><given-names>I.</given-names></name></person-group> (<year>2008</year>). &#x201C;<article-title>11. Ultrasound imaging applications in second language acquisition</article-title>&#x201D; In: <person-group person-group-type="editor"><name><surname>Hansen Edwards</surname><given-names>J. G.</given-names></name> <name><surname>Zampini</surname><given-names>M. L.</given-names></name></person-group>, (eds.) <source>Phonology and second language acquisition</source> (<publisher-name>John Benjamins Publishing Company</publisher-name>), <fpage>309</fpage>&#x2013;<lpage>322</lpage>.</mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gick</surname><given-names>B.</given-names></name> <name><surname>Wilson</surname><given-names>I.</given-names></name> <name><surname>Koch</surname><given-names>K.</given-names></name> <name><surname>Cook</surname><given-names>C.</given-names></name></person-group> (<year>2004</year>). <article-title>Language-specific articulatory settings: evidence from inter-utterance rest position</article-title>. <source>Phonetica</source> <volume>61</volume>, <fpage>220</fpage>&#x2013;<lpage>233</lpage>. doi: <pub-id pub-id-type="doi">10.1159/000084159</pub-id>, <pub-id pub-id-type="pmid">15824488</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Giraud</surname><given-names>A. L.</given-names></name> <name><surname>Poeppel</surname><given-names>D.</given-names></name></person-group> (<year>2012</year>). &#x201C;<article-title>Speech perception from a neurophysiological perspective</article-title>&#x201D; In: <person-group person-group-type="editor"><name><surname>Poeppel</surname><given-names>D.</given-names></name> <name><surname>Overath</surname><given-names>T.</given-names></name> <name><surname>Popper</surname><given-names>A. N.</given-names></name> <name><surname>Fay</surname><given-names>R. R.</given-names></name></person-group>, eds. <source>The human auditory cortex</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Springer New York</publisher-name>), <fpage>225</fpage>&#x2013;<lpage>260</lpage>.</mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gluhareva</surname><given-names>D.</given-names></name> <name><surname>Prieto</surname><given-names>P.</given-names></name></person-group> (<year>2017</year>). <article-title>Training with rhythmic beat gestures benefits L2 pronunciation in discourse-demanding situations</article-title>. <source>Lang. Teach. Res.</source> <volume>21</volume>, <fpage>609</fpage>&#x2013;<lpage>631</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1362168816651463</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goldin-Meadow</surname><given-names>S.</given-names></name> <name><surname>Alibali</surname><given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>Gesture's role in speaking, learning, and creating language</article-title>. <source>Annu. Rev. Psychol.</source> <volume>64</volume>, <fpage>257</fpage>&#x2013;<lpage>283</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev-psych-113011-143802</pub-id>, <pub-id pub-id-type="pmid">22830562</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Goldsmith</surname><given-names>J. A.</given-names></name></person-group> (<year>1976</year>). <source>Autosegmental phonology</source> <comment>(Doctoral Dissertation)</comment>: <publisher-name>Massachusetts Institute of Technology</publisher-name>, <publisher-loc>Massachusetts</publisher-loc>.</mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goodwin</surname><given-names>C.</given-names></name></person-group> (<year>2007</year>). <article-title>Participation, stance and affect in the organization of activities</article-title>. <source>Discourse Soc.</source> <volume>18</volume>, <fpage>53</fpage>&#x2013;<lpage>73</lpage>. doi: <pub-id pub-id-type="doi">10.1177/0957926507069457</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Gordon</surname><given-names>J.</given-names></name> <name><surname>Darcy</surname><given-names>I.</given-names></name></person-group>, (<year>2019</year>). <article-title>Teaching segmentals vs. suprasegmentals: different effects of explicit instruction on comprehensibility</article-title>. In: <person-group person-group-type="editor"><name><surname>Levis</surname><given-names>J.</given-names></name> <name><surname>Nagle</surname><given-names>C.</given-names></name> <name><surname>Todey</surname><given-names>E.</given-names></name></person-group>, eds. <conf-name>Proceedings of the 10th pronunciation in second language learning and teaching conference, Ames, IA, September 2018</conf-name>. <publisher-loc>Ames, IA</publisher-loc>: <publisher-name>Iowa State University</publisher-name>, pp. <fpage>116</fpage>&#x2013;<lpage>126</lpage>.</mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hauser</surname><given-names>M.</given-names></name> <name><surname>Chomsky</surname><given-names>N.</given-names></name> <name><surname>Fitch</surname><given-names>W.</given-names></name></person-group> (<year>2002</year>). <article-title>The faculty of language: what is it, who has it, and how did it evolve?</article-title> <source>Science</source> <volume>298</volume>, <fpage>1569</fpage>&#x2013;<lpage>1579</lpage>. doi: <pub-id pub-id-type="doi">10.1126/science.298.5598.1569</pub-id>, <pub-id pub-id-type="pmid">12446899</pub-id></mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Hayes</surname><given-names>B.</given-names></name></person-group> (<year>1995</year>). <source>Metrical stress theory: principles and case studies</source>. <publisher-loc>Chicago</publisher-loc>: <publisher-name>University of Chicago Press</publisher-name>.</mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hillenbrand</surname><given-names>J.</given-names></name> <name><surname>Getty</surname><given-names>L. A.</given-names></name> <name><surname>Clark</surname><given-names>M. J.</given-names></name> <name><surname>Wheeler</surname><given-names>K.</given-names></name></person-group> (<year>1995</year>). <article-title>Acoustic characteristics of American English vowels</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>97</volume>, <fpage>3099</fpage>&#x2013;<lpage>3111</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.411872</pub-id>, <pub-id pub-id-type="pmid">7759650</pub-id></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hirata</surname><given-names>Y.</given-names></name> <name><surname>Kelly</surname><given-names>S. D.</given-names></name></person-group> (<year>2010</year>). <article-title>Effects of lips and hands on auditory learning of second-language speech sounds</article-title>. <source>J. Speech Lang. Hear. Res.</source> <volume>53</volume>, <fpage>298</fpage>&#x2013;<lpage>310</lpage>. doi: <pub-id pub-id-type="doi">10.1044/1092-4388(2009/08-0243)</pub-id>, <pub-id pub-id-type="pmid">20220023</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hirata</surname><given-names>Y.</given-names></name> <name><surname>Kelly</surname><given-names>S. D.</given-names></name> <name><surname>Huang</surname><given-names>J.</given-names></name> <name><surname>Manansala</surname><given-names>M.</given-names></name></person-group> (<year>2014</year>). <article-title>Effects of hand gestures on auditory learning of second-language vowel length contrasts</article-title>. <source>J. Speech Lang. Hear. Res.</source> <volume>57</volume>, <fpage>2090</fpage>&#x2013;<lpage>2101</lpage>. doi: <pub-id pub-id-type="doi">10.1044/2014_JSLHR-S-14-0049</pub-id>, <pub-id pub-id-type="pmid">25088127</pub-id></mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jansson</surname><given-names>D.</given-names></name> <name><surname>Balsnes</surname><given-names>A.</given-names></name> <name><surname>Durrant</surname><given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>The gesture enigma: reconciling the prominence and insignificance of choral conductor gestures</article-title>. <source>Res. Stud. Music Educ.</source> <volume>44</volume>, <fpage>509</fpage>&#x2013;<lpage>526</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1321103X211031778</pub-id></mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Johnson</surname><given-names>M.</given-names></name> <name><surname>Lakoff</surname><given-names>G.</given-names></name></person-group> (<year>2002</year>). <article-title>Why cognitive linguistics requires embodied realism</article-title>. <source>Cogn. Linguist.</source> <volume>13</volume>, <fpage>245</fpage>&#x2013;<lpage>264</lpage>. doi: <pub-id pub-id-type="doi">10.1515/cogl.2002.016</pub-id></mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="other"><person-group person-group-type="author"><collab id="coll1">Jolly Learning</collab></person-group>, (<year>2013</year>). Jolly phonics letter sounds (American English). Jolly learning &#x2013; The home of jolly phonics. Available online at: <ext-link xlink:href="https://youtu.be/3LD7m3luv0Y?si=PQ-HddN7ygWsMCk6" ext-link-type="uri">https://youtu.be/3LD7m3luv0Y?si=PQ-HddN7ygWsMCk6</ext-link> (Accessed September 6, 2025).</mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Katz</surname><given-names>W. F.</given-names></name> <name><surname>Mehta</surname><given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>Visual feedback of tongue movement for novel speech sound learning</article-title>. <source>Front. Hum. Neurosci.</source> <volume>9</volume>:<fpage>612</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnhum.2015.00612</pub-id></mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kawase</surname><given-names>S.</given-names></name> <name><surname>Davis</surname><given-names>C.</given-names></name> <name><surname>Kim</surname><given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Impact of Japanese L1 rhythm on English L2 speech</article-title>. <source>Lang. Speech</source> <volume>68</volume>, <fpage>118</fpage>&#x2013;<lpage>140</lpage>. doi: <pub-id pub-id-type="doi">10.1177/00238309241247210</pub-id>, <pub-id pub-id-type="pmid">38693793</pub-id></mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kendon</surname><given-names>A.</given-names></name></person-group> (<year>2004</year>). <source>Gesture: visible action as utterance</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>.</mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kilpatrick</surname><given-names>C. E.</given-names></name></person-group> (<year>2020</year>). <article-title>Movement, gesture, and singing: a review of literature</article-title>. <source>Update Appl. Res. Music Educ.</source> <volume>38</volume>, <fpage>29</fpage>&#x2013;<lpage>37</lpage>. doi: <pub-id pub-id-type="doi">10.1177/8755123320908612</pub-id></mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kita</surname><given-names>S.</given-names></name> <name><surname>Alibali</surname><given-names>M.</given-names></name> <name><surname>Chu</surname><given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>How do gestures influence thinking and speaking? The gesture-for-conceptualization hypothesis</article-title>. <source>Psychol. Rev.</source> <volume>124</volume>, <fpage>245</fpage>&#x2013;<lpage>266</lpage>. doi: <pub-id pub-id-type="doi">10.1037/rev0000059</pub-id>, <pub-id pub-id-type="pmid">28240923</pub-id></mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kocjan&#x010D;i&#x010D;</surname><given-names>T.</given-names></name> <name><surname>Bo&#x0159;il</surname><given-names>T.</given-names></name> <name><surname>Hofmann</surname><given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Acoustic and articulatory visual feedback in classroom L2 vowel remediation</article-title>. <source>Lang. Speech</source>:<fpage>00238309231223736</fpage>. doi: <pub-id pub-id-type="doi">10.1177/00238309231223736</pub-id>, <pub-id pub-id-type="pmid">38693788</pub-id></mixed-citation></ref>
<ref id="ref1001"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kosmala</surname><given-names>L.</given-names></name> <name><surname>Horgues</surname><given-names>C.</given-names></name> <name><surname>Scheuer</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). &#x2018;<article-title>A multimodal study of how pronunciation-induced communication breakdowns are managed during tandem interactions</article-title>&#x2019; <source>Research in Language</source> <volume>21</volume>, <fpage>291</fpage>&#x2013;<lpage>312</lpage>. doi: <pub-id pub-id-type="doi">10.18778/1731-7533.21.3.05</pub-id>, <pub-id pub-id-type="pmid">38693788</pub-id></mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kosmala</surname><given-names>L.</given-names></name></person-group> (<year>2024</year>). <source>Beyond disfluency: the interplay of speech, gesture, and interaction</source>. <publisher-loc>Amsterdam / Philadelphia</publisher-loc>: <publisher-name>John Benjamins</publisher-name>.</mixed-citation></ref>
<ref id="ref46"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Kubozono</surname><given-names>H.</given-names></name></person-group> (<year>2017</year>). &#x201C;<article-title>Mora and syllable</article-title>&#x201D; in <source>The handbook of Japanese linguistics</source>, <publisher-loc>Oxford</publisher-loc>: <publisher-name>Blackwell</publisher-name>. <fpage>31</fpage>&#x2013;<lpage>61</lpage>. doi: <pub-id pub-id-type="doi">10.1002/9781405166225</pub-id></mixed-citation></ref>
<ref id="ref47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kushch</surname><given-names>O.</given-names></name> <name><surname>Igualada</surname><given-names>A.</given-names></name> <name><surname>Prieto</surname><given-names>P.</given-names></name></person-group> (<year>2018</year>). <article-title>Prominence in speech and gesture favour second language novel word learning</article-title>. <source>Lang. Cogn. Neurosci.</source> <volume>33</volume>, <fpage>992</fpage>&#x2013;<lpage>1004</lpage>. doi: <pub-id pub-id-type="doi">10.1080/23273798.2018.1435894</pub-id></mixed-citation></ref>
<ref id="ref48"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Ladewig</surname><given-names>S. H.</given-names></name></person-group> (<year>2011</year>). &#x201C;<article-title>Putting the cyclic gesture on a cognitive basis</article-title>&#x201D; In: <person-group person-group-type="editor"><name><surname>Lemmens</surname><given-names>M.</given-names></name></person-group>, ed.  <source>CogniTextes. Revue de l&#x2019;Association fran&#x00E7;aise de linguistique cognitive</source>. <publisher-loc>Villeneuve-d&#x2019;Ascq, France</publisher-loc>.</mixed-citation></ref>
<ref id="ref49"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ladewig</surname><given-names>S. H.</given-names></name> <name><surname>Bressem</surname><given-names>J.</given-names></name></person-group> (<year>2013</year>). <article-title>New insights into the medium hand: discovering recurrent structures in gestures</article-title>. <source>Semiotica</source> <volume>2013</volume>, <fpage>203</fpage>&#x2013;<lpage>231</lpage>. doi: <pub-id pub-id-type="doi">10.1515/sem-2013-0088</pub-id></mixed-citation></ref>
<ref id="ref50"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lakoff</surname><given-names>G.</given-names></name> <name><surname>Johnson</surname><given-names>M.</given-names></name></person-group> (<year>1980</year>). <article-title>The metaphorical structure of the human conceptual system</article-title>. <source>Cogn. Sci.</source> <volume>4</volume>, <fpage>195</fpage>&#x2013;<lpage>208</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0364-0213(80)80017-6</pub-id></mixed-citation></ref>
<ref id="ref51"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lambacher</surname><given-names>S. G.</given-names></name> <name><surname>Martens</surname><given-names>W. L.</given-names></name> <name><surname>Kakehi</surname><given-names>K.</given-names></name> <name><surname>Marasinghe</surname><given-names>C. A.</given-names></name> <name><surname>Molholt</surname><given-names>G.</given-names></name></person-group> (<year>2005</year>). <article-title>The effects of identification training on the identification and production of American English vowels by native speakers of Japanese</article-title>. <source>Appl. Psycholinguist.</source> <volume>26</volume>, <fpage>227</fpage>&#x2013;<lpage>247</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S0142716405050150</pub-id></mixed-citation></ref>
<ref id="ref52"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lennon</surname><given-names>P.</given-names></name></person-group> (<year>1990</year>). <article-title>Investigating fluency in EFL: a quantitative approach</article-title>. <source>Lang. Learn.</source> <volume>40</volume>, <fpage>387</fpage>&#x2013;<lpage>417</lpage>. doi: <pub-id pub-id-type="doi">10.1111/j.1467-1770.1990.tb00669.x</pub-id></mixed-citation></ref>
<ref id="ref53"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Leonard</surname><given-names>T.</given-names></name> <name><surname>Cummins</surname><given-names>F.</given-names></name></person-group> (<year>2011</year>). <article-title>The temporal relation between beat gestures and speech</article-title>. <source>Lang. Cogn. Process.</source> <volume>26</volume>, <fpage>1457</fpage>&#x2013;<lpage>1471</lpage>. doi: <pub-id pub-id-type="doi">10.1080/01690965.2010.500218</pub-id></mixed-citation></ref>
<ref id="ref54"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>P.</given-names></name> <name><surname>Baills</surname><given-names>F.</given-names></name> <name><surname>Baqu&#x00E9;</surname><given-names>L.</given-names></name> <name><surname>Prieto</surname><given-names>P.</given-names></name></person-group> (<year>2023</year>). <article-title>The effectiveness of embodied prosodic training in L2 accentedness and vowel accuracy</article-title>. <source>Second. Lang. Res.</source> <volume>39</volume>, <fpage>1077</fpage>&#x2013;<lpage>1105</lpage>. doi: <pub-id pub-id-type="doi">10.1177/02676583221124075</pub-id></mixed-citation></ref>
<ref id="ref55"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>P.</given-names></name> <name><surname>Baills</surname><given-names>F.</given-names></name> <name><surname>Prieto</surname><given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>Observing and producing durational hand gestures facilitates the pronunciation of novel vowel-length contrasts</article-title>. <source>Stud. Second. Lang. Acquis.</source> <volume>42</volume>, <fpage>1015</fpage>&#x2013;<lpage>1039</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S0272263120000054</pub-id></mixed-citation></ref>
<ref id="ref56"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>P.</given-names></name> <name><surname>Xi</surname><given-names>X.</given-names></name> <name><surname>Baills</surname><given-names>F.</given-names></name> <name><surname>Prieto</surname><given-names>P.</given-names></name></person-group> (<year>2021</year>). <article-title>Training non-native aspirated plosives with hand gestures: learners&#x2019; gesture performance matters</article-title>. <source>Lang. Cogn. Neurosci.</source> <volume>36</volume>, <fpage>1313</fpage>&#x2013;<lpage>1328</lpage>. doi: <pub-id pub-id-type="doi">10.1080/23273798.2021.1937663</pub-id></mixed-citation></ref>
<ref id="ref57"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liberman</surname><given-names>M.</given-names></name> <name><surname>Prince</surname><given-names>A.</given-names></name></person-group> (<year>1977</year>). <article-title>On stress and linguistic rhythm</article-title>. <source>Linguist. Inq.</source> <volume>8</volume>, <fpage>249</fpage>&#x2013;<lpage>336</lpage>. doi: <pub-id pub-id-type="doi">10.2307/4177987</pub-id></mixed-citation></ref>
<ref id="ref58"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Llanes-Coromina</surname><given-names>J.</given-names></name> <name><surname>Prieto Vives</surname><given-names>P.</given-names></name> <name><surname>Rohrer</surname><given-names>P. L.</given-names></name></person-group> (<year>2018</year>). &#x2018;<article-title>Brief training with rhythmic beat gestures helps L2 pronunciation in a reading-aloud task&#x2019;</article-title>, in <source>Proceedings of the 9th International Conference on Speech Prosody (SpeechProsody 2018)</source>, <publisher-loc>Pozna&#x0144;, Poland</publisher-loc>, 13&#x2013;16 June, pp. <fpage>498</fpage>&#x2013;<lpage>502</lpage>. doi: <pub-id pub-id-type="doi">10.21437/SpeechProsody.2018-101</pub-id></mixed-citation></ref>
<ref id="ref59"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname><given-names>S.</given-names></name> <name><surname>Jin</surname><given-names>G.</given-names></name></person-group> (<year>2022</year>). <article-title>The relationship between different types of co-speech gestures and L2 speech performance</article-title>. <source>Front. Psychol.</source> <volume>13</volume>:<fpage>941114</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2022.941114</pub-id>, <pub-id pub-id-type="pmid">36051215</pub-id></mixed-citation></ref>
<ref id="ref60"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Martins</surname><given-names>M.</given-names></name> <name><surname>Gingras</surname><given-names>B.</given-names></name> <name><surname>Puig-Waldm&#x00FC;ller</surname><given-names>E.</given-names></name> <name><surname>Fitch</surname><given-names>W. T.</given-names></name></person-group> (<year>2017</year>). <article-title>Cognitive representation of &#x201C;musical fractals&#x201D;: processing hierarchy and recursion in the auditory domain</article-title>. <source>Cognition</source> <volume>161</volume>, <fpage>31</fpage>&#x2013;<lpage>45</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cognition.2017.01.001</pub-id></mixed-citation></ref>
<ref id="ref61"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McCafferty</surname><given-names>S. G.</given-names></name></person-group> (<year>2002</year>). <article-title>Gesture and creating zones of proximal development for second language learning</article-title>. <source>Mod. Lang. J.</source> <volume>86</volume>, <fpage>192</fpage>&#x2013;<lpage>203</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1540-4781.00144</pub-id></mixed-citation></ref>
<ref id="ref62"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>McNeill</surname><given-names>D.</given-names></name></person-group> (<year>1992</year>). <source>Hand and mind: what gestures reveal about thought</source>. <publisher-loc>Chicago</publisher-loc>: <publisher-name>University of Chicago Press</publisher-name>.</mixed-citation></ref>
<ref id="ref63"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meissl</surname><given-names>K.</given-names></name> <name><surname>Sambre</surname><given-names>P.</given-names></name> <name><surname>Feyaerts</surname><given-names>K.</given-names></name></person-group> (<year>2022</year>). <article-title>Mapping musical dynamics in space. A qualitative analysis of conductors' movements in orchestra rehearsals</article-title>. <source>Front. Commun.</source> <volume>7</volume>:<fpage>986733</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fcomm.2022.986733</pub-id></mixed-citation></ref>
<ref id="ref64"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Morett</surname><given-names>L. M.</given-names></name> <name><surname>Feiler</surname><given-names>J. B.</given-names></name> <name><surname>Getz</surname><given-names>L. M.</given-names></name></person-group> (<year>2022</year>). <article-title>Elucidating the influences of embodiment and conceptual metaphor on lexical and non-speech tone learning</article-title>. <source>Cognition</source> <volume>222</volume>:<fpage>105014</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cognition.2022.105014</pub-id>, <pub-id pub-id-type="pmid">35033864</pub-id></mixed-citation></ref>
<ref id="ref65"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Musin</surname><given-names>I.</given-names></name></person-group> (<year>1967</year>). <source>The technique of conducting</source>. <publisher-loc>Moscow</publisher-loc>: <publisher-name>Muzyka Publishing House</publisher-name>.</mixed-citation></ref>
<ref id="ref66"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ogrizovic-Ciric</surname><given-names>M.</given-names></name></person-group> (<year>2009</year>). <source>Ilya Musin&#x2019;s language of conducting gestures</source>. <comment>Doctoral Dissertation</comment>. <publisher-loc>Athens, Georgia</publisher-loc>: <publisher-name>University of Georgia</publisher-name>.</mixed-citation></ref>
<ref id="ref67"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Parisse</surname><given-names>C.</given-names></name> <name><surname>Morgenstern</surname><given-names>A.</given-names></name> <name><surname>Ca&#x00EB;t</surname><given-names>S.</given-names></name></person-group> (<year>2022</year>) <article-title>Annotating multimodal data: Interactions across semiotic resources</article-title>, in <conf-name>Proceedings of the thirteenth language resources and evaluation conference</conf-name>. <publisher-name>Marseille</publisher-name>: <publisher-loc>European Language Resources Association (ELRA)</publisher-loc>, pp. <fpage>2755</fpage>&#x2013;<lpage>2764</lpage>. Available online at: <ext-link xlink:href="https://aclanthology.org/2022.lrec-1.297" ext-link-type="uri">https://aclanthology.org/2022.lrec-1.297</ext-link></mixed-citation></ref>
<ref id="ref68"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Peterson</surname><given-names>G. E.</given-names></name> <name><surname>Barney</surname><given-names>H. L.</given-names></name></person-group> (<year>1952</year>). <article-title>Control methods used in a study of the vowels</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>24</volume>, <fpage>175</fpage>&#x2013;<lpage>184</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.1906875</pub-id></mixed-citation></ref>
<ref id="ref70"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Port</surname><given-names>R. F.</given-names></name> <name><surname>Dalby</surname><given-names>J.</given-names></name> <name><surname>O&#x2019;Dell</surname><given-names>M.</given-names></name></person-group> (<year>1987</year>). <article-title>Evidence for mora timing in Japanese</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>81</volume>, <fpage>1574</fpage>&#x2013;<lpage>1585</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.394510</pub-id></mixed-citation></ref>
<ref id="ref71"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Pouw</surname><given-names>W.</given-names></name> <name><surname>de Jonge-Hoekstra</surname><given-names>L.</given-names></name> <name><surname>Harrison</surname><given-names>S. J.</given-names></name> <name><surname>Paxton</surname><given-names>A.</given-names></name> <name><surname>Dixon</surname><given-names>J. A.</given-names></name></person-group> (<year>2021</year>). <article-title>Gesture&#x2013;speech physics in fluent speech and rhythmic upper limb movements</article-title>. <source>Ann. N. Y. Acad. Sci.</source> <volume>1491</volume>, <fpage>89</fpage>&#x2013;<lpage>105</lpage>. doi: <pub-id pub-id-type="doi">10.1111/nyas.14532</pub-id></mixed-citation></ref>
<ref id="ref72"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Prieto</surname><given-names>P.</given-names></name> <name><surname>Kushch</surname><given-names>O.</given-names></name> <name><surname>Borr&#x00E0;s-Comes</surname><given-names>J.</given-names></name> <name><surname>Gluhareva</surname><given-names>D.</given-names></name> <name><surname>P&#x00E9;rez-Vidal</surname><given-names>C.</given-names></name></person-group> (<year>2025</year>). <article-title>Training ESL students to reproduce beat gestures in discourse leads to L2 pronunciation improvements</article-title>. <source>Anu. Semin. Filol. Vasca &#x201C;Julio Urquijo&#x201D;</source> <volume>57</volume>, <fpage>805</fpage>&#x2013;<lpage>823</lpage>. doi: <pub-id pub-id-type="doi">10.1387/asju.25982</pub-id></mixed-citation></ref>
<ref id="ref73"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Selkirk</surname><given-names>E. O.</given-names></name></person-group> (<year>1980</year>). <article-title>The role of prosodic categories in English word stress</article-title>. <source>Linguist. Inq.</source> <volume>11</volume>, <fpage>563</fpage>&#x2013;<lpage>605</lpage>, Available online at: <ext-link xlink:href="https://www.jstor.org/stable/4178106" ext-link-type="uri">https://www.jstor.org/stable/4178106</ext-link></mixed-citation></ref>
<ref id="ref74"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Selkirk</surname><given-names>E.</given-names></name></person-group> (<year>1984</year>). &#x201C;<article-title>On the major class features and syllable theory</article-title>,&#x201D; In: <person-group person-group-type="editor"><name><surname>Aronoff</surname><given-names>M.</given-names></name> <name><surname>Oehrle</surname><given-names>R. T.</given-names></name></person-group>, (eds.) <source>Language Sound Structure: Studies in Phonology Presented to Morris Halle by His Teachers and Students</source>, <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>, pp. <fpage>107</fpage>&#x2013;<lpage>136</lpage>. Available online at: <ext-link xlink:href="https://www.ai.mit.edu/projects/dm/featgeom/selkirk84-sonor.pdf" ext-link-type="uri">https://www.ai.mit.edu/projects/dm/featgeom/selkirk84-sonor.pdf</ext-link></mixed-citation></ref>
<ref id="ref75"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shattuck-Hufnagel</surname><given-names>S.</given-names></name> <name><surname>Ren</surname><given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>The prosodic characteristics of non-referential co-speech gestures in a sample of academic-lecture-style speech</article-title>. <source>Front. Psychol.</source> <volume>9</volume>:<fpage>1514</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2018.01514</pub-id>, <pub-id pub-id-type="pmid">30245649</pub-id></mixed-citation></ref>
<ref id="ref77"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Shitara</surname><given-names>T.</given-names></name> <name><surname>Kimura</surname><given-names>T.</given-names></name> <name><surname>Makino</surname><given-names>T.</given-names></name> <name><surname>Yamane</surname><given-names>N.</given-names></name></person-group>, (<year>2023</year>). <article-title>Feedback effects of mouth shape during speech training</article-title>. <conf-name>Acoustical Society of Japan, Kansai branch, youth interaction meeting, December 2023</conf-name>, <publisher-name>Kindai University</publisher-name>. <comment>(In Japanese)</comment>. Available online at: <ext-link xlink:href="https://os3-314-46534.vs.sakura.ne.jp/text_modifiable/" ext-link-type="uri">https://os3-314-46534.vs.sakura.ne.jp/text_modifiable/</ext-link>(Accessed November 22, 2025).</mixed-citation></ref>
<ref id="ref78"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Smotrova</surname><given-names>T.</given-names></name></person-group> (<year>2017</year>). <article-title>Making pronunciation visible: gesture in teaching pronunciation</article-title>. <source>TESOL Q.</source> <volume>51</volume>, <fpage>59</fpage>&#x2013;<lpage>89</lpage>. doi: <pub-id pub-id-type="doi">10.1002/TESQ.276</pub-id></mixed-citation></ref>
<ref id="ref79"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Suemitsu</surname><given-names>A.</given-names></name> <name><surname>Dang</surname><given-names>J.</given-names></name> <name><surname>Ito</surname><given-names>T.</given-names></name> <name><surname>Tiede</surname><given-names>M.</given-names></name></person-group> (<year>2015</year>). <article-title>A real-time articulatory visual feedback approach with target presentation for second language pronunciation learning</article-title>. <source>J. Acoust. Soc. Am.</source> <volume>138</volume>, <fpage>EL382</fpage>&#x2013;<lpage>EL387</lpage>. doi: <pub-id pub-id-type="doi">10.1121/1.4931827</pub-id></mixed-citation></ref>
<ref id="ref80"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Tajima</surname><given-names>K.</given-names></name> <name><surname>Port</surname><given-names>R.</given-names></name></person-group>, (<year>2004</year>). '<article-title>Speech rhythm in English and Japanese, in Local</article-title>&#x2019;, In: <person-group person-group-type="editor"><name><surname> Ogden</surname><given-names>J.</given-names></name> <name><surname>Temple</surname><given-names>R.</given-names></name></person-group>, (eds.), <conf-name>Papers in Laboratory Phonology VI: Phonetic Interpretation</conf-name>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>, pp. <fpage>322</fpage>&#x2013;<lpage>339</lpage>. doi: <pub-id pub-id-type="doi">10.1017/CBO9780511486425.020</pub-id></mixed-citation></ref>
<ref id="ref81"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Temperley</surname><given-names>D.</given-names></name></person-group> (<year>2022</year>). <article-title>Music and language</article-title>. <source>Annu. Rev. Linguist.</source> <volume>8</volume>, <fpage>153</fpage>&#x2013;<lpage>170</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev-linguistics-031220-121126</pub-id></mixed-citation></ref>
<ref id="ref82"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Thomas</surname><given-names>E.R.</given-names></name> <name><surname>Kendall</surname><given-names>T.</given-names></name></person-group>, (<year>2007</year>). NORM: the vowel normalization and plotting suite. Online Resource. Available online at: <ext-link xlink:href="http://lingtools.uoregon.edu/norm/norm1.php" ext-link-type="uri">http://lingtools.uoregon.edu/norm/norm1.php</ext-link> (Accessed April 29, 2025).</mixed-citation></ref>
<ref id="ref83"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Thomson</surname><given-names>R.I.</given-names></name></person-group>, (<year>2012</year>). English accent coach [computer program], version 2. Available online at: <ext-link xlink:href="http://www.englishaccentcoach.com" ext-link-type="uri">http://www.englishaccentcoach.com</ext-link> (Accessed September 6, 2025).</mixed-citation></ref>
<ref id="ref84"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vainio</surname><given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>Connection between movements of mouth and hand: perspectives on development and evolution of speech</article-title>. <source>Neurosci. Biobehav. Rev.</source> <volume>100</volume>, <fpage>211</fpage>&#x2013;<lpage>223</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neubiorev.2019.03.005</pub-id>, <pub-id pub-id-type="pmid">30871957</pub-id></mixed-citation></ref>
<ref id="ref85"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wagner</surname><given-names>P.</given-names></name> <name><surname>Malisz</surname><given-names>Z.</given-names></name> <name><surname>Kopp</surname><given-names>S.</given-names></name></person-group> (<year>2014</year>). <article-title>Gesture and speech in interaction: an overview</article-title>. <source>Speech Comm.</source> <volume>57</volume>, <fpage>209</fpage>&#x2013;<lpage>232</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.specom.2013.09.008</pub-id></mixed-citation></ref>
<ref id="ref86"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Gao</surname><given-names>Y.</given-names></name> <name><surname>Cu</surname><given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>Classroom gesture instruction on second language learners&#x2019; academic presentations: evidence from Chinese intermediate English learners</article-title>. <source>J. Engl. Acad. Purp.</source> Vol <volume>66</volume>. doi: <pub-id pub-id-type="doi">10.1016/j.jeap.2023.101304</pub-id></mixed-citation></ref>
<ref id="ref87"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wilson</surname><given-names>I.</given-names></name> <name><surname>Perkins</surname><given-names>J.</given-names></name> <name><surname>Sato</surname><given-names>A.</given-names></name> <name><surname>Ishii</surname><given-names>D.</given-names></name></person-group> (<year>2025</year>). <article-title>Articulatory settings of Japanese&#x2013;English bilinguals</article-title>. <source>Lang. Speech</source>:<fpage>00238309251353727</fpage>. doi: <pub-id pub-id-type="doi">10.1177/00238309251353727</pub-id>, <pub-id pub-id-type="pmid">40884234</pub-id></mixed-citation></ref>
<ref id="ref88"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xi</surname><given-names>X.</given-names></name> <name><surname>Li</surname><given-names>P.</given-names></name> <name><surname>Prieto</surname><given-names>P.</given-names></name></person-group> (<year>2024</year>). <article-title>Improving second language vowel production with hand gestures encoding visible articulation: evidence from picture-naming and paragraph-reading tasks</article-title>. <source>Lang. Learn.</source> <volume>74</volume>, <fpage>884</fpage>&#x2013;<lpage>916</lpage>. doi: <pub-id pub-id-type="doi">10.1111/lang.12647</pub-id></mixed-citation></ref>
<ref id="ref90"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Yamane</surname><given-names>N.</given-names></name> <name><surname>Shinya</surname><given-names>M.</given-names></name> <name><surname>Teaman</surname><given-names>B.</given-names></name> <name><surname>Ogawa</surname><given-names>M.</given-names></name> <name><surname>Akahoshi</surname><given-names>S.</given-names></name></person-group>, (<year>2019</year>). <article-title>Mirroring beat gestures: effects on EFL learners</article-title>. In: <conf-name>Proceedings of the 19th International Congress of Phonetic Sciences (ICPhS 2019). Melbourne, Australia</conf-name>, pp.<fpage>3523</fpage>&#x2013;<lpage>3527</lpage>.</mixed-citation></ref>
<ref id="ref91"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yamane</surname><given-names>N.</given-names></name> <name><surname>Sun</surname><given-names>K.</given-names></name> <name><surname>Perkins</surname><given-names>J.</given-names></name> <name><surname>Wilson</surname><given-names>I.</given-names></name> <name><surname>Tan</surname><given-names>X.</given-names></name></person-group> (<year>2025</year>). <article-title>Ultrasound pronunciation training: pretest-posttest production and discrimination results</article-title>. <source>Journal of Monolingual and Bilingual Speech</source>, <publisher-name>University of Toronto Press</publisher-name>. <volume>6</volume>. doi: <pub-id pub-id-type="doi">10.3138/jmbs-25261-yamane</pub-id></mixed-citation></ref>
<ref id="ref92"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Yazawa</surname><given-names>K.</given-names></name> <name><surname>Kondo</surname><given-names>M.</given-names></name></person-group>, (<year>2019</year>). <article-title>Acoustic characteristics of Japanese short and long vowels: formant displacement effect revisited</article-title>. In <conf-name>Proceedings of the 19th international congress of phonetic sciences</conf-name> (pp. <fpage>671</fpage>&#x2013;<lpage>675</lpage>). <publisher-loc>Canberra, ACT</publisher-loc>: <publisher-name>Australasian Speech Science and Technology Association Inc.</publisher-name></mixed-citation></ref>
<ref id="ref93"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname><given-names>K.</given-names></name> <name><surname>Zhang</surname><given-names>J.</given-names></name> <name><surname>Li</surname><given-names>Z.</given-names></name> <name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Cai</surname><given-names>H.</given-names></name> <name><surname>Li</surname><given-names>L.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Production rather than observation: comparison between the roles of embodiment and conceptual metaphor in L2 lexical tone learning</article-title>. <source>Learn. Instr.</source> <volume>92</volume>:<fpage>101905</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.learninstruc.2024.101905</pub-id></mixed-citation></ref>
</ref-list>
<app-group>
<app id="app1">
<title>Appendix</title>
<table-wrap position="float" id="tab2">
<label>Table A1</label>
<caption>
<p>Raw F2 values (Hz) of /&#x00E6;/ and /&#x028C;/ across Pre-, Mid-, and Post-tests by group (HM, MH) and gender.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="middle">Group</th>
<th align="center" valign="middle">Sex</th>
<th align="center" valign="middle">N</th>
<th align="center" valign="middle">Vowel</th>
<th align="center" valign="middle" colspan="3">F2 (Hz)</th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td/>
<td/>
<td/>
<td align="center" valign="middle">Pre (M &#x00B1; SD)</td>
<td align="center" valign="middle">Mid (M &#x00B1; SD)</td>
<td align="center" valign="middle">Post (M &#x00B1; SD)</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="4">HM group</td>
<td align="center" valign="top" rowspan="2">Female</td>
<td align="center" valign="top" rowspan="2">9</td>
<td align="center" valign="top">/&#x00E6;/</td>
<td align="center" valign="top">1414.05<break/>&#x00B1;168.21</td>
<td align="center" valign="top">1506.95<break/>&#x00B1;181.43</td>
<td align="center" valign="top">1648.85<break/>&#x00B1;164.50</td>
</tr>
<tr>
<td align="center" valign="top">/&#x028C;/</td>
<td align="center" valign="top">1485.67<break/>&#x00B1;133.47</td>
<td align="center" valign="top">1503.34<break/>&#x00B1;94.90</td>
<td align="center" valign="top">1478.77<break/>&#x00B1;160.43</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="2">Male</td>
<td align="center" valign="top" rowspan="2">10</td>
<td align="center" valign="top">/&#x00E6;/</td>
<td align="center" valign="top">1322.31<break/>&#x00B1;265.11</td>
<td align="center" valign="top">1298.03<break/>&#x00B1;259.79</td>
<td align="center" valign="top">1474.09<break/>&#x00B1;208.40</td>
</tr>
<tr>
<td align="center" valign="top">/&#x028C;/</td>
<td align="center" valign="top">1204.03<break/>&#x00B1;116.02</td>
<td align="center" valign="top">1229.65<break/>&#x00B1;126.21</td>
<td align="center" valign="top">1294.64<break/>&#x00B1;219.74</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="4">MH group</td>
<td align="center" valign="top" rowspan="2">Female</td>
<td align="center" valign="top" rowspan="2">15</td>
<td align="center" valign="top">/&#x00E6;/</td>
<td align="center" valign="top">1503.74<break/>&#x00B1;180.95</td>
<td align="center" valign="top">1747.80<break/>&#x00B1;354.12</td>
<td align="center" valign="top">1752.95<break/>&#x00B1;377.24</td>
</tr>
<tr>
<td align="center" valign="top">/&#x028C;/</td>
<td align="center" valign="top">1411.00<break/>&#x00B1;165.32</td>
<td align="center" valign="top">1484.42<break/>&#x00B1;223.70</td>
<td align="center" valign="top">1429.33<break/>&#x00B1;226.30</td>
</tr>
<tr>
<td align="center" valign="top" rowspan="2">Male</td>
<td align="center" valign="top" rowspan="2">6</td>
<td align="center" valign="top">/&#x00E6;/</td>
<td align="center" valign="top">1326.59<break/>&#x00B1;260.52</td>
<td align="center" valign="top">1338.41<break/>&#x00B1;194.97</td>
<td align="center" valign="top">1584.42<break/>&#x00B1;479.53</td>
</tr>
<tr>
<td align="center" valign="top">/&#x028C;/</td>
<td align="center" valign="top">1185.97<break/>&#x00B1;67.53</td>
<td align="center" valign="top">1177.88<break/>&#x00B1;51.87</td>
<td align="center" valign="top">1297.94<break/>&#x00B1;343.05</td>
</tr>
</tbody>
</table>
</table-wrap>
</app>
</app-group>
<fn-group>
<fn id="fn0001" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2760137/overview">Loulou Kosmala</ext-link>, Universit&#x00E9; Paris-Est Cr&#x00E9;teil Val de Marne, France</p></fn>
<fn id="fn0002" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1642401/overview">Plinio Almeida Barbosa</ext-link>, State University of Campinas, Brazil</p><p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3087521/overview">Xiaotong Xi</ext-link>, Shandong University of Finance and Economics, China</p></fn>
</fn-group>
</back>
</article>