<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neuroinform.</journal-id>
<journal-title>Frontiers in Neuroinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neuroinform.</abbrev-journal-title>
<issn pub-type="epub">1662-5196</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fninf.2023.1272791</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Introducing Region Based Pooling for handling a varied number of EEG channels for deep learning models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Tveitst&#x000F8;l</surname> <given-names>Thomas</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2319896/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tveter</surname> <given-names>Mats</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2319603/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>P&#x000E9;rez T.</surname> <given-names>Ana S.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2583441/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hatlestad-Hall</surname> <given-names>Christoffer</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1295033/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yazidi</surname> <given-names>Anis</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/638891/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hammer</surname> <given-names>Hugo L.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hebold Haraldsen</surname> <given-names>Ira R. J.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2317110/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Neurology, Oslo University Hospital</institution>, <addr-line>Oslo</addr-line>, <country>Norway</country></aff>
<aff id="aff2"><sup>2</sup><institution>Institute of Clinical Medicine, Faculty of Medicine, University of Oslo</institution>, <addr-line>Oslo</addr-line>, <country>Norway</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Computer Science, Oslo Metropolitan University</institution>, <addr-line>Oslo</addr-line>, <country>Norway</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Holistic Systems, SimulaMet</institution>, <addr-line>Oslo</addr-line>, <country>Norway</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Spase Petkoski, Aix Marseille Universit&#x000E9;, France</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Hassan Aqeel Khan, Aston University, United Kingdom</p>
<p>Jon Kleen, University of California, San Francisco, United States</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Thomas Tveitst&#x000F8;l <email>thtvei&#x00040;ous-hf.no</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>01</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>17</volume>
<elocation-id>1272791</elocation-id>
<history>
<date date-type="received">
<day>07</day>
<month>08</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>12</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Tveitst&#x000F8;l, Tveter, P&#x000E9;rez T., Hatlestad-Hall, Yazidi, Hammer and Hebold Haraldsen.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Tveitst&#x000F8;l, Tveter, P&#x000E9;rez T., Hatlestad-Hall, Yazidi, Hammer and Hebold Haraldsen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>A challenge when applying an artificial intelligence (AI) deep learning (DL) approach to novel electroencephalography (EEG) data is the DL architecture&#x00027;s lack of adaptability to changing numbers of EEG channels. That is, the number of channels can vary neither in the training data nor upon deployment. Such highly specific hardware constraints put major limitations on the clinical usability and scalability of the DL models.</p></sec>
<sec>
<title>Methods</title>
<p>In this work, we propose a technique for handling such varied numbers of EEG channels by splitting the EEG montages into distinct regions and merging the channels within the same region to a region representation. The solution is termed <italic>Region Based Pooling</italic> (RBP). The procedure of splitting the montage into regions is performed repeatedly with different region configurations, to minimize potential loss of information. As RBP maps a varied number of EEG channels to a fixed number of region representations, both current and future DL architectures may apply RBP with ease. To demonstrate and evaluate the adequacy of RBP to handle a varied number of EEG channels, sex classification based solely on EEG was used as a test example. The DL models were trained on 129 channels, and tested on 32-, 65-, and 129-channel versions of the data using the same channel positions scheme. The baselines for comparison were zero-filling the missing channels and applying spherical spline interpolation. The performances were estimated using 5-fold cross validation.</p></sec>
<sec>
<title>Results</title>
<p>For the 32-channel system version, the mean AUC values across the folds were: RBP (93.34%), spherical spline interpolation (93.36%), and zero-filling (76.82%). Similarly, on the 65-channel system version, the performances were: RBP (93.66%), spherical spline interpolation (93.50%), and zero-filling (85.58%). Finally, the 129-channel system version produced the following results: RBP (94.68%), spherical spline interpolation (93.86%), and zero-filling (91.92%).</p></sec>
<sec>
<title>Conclusion</title>
<p>In conclusion, RBP obtained similar results to spherical spline interpolation, and superior results to zero-filling. We encourage further research and development of DL models in the cross-dataset setting, including the use of methods such as RBP and spherical spline interpolation to handle a varied number of EEG channels.</p></sec></abstract>
<kwd-group>
<kwd>EEG</kwd>
<kwd>deep learning</kwd>
<kwd>machine learning</kwd>
<kwd>cross-dataset</kwd>
<kwd>cross-channel system</kwd>
<kwd>convolutional neural networks</kwd>
<kwd>time series</kwd>
<kwd>Region Based Pooling</kwd>
</kwd-group>
<counts>
<fig-count count="10"/>
<table-count count="1"/>
<equation-count count="7"/>
<ref-count count="38"/>
<page-count count="14"/>
<word-count count="9713"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Recent advancements in artificial intelligence (AI) have opened up new opportunities for the fields of cognitive neuroscience and clinical brain health research. In this context, the EU Horizon 2020 funded project AI-Mind (<ext-link ext-link-type="uri" xlink:href="http://www.ai-mind.eu">www.ai-mind.eu</ext-link>) has been established, which aims at developing AI-based tools to estimate the risk of dementia for people affected by mild cognitive impairment. The project collects a comprehensive set of biomarkers, including blood samples, sociodemographic information, digital cognitive test scores, and electroencephalography (EEG) data. A combination of traditional machine learning (ML) and deep learning (DL)-based algorithms will be employed. While the former commonly provides improved transparency and integration of domain knowledge, the latter has the capacity to find patterns and extract features in complex and unstructured data beyond what can be obtained by hand-crafted features.</p>
<p>DL is a method in AI with potential to significantly transform healthcare services (Hinton, <xref ref-type="bibr" rid="B17">2018</xref>). By processing data in multiple layers, DL learns representations with different levels of abstraction. Breakthroughs of DL include processing of images, video, speech, audio, and text (LeCun et al., <xref ref-type="bibr" rid="B24">2015</xref>). Despite the progress in research and development, there are still significant gaps to be filled for deployment of AI in clinical practice, such as mitigating discriminatory bias and improving generalization to new populations (Kelly et al., <xref ref-type="bibr" rid="B20">2019</xref>; Chen et al., <xref ref-type="bibr" rid="B5">2023</xref>). In particular, AI systems trained on datasets with an underrepresentation of marginalized groups have an elevated risk of bias toward those groups (Rajpurkar et al., <xref ref-type="bibr" rid="B31">2022</xref>). Furthermore, AI algorithms trained on data generated by a single system (e.g., when all imaging data are collected using the same camera with fixed settings) may exhibit single-source bias, resulting in a decrease in performance on inputs collected from other systems (Rajpurkar et al., <xref ref-type="bibr" rid="B31">2022</xref>). For the AI-Mind project, such biases may pose challenges requiring particular considerations. While about two-thirds of dementia cases are in low-income and middle-income countries (LMICs), extrapolating predictive models developed in high-income countries to LMICs is not always feasible (Stephan et al., <xref ref-type="bibr" rid="B36">2020</xref>). A technical prerequisite for extrapolating models to LMICs is the availability of hardware needed for data acquisition. As a neuroimaging modality, EEG is low-cost and mobile compared to magnetic resonance imaging and magnetoencephalography. Moreover, it does not require a dedicated isolated room. 
Extrapolation of EEG biomarkers to LMICs is thus not hindered by difficulties in installation of the acquisition hardware.</p>
<p>The recent progress of DL has significantly increased its relevance for EEG data analysis (Roy et al., <xref ref-type="bibr" rid="B32">2019</xref>). Domains of application include emotion recognition (Houssein et al., <xref ref-type="bibr" rid="B18">2022</xref>), driver drowsiness (Stancin et al., <xref ref-type="bibr" rid="B35">2021</xref>; Mohammed et al., <xref ref-type="bibr" rid="B27">2022</xref>), classification of alcoholic EEG (Farsi et al., <xref ref-type="bibr" rid="B10">2021</xref>), epileptic seizure detection (Ahmad et al., <xref ref-type="bibr" rid="B2">2022</xref>), mental disorders (de Bardeci et al., <xref ref-type="bibr" rid="B6">2021</xref>), schizophrenia (Oh et al., <xref ref-type="bibr" rid="B28">2019</xref>), major depressive disorder and bipolar disorder detection (Yasin et al., <xref ref-type="bibr" rid="B38">2021</xref>), motor imagery and other brain computer interface (BCI)-related problems (Lotte et al., <xref ref-type="bibr" rid="B26">2018</xref>; Abo Alzahab et al., <xref ref-type="bibr" rid="B1">2021</xref>). Despite the attention given to DL in EEG research, little research has focused on issues relating to the cross-dataset setting and generalization (Wei et al., <xref ref-type="bibr" rid="B37">2022</xref>). As AI-Mind will use EEG signals for its algorithm development, enabling our tools for deployment on multiple data acquisition systems and mitigating discriminatory bias is a necessity.</p>
<p>However, a common limitation of many existing DL architectures, specific to EEG, is their inherent inability to handle a varied number of channels as input data (Wei et al., <xref ref-type="bibr" rid="B37">2022</xref>). This lack of compatibility conflicts with the real-world high variety of EEG hardware and hinders training and deployment on heterogeneous datasets where both the number of electrodes and their positions on the scalp may vary. Hence, this challenge not only prevents integration of DL models into diverse EEG setups but also limits the inclusion of larger sample sizes as well as more heterogeneous and representative data. Moreover, evidence from clinical neurology research suggests that the number of channels used during EEG recording may have a significant impact on the data&#x00027;s ability to capture spatially limited phenomena (Hatlestad-Hall et al., <xref ref-type="bibr" rid="B14">2023</xref>). The inability to handle this diversity originates from tensors such as matrices and vectors requiring fixed dimensions to be compatible from a linear algebraic perspective. To address this technical issue, this work aims at introducing a simple methodological framework which can be used in combination with current and future DL models to handle a varied number of electrodes. Here, two methods for scaling the data to fit into the DL model are used as baselines for comparison: (1) zero-filling missing channels and (2) applying spherical spline interpolation (Perrin et al., <xref ref-type="bibr" rid="B30">1989</xref>).</p>
<p>There exist several techniques which may leverage external datasets to improve DL models, which we hypothesize will play a significant role in cross-dataset learning and generalization. Approaches such as unsupervised and self-supervised learning may be utilized even in the absence of the target of interest. Improvements may be in terms of, e.g., performance or generalization, and are considered to play an important role for data efficiency of DL (Hinton, <xref ref-type="bibr" rid="B17">2018</xref>; Hendrycks et al., <xref ref-type="bibr" rid="B16">2019</xref>; Banville et al., <xref ref-type="bibr" rid="B4">2021</xref>). In the field of EEG research, Kostas et al. (<xref ref-type="bibr" rid="B22">2021</xref>) obtained improved results on multiple downstream datasets by using contrastive self-supervised learning on a large dataset for pre-training. Furthermore, Banville et al. (<xref ref-type="bibr" rid="B4">2021</xref>) successfully applied self-supervised learning to sleep staging and pathology detection. Another approach on heterogeneous EEG datasets is to use transfer learning, shown in the BEETL competition (Wei et al., <xref ref-type="bibr" rid="B37">2022</xref>). Furthermore, a desired outcome of AI-Mind is to characterize brain networks from EEG data. While metrics from neuroscientific literature have known cognitive relevance (Stam et al., <xref ref-type="bibr" rid="B34">2006</xref>), a DL methodology to obtain features of similar neurophysiological meaning seems non-trivial. This is due to features of DL being <italic>learned</italic> in a data-driven manner rather than human defined to capture the underlying neurophysiological phenomena. We hypothesize, however, that feature learning and pre-trained models may be viable alternatives.</p>
<p>The intended purposes for developing methods for handling a varied number of channels with possibly different positions on the scalp are (1) to enable the application of DL models on a range of existing and varied EEG systems. For clinical implementation, a highly desired property is to have a method which works on the EEG systems currently in use at different clinical centers around the world. The number of channels and channel locations are indeed varied, meaning that it is a necessity to handle this diversity, to maximize outreach and clinical usefulness; (2) to be able to pre-train or perform representation learning on heterogeneous and large amounts of data. There are many open-source datasets from a range of nationalities, pathologies, age groups, and cohorts. To generalize across such data, methods including pre-training and representation learning on multiple and heterogeneous datasets may be a step in the right direction, as it can lead to more robust and generalized features. Improving the robustness and generalization may in turn improve the fairness and equity of the developed AI models. This relevance extends to all medical use and integration of DL in EEG, including the generation of synthetic data (Goodfellow et al., <xref ref-type="bibr" rid="B11">2014</xref>) and digital twins (Grieves and Vickers, <xref ref-type="bibr" rid="B12">2017</xref>), and enabling of simulation techniques for improved clinical treatment selection. Indeed, developing methods to facilitate the evolution of such precision medicine approaches is essential. This study does not carry out such pre-training or representation learning but introduces a framework for enabling it to be performed on a larger scale, with a varied number of electrodes. Instead, this study conducts an initial evaluation to ascertain the efficacy or inadequacy of the framework.</p>
<p>Our framework is designed to be model agnostic, meaning that both current and future DL architectures can apply it with ease. The code is publicly available and may be used to develop customized implementations of the framework, or to combine it with other DL architectures. Furthermore, we aim to experimentally demonstrate that by applying our framework, the algorithm performance in itself remains the same.</p></sec>
<sec sec-type="materials and methods" id="s2">
<title>2 Materials and methods</title>
<p>In this section, the dataset, methods, models, and experiments are described. A high-level overview of the workflow is provided in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>High-level overview of the workflow. The different hyperparameters for each model are described in Section 2.5.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0001.tif"/>
</fig>
<sec>
<title>2.1 Data</title>
<p>The data used for this study is an open-source dataset from Child Mind Institute (Alexander et al., <xref ref-type="bibr" rid="B3">2017</xref>). It contains a large high-density EEG (129 electrodes) dataset from the age distribution 5&#x02013;21 years, including male and female subjects, with varied brain pathologies. The objective of the DL models was to classify the sex of a subject, given the EEG data. After removing samples which did not fulfill the inclusion criteria for data quality (see Section 2.1.1), the dataset was balanced by downsampling the class in abundance, resulting in a final dataset with 1,788 subjects. Only the resting-state EEG data files were extracted. The first 30 s of the recordings were skipped as the first parts of the EEG are more likely to contain unwanted artifacts. The following 10 s was used as input for the models. Only a single 10 s window was used per subject, and the splitting of data was thus made on subject level. The sampling frequency was kept at 500 Hz as in the original dataset.</p>
<sec>
<title>2.1.1 Preprocessing</title>
<p>The raw data was preprocessed using an automated data cleaning pipeline developed in MATLAB, using functions from the EEGLAB toolbox (Delorme and Makeig, <xref ref-type="bibr" rid="B8">2004</xref>). Channels with low-quality data were removed by iterative exclusion of signals with amplitude standard deviation <italic>SD</italic>&#x0003E;75&#x003BC;V or no amplitude variation at all. The EEG file was rejected if the number of excluded channels exceeded 39 (&#x0003E;30%). Line artifacts were removed with Zapline (de Cheveign&#x000E9;, <xref ref-type="bibr" rid="B7">2020</xref>), and the signals were band-pass filtered between 1 and 45 Hz. Excluded channels were replaced with interpolated signals to ensure data dimension consistency. The channels were re-referenced to the average of all scalp channels. The pipeline is available at GitHub.<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref></p></sec></sec>
<sec>
<title>2.2 Inception network</title>
<p>The Inception network is a convolutional neural network (CNN) based architecture, which is the main building block of InceptionTime. Here, the Inception network is briefly described, and for further details on the architecture, the reader is referred to the original study (Ismail Fawaz et al., <xref ref-type="bibr" rid="B19">2020</xref>).</p>
<p>An Inception network is composed of multiple Inception modules, with linear shortcut connections for every third Inception module. A key component of the Inception module is the bottleneck layer, which effectively computes linear combinations of the input time series. Furthermore, the Inception module applies filters of different lengths simultaneously on the same input time series, and resulting feature maps are aggregated by concatenation. After passing the data through all Inception modules, global average pooling is performed in the temporal dimension. Finally, while the original Inception network used a fully connected layer with softmax activation, this was changed to a single fully connected layer with sigmoid activation (Ismail Fawaz et al., <xref ref-type="bibr" rid="B19">2020</xref>).</p>
<p>The hyperparameters of our Inception network were set as described in the original study. This includes a depth of six Inception modules, and 32 filters for all convolutional kernels in all Inception modules (Ismail Fawaz et al., <xref ref-type="bibr" rid="B19">2020</xref>).</p></sec>
<sec>
<title>2.3 Methods for handling a varied number of channels</title>
<p>Three methods for handling a varied number of channels were tested on a binary classification problem, sex prediction. The three methods were (1) zero-filling, (2) spherical spline interpolation (Perrin et al., <xref ref-type="bibr" rid="B30">1989</xref>), and our suggested new method (3) Region Based Pooling (RBP). Inception network (Ismail Fawaz et al., <xref ref-type="bibr" rid="B19">2020</xref>) was the DL model used after zero-filling, interpolation, or applying RBP, with the exception that the final layer used scalar output and sigmoid as activation function for predictions.</p></sec>
<sec>
<title>2.4 Region based pooling</title>
<p>RBP splits the topology of the EEG montage into regions, as illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. The channels within a single region are pooled into one or more <italic>region representations</italic>, and hence the name <italic>Region Based Pooling</italic>. To minimize the loss of information, multiple splits with different region formations are performed. RBP introduces three new optimization problems: (1) how to split the EEG montage into regions (both the number of montage splits and the algorithm separating the regions), (2) how to pool the channels within the same region, and (3) how to merge the outputs of the different montage splits. The following two subsections intend to illustrate how the first two problems can be addressed and are meant as examples of implementation.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Region based pooling. The EEG montage is split into multiple regions. All channels in the same region are pooled into a region representation. Multiple montage splits may be performed, and the number of montage splits equals two in this figure, <inline-formula><mml:math id="M44"><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M45"><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>. If there is at least one channel in all used regions, the mapping from channels into region representations can be made. This is illustrated as channel system A and channel system B have unequal numbers of channels with different channel locations, and they can both obtain region representations. After pooling channels into region representations, the region representations are stacked/row concatenated. The sequence of stacking represents an arbitrarily chosen design.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0002.tif"/>
</fig>
<p>All RBP models in the experiments of this study merged the outputs of the montage splits by concatenation. Furthermore, all channels within the same region were merged to a single region representation. Finally, all region representations were normalized by subtracting the mean and dividing by the standard deviation in the temporal dimension.</p>
<sec>
<title>2.4.1 Method for splitting into regions</title>
<p>A montage split is a region-based partitioning of the EEG montage. The set of all montage splits is denoted {<inline-formula><mml:math id="M46"><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="M47"><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, ..., <inline-formula><mml:math id="M48"><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>}, where <italic>n</italic> is the number of montage splits. Each montage split contains multiple regions, <inline-formula><mml:math id="M1"><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02200;</mml:mo><mml:mi>i</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, where the regions may or may not overlap. Furthermore, a montage split may or may not cover the entire EEG montage. Given a channel system <italic>C</italic> which is compatible with the partitioning, the <italic>j</italic>-th region of the <italic>i</italic>-th montage split <inline-formula><mml:math id="M2"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> contains the channels <inline-formula><mml:math id="M3"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02283;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:math></inline-formula>, where <inline-formula><mml:math 
id="M4"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:math></inline-formula> denotes the set of channels of channel system <italic>C</italic>, positioned within the boundaries of <inline-formula><mml:math id="M5"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>.</p>
<p>The algorithm used in all experiments for splitting the montage into regions is illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>. It follows an iterative procedure and was designed to not have overlapping regions. Furthermore, all regions are used for all montage splits. The algorithm requires one to fix a <italic>split vector</italic> <inline-formula><mml:math id="M6"><mml:mstyle mathvariant="bold"><mml:mtext>k</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mstyle mathvariant="sans-serif"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x02115;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, where the elements of <bold>k</bold> and <italic>p</italic> are design choices/hyperparameters. As a pre-step of the algorithm, all channel positions are mapped to 2D coordinates. Thereafter, the centroid of the channel positions is calculated, and a random angle is generated. With the centroid and the random angle as starting point and angle, <italic>k</italic><sub>1</sub>&#x02212;1 angles are computed such that the angles split the channels into <italic>k</italic><sub>1</sub> equally sized regions. Here, the size of a region refers to the number of channels within it. 
For all newly generated regions, the same procedure is repeated: (1) compute the centroid, (2) generate a random angle, and (3) generate <italic>k</italic><sub>2</sub>&#x02212;1 angles such that <italic>k</italic><sub>2</sub> equally sized regions are formed. This iterative approach is executed either <italic>p</italic> times, or until the number of channels in the regions is too low, as defined by a stopping criterion <italic>min</italic>_<italic>nodes</italic>.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Example of how the EEG montage may be split into regions. In this example, the split vector was set to <bold>k</bold> &#x0003D; (5, 3)<sup><italic>T</italic></sup>. This can be observed, as the montage was first split into five regions, each of which was then split into three regions.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0003.tif"/>
</fig>
<p>For the experiments, there were seven different split vectors: <bold>k</bold> &#x0003D; (3, 3, 3)<sup><italic>T</italic></sup>, <bold>k</bold> &#x0003D; (4, 2, 4)<sup><italic>T</italic></sup>, <bold>k</bold> &#x0003D; (2, 4, 2)<sup><italic>T</italic></sup>, <bold>k</bold> &#x0003D; (2, 2, 2, 2, 2)<sup><italic>T</italic></sup>, <bold>k</bold> &#x0003D; (3, 2, 3)<sup><italic>T</italic></sup>, <bold>k</bold> &#x0003D; (2, 3, 2)<sup><italic>T</italic></sup>, and <bold>k</bold> &#x0003D; (3, 4, 2)<sup><italic>T</italic></sup>. For each montage split, the selection of <bold>k</bold> was made by random sampling with equal probabilities. The stopping criterion was one of the hyperparameters in the grid search, with candidate values <italic>min</italic>_<italic>nodes</italic>&#x02208;{1, 2, 3}.</p></sec>
<sec>
<title>2.4.2 Pooling operations</title>
<p>To enable compatibility with a varied number of channels with possibly different channel positions, it is a prerequisite to define pooling mechanisms which can take as input and handle multivariate time series of different dimensions within the regions. That is, the mechanisms applied within the regions must be able to map a varied number of channels to a single region representation. Finding sophisticated mechanisms with this property may be crucial for RBP. This subsection presents several approaches for pooling mechanisms.</p>
<sec>
<title>2.4.2.1 Average</title>
<p>The first pooling mechanism is to merge the channels within a region by computing their mean along the channel dimension. This is a simple and time-efficient method which aggregates the channels with equal contributions when computing region representations.</p></sec>
<sec>
<title>2.4.2.2 Channel attention</title>
<p>A second pooling mechanism is to select the key channels by first assigning an importance score, and secondly merge the channels by computing a weighted average based on the importance scores. Mathematically, this may be accomplished by defining a function <italic>g</italic>:&#x0211D;<sup>1 &#x000D7; <italic>T</italic></sup> &#x02192; &#x0211D;, where <italic>T</italic> denotes the number of time samples, applied on all time series within the region, and using the values obtained to compute coefficients of a linear combination, as illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref>. Applying <italic>g</italic> to each channel in a region gives an importance scalar for each channel, which is subsequently concatenated and passed to a softmax activation function, giving the channel attention vector of the <italic>i</italic>-th montage split and <italic>j</italic>-th region <inline-formula><mml:math id="M7"><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo stretchy="false">{</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi><mml:mo 
stretchy="false">|</mml:mo></mml:mrow></mml:msup><mml:mo>:</mml:mo><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:math></inline-formula>. The vectors <bold>a</bold><sup>(<italic>i, j</italic>)</sup> have the properties that the entries are positive and sum to one due to the softmax activation function. After computing <bold>a</bold><sup>(<italic>i, j</italic>)</sup>, the channels of the <italic>i</italic>-th montage split and <italic>j</italic>-th region are pooled by weighted averaging <inline-formula><mml:math id="M8"><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mi>C</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathvariant="sans-serif"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, where <inline-formula><mml:math id="M9"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi><mml:mo>|</mml:mo><mml:mo>&#x000D7;</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> are the EEG time series of all channels within the region.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Illustration of channel attention mechanism. An importance scalar is computed for each channel, and the attention vector is computed by applying softmax on a concatenation of these. The elements of the attention vector are used as coefficients to compute a linear combination of the channels.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0004.tif"/>
</fig>
<p><italic>ROCKET-based features</italic>: Random Convolutional Kernel Transform (ROCKET) (Dempster et al., <xref ref-type="bibr" rid="B9">2020</xref>) is a highly efficient time series classifier, which obtained high performance in a short time frame in a multivariate time series classification bake off (Ruiz et al., <xref ref-type="bibr" rid="B33">2021</xref>). For feature extraction, ROCKET applies a large number of diverse, random and non-trainable convolutional kernels, and computes the proportion of positive values and maximum value of the resulting feature maps. This was adopted as a pooling mechanism, where the proportion of positive values and max values of the feature maps were used for computing the importance score of a channel. A trainable fully connected module with scalar output, specific to the <italic>i</italic>-th montage split and <italic>j</italic>-th region, <italic>FC</italic><sup>(<italic>i, j</italic>)</sup>:&#x0211D;<sup><italic>num</italic>_<italic>kernels</italic>&#x000B7;2</sup> &#x02192; &#x0211D;, was applied to the <italic>num</italic>_<italic>kernels</italic> &#x000B7; 2 features. After computing the importance scores for all time series in the region, a softmax activation function was applied to obtain positive coefficients only, which sum to one. A desirable property of using non-trainable convolutional kernels is that the output feature maps (along with proportion of positive values and max values) are computed only once per subject, prior to training. Therefore, the computational cost of a large number of convolutions may be justified by the fact that they can be pre-computed.</p>
<p>The number of convolutional kernels was set to 1000, and the maximum receptive field in the temporal dimension to 250, which corresponds to half a second with the given sampling rate. This was based on computational feasibility, taking both time consumption and memory usage on limited hardware into account. Furthermore, no padding was used, in contrast to the original implementation. The ROCKET features were pre-computed prior to training, as the convolutional kernel weights were frozen, and the proportion of positive values and max values of the feature maps were thus constant per channel and subject during training. Furthermore, the ROCKET kernels were shared across all regions and montage splits to reduce runtime. The FC modules mapping the <italic>num</italic>_<italic>kernels</italic>&#x000B7;2 features to a single coefficient, used only a single fully connected layer with linear activation function. That is, for every subject, the importance score of the <italic>k</italic>-th channel in the <italic>j</italic>-th region of the <italic>i</italic>-th montage split prior to softmax normalization, was computed as <inline-formula><mml:math id="M10"><mml:msup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>F</mml:mi><mml:msup><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>w</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mstyle mathvariant="sans-serif"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, where <inline-formula><mml:math id="M11"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>w</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>u</mml:mi><mml:mi>m</mml:mi><mml:mstyle class="text"><mml:mtext>_</mml:mtext></mml:mstyle><mml:mi>k</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> is a trainable weight vector of the <italic>j</italic>-th region of the <italic>i</italic>-th montage split, <inline-formula><mml:math id="M12"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is the time series of the <italic>k</italic>-th channel, and <inline-formula><mml:math id="M13"><mml:msub><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>u</mml:mi><mml:mi>m</mml:mi><mml:mstyle class="text"><mml:mtext>_</mml:mtext></mml:mstyle><mml:mi>k</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> is the vector of pre-computed ROCKET features of channel <italic>k</italic>.</p></sec>
<sec>
<title>2.4.2.3 Continuous channel attention</title>
<p>Another possible pooling mechanism is to apply continuous channel attention, which is illustrated in <xref ref-type="fig" rid="F5">Figure 5</xref>. In the channel attention mechanism explained in Section 2.4.2.2, it is impossible for the model to adapt its channel attention over time. Therefore, <italic>continuous channel attention</italic> is implemented by defining a function <italic>g</italic>:&#x0211D;<sup>1 &#x000D7; <italic>T</italic></sup> &#x02192; &#x0211D;<sup>1 &#x000D7; <italic>T</italic></sup>, applying <italic>g</italic> to every channel, and applying a softmax activation function in the channel dimension. That is, what was in Section 2.4.2.2 an attention vector of the <italic>j</italic>-th region in the <italic>i</italic>-th montage split <inline-formula><mml:math id="M14"><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:msup><mml:mo>:</mml:mo><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mstyle 
mathvariant="bold"><mml:mtext>q</mml:mtext></mml:mstyle><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> is replaced by an attention matrix <inline-formula><mml:math id="M15"><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>Q</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi><mml:mo>|</mml:mo><mml:mo>&#x000D7;</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>:</mml:mo><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>Q</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mo>:</mml:mo><mml:mo>,</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02200;</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, where all elements are positive and each column sums to 1 due to the softmax activation function. The region representation of the <italic>j</italic>-th region in the <italic>i</italic>-th montage split is subsequently computed as <inline-formula><mml:math id="M16"><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mi>C</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>1</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="sans-serif"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02299;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, where <bold>1</bold> is a vector of ones, &#x02299; is the Hadamard product (element-wise multiplication), and <inline-formula><mml:math id="M17"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi><mml:mo>|</mml:mo><mml:mo>&#x000D7;</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is the EEG data of the channels in <inline-formula><mml:math 
id="M18"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:math></inline-formula>. This formulation is equivalent to applying a unique attention vector per time step.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Illustration of continuous channel attention. An importance scalar is computed for every channel and time step, and the attention matrix is computed by applying softmax on a concatenation of these in the channel dimension. The attention matrix is used to compute a linear combination of the channels per time step. That is, a new linear combination is computed for each time step, allowing the pooling mechanism to shift its attention through time.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0005.tif"/>
</fig>
<p>In the experiments, an Inception network (Ismail Fawaz et al., <xref ref-type="bibr" rid="B19">2020</xref>) was used as <italic>g</italic>. The depth of the architecture was set to two Inception modules, and the number of filters was set to two for all convolutional kernels and Inception modules. These hyperparameters were set smaller than in the original study due to high memory consumption.</p></sec>
<sec>
<title>2.4.2.4 Region based pooling with head region</title>
<p>With the pooling mechanisms described in Sections 2.4.2.1, 2.4.2.2 and 2.4.2.3, RBP is not able to tailor the region representations based on other regions. As this may be an important property to possess, RBP can be extended to <italic>Region Based Pooling with a Head Region</italic>, which is illustrated in <xref ref-type="fig" rid="F6">Figure 6</xref>. A <italic>head-region</italic> is selected, which exhibits the property of being able to influence the aggregation of channels in non-head regions.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Region based pooling with a head region. The head region may influence how the channels in the non-head region should be aggregated. This is done by passing an embedding vector of the head region to the aggregation functions. By passing different embeddings to the different non-head regions, the head region is allowed to search for different features in the different spatial locations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0006.tif"/>
</fig>
<p>The region representation is computed as an aggregation of the channels, given a vector embedding of the head region. For every montage split <inline-formula><mml:math id="M49"><mml:mi mathvariant="script">M</mml:mi></mml:math></inline-formula><sub><italic>i</italic></sub>&#x02208;{<inline-formula><mml:math id="M50"><mml:mi mathvariant="script">M</mml:mi></mml:math></inline-formula><sub>1</sub>, <inline-formula><mml:math id="M51"><mml:mi mathvariant="script">M</mml:mi></mml:math></inline-formula><sub>2</sub>, ..., <inline-formula><mml:math id="M52"><mml:mi mathvariant="script">M</mml:mi></mml:math></inline-formula><sub><italic>n</italic></sub>}, a head region <inline-formula><mml:math id="M19"><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is selected. The region representation of all non-head regions <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>\</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> is computed as</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>X</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mi>C</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>G</mml:mi><mml:msup><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:mrow><mml:mo 
stretchy="true">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02200;</mml:mo><mml:mi>j</mml:mi><mml:mo>:</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>\</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E2"><label>(2)</label><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M23"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:math></inline-formula> is the search vector embedding of the head region <italic>H</italic><sup>(<italic>i</italic>)</sup> with relevance to region <inline-formula><mml:math id="M24"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>\</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M25"><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:math></inline-formula> is the function mapping the channels of the head region to <inline-formula><mml:math 
id="M26"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:math></inline-formula>, and <italic>AGG</italic><sup>(<italic>i, j</italic>)</sup> is an aggregation function. The vector embedding of the head region may thus depend on the region for which a region representation is being computed. The motivation for this is that the head region systematically searches for certain characteristics in the other regions, and such characteristics may depend on the given regions.</p>
<p>The region representation of the head region was computed as in ROCKET channel attention, introduced in Section 2.4.2.2. The search embeddings <inline-formula><mml:math id="M27"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:math></inline-formula> were computed as</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02299;</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle 
mathvariant='bold'><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mstyle mathvariant='bold'><mml:mn>1</mml:mn></mml:mstyle><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E4"><label>(4)</label><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E5"><label>(5)</label><mml:math id="M30"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003C3; is the softmax activation function computed in the channel dimension, <inline-formula><mml:math id="M31"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>Z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>u</mml:mi><mml:mi>m</mml:mi><mml:mstyle class="text"><mml:mtext>_</mml:mtext></mml:mstyle><mml:mi>k</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mo>|</mml:mo><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula> is a concatenation of the ROCKET features, and <inline-formula><mml:math id="M32"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M33"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> are trainable weight matrices of the search embedding function of region <inline-formula><mml:math id="M34"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>. The use of softmax allows the search embedding to weight the different channels in the head region differently for each ROCKET feature. The region representation of each region <inline-formula><mml:math id="M35"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>\</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> is computed per subject as</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M36"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>G</mml:mi><mml:msup><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathvariant="sans-serif"><mml:mi>T</mml:mi></mml:mstyle></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mstyle 
mathvariant='bold'><mml:mtext>X</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E7"><label>(7)</label><mml:math id="M37"><mml:mrow><mml:msub><mml:mi>a</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mo stretchy='false'>&#x0007B;</mml:mo><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>i</mml:mi><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mi>R</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>&#x0007D;</mml:mo></mml:mrow><mml:mrow><mml:mstyle displaystyle='true'><mml:msub><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mi>R</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup><mml:mo>&#x02229;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mi>e</mml:mi></mml:mstyle><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mo stretchy='false'>&#x0007B;</mml:mo><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo 
stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mi>c</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>i</mml:mi><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mi>R</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>&#x0007D;</mml:mo></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>with <italic>a</italic><sub><italic>k</italic></sub> being the elements of <bold>a</bold>. Note that the same embedding functions (<inline-formula><mml:math id="M38"><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>) are used on the channels of <inline-formula><mml:math id="M39"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">M</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>\</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> as on the channels of the head region <italic>H</italic><sup>(<italic>i</italic>)</sup>. This may be beneficial, as the embeddings share the same space, and computing similarity may thus be more meaningful.</p>
<p>For the experiments in this study, the number of rows in the weight matrices <inline-formula><mml:math id="M40"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M41"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> (and hence the dimensionality of the search vector embeddings <inline-formula><mml:math id="M42"><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x02192;</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:msubsup></mml:math></inline-formula>) was set to 64, for all <italic>i</italic> and <italic>j</italic>.</p></sec></sec></sec>
<sec>
<title>2.5 Experiments</title>
<p>All models were implemented using PyTorch (Paszke et al., <xref ref-type="bibr" rid="B29">2019</xref>), version 1.10.1&#x0002B;cu113. The hardware used was a computer equipped with an NVIDIA GeForce RTX 3060 12GB GPU. The code is publicly available on GitHub.<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref></p>
<p>All models were run with the learning rate set to 0.0001. The maximum number of epochs was set to 50, except for RBP with continuous channel attention, which used 20 epochs due to high time consumption. The batch size was mainly set to 16, although some models required a smaller batch size due to memory constraints. The exceptions are listed in <xref ref-type="table" rid="T1">Table 1</xref>. Experiments using zero-filling and spherical spline interpolation were run with the batch size set to 4, 8, 16, and 32, to ensure that potential improvements were not due to differences in batch size. Adam (Kingma and Ba, <xref ref-type="bibr" rid="B21">2015</xref>) and binary cross-entropy (computed from logits for improved numerical stability) were used as the optimization technique and loss function, respectively.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Overview of batch sizes used for grid search.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Pooling mechanism</bold></th>
<th valign="top" align="left"><bold>Min. number of electrodes</bold></th>
<th valign="top" align="left"><bold>Number of montage splits</bold></th>
<th valign="top" align="left"><bold>Batch size</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">With head region</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">50</td>
<td valign="top" align="left">8</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">2</td>
<td valign="top" align="left">50</td>
<td valign="top" align="left">8</td>
</tr>
<tr>
<td valign="top" align="left">Continuous attention</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">2</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">3</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">1</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">2</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">2</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">2</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">3</td>
<td valign="top" align="left">10</td>
<td valign="top" align="left">2</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">1</td>
<td valign="top" align="left">25</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">2</td>
<td valign="top" align="left">25</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">3</td>
<td valign="top" align="left">25</td>
<td valign="top" align="left">1</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>All RBP models not listed used a batch size of 16.</p>
</table-wrap-foot>
</table-wrap>
<p>For all experiments, a 5-fold cross validation strategy was carried out. For every fold, the 4 folds not used for testing were split 75/25 into training and validation sets. The training data was used to optimize the trainable parameters of the DL models, whereas the validation data was used to determine at which epoch to stop. Within each fold, only the model parameters that obtained the highest area under the receiver operating characteristic curve (AUC) on the validation set (computed as the mean performance on the 32, 65, and 129-channel versions of the channel system) were used when testing on the test data fold.</p>
<p>To evaluate the sensitivity with respect to two new hyperparameters introduced by RBP, a grid search was performed for all pooling mechanisms. The first hyperparameter was <italic>min</italic>_<italic>nodes</italic>, which is the smallest number of channels allowed per region in the 32-channel version of the channel system. The smaller the <italic>min</italic>_<italic>nodes</italic>, the smaller the regions are allowed to be when splitting the montage. The second hyperparameter was <italic>num</italic>_<italic>montage</italic>_<italic>splits</italic>, which is the number of montage splits performed. The grid search was carried out with <italic>min</italic>_<italic>nodes</italic>&#x02208;{1, 2, 3} and <italic>num</italic>_<italic>montage</italic>_<italic>splits</italic>&#x02208;{5, 10, 25, 50}, with the exception of RBP using continuous channel attention, which was restricted to <italic>num</italic>_<italic>montage</italic>_<italic>splits</italic>&#x02208;{5, 10, 25} due to memory limitations.</p></sec></sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p><xref ref-type="fig" rid="F7">Figures 7</xref>&#x02013;<xref ref-type="fig" rid="F9">9</xref> show the results of the grid search for the different pooling methods, on the channel systems with 32, 65, and 129 channels, respectively. The number in each entry represents the average performance estimate on the test sets after conducting a 5-fold cross validation. The results show that the performance is more sensitive to the selected hyperparameters for the low-resolution channel systems than for the 129-channel system version. In particular, RBP seems to favor smaller regions per montage split for the downsampled channel systems.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Mean performance on the channel system with 32 electrodes, as a function of number of montage splits and number of allowed electrodes in the smallest channel system.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0007.tif"/>
</fig>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Mean performance on the channel system with 65 electrodes, as a function of number of montage splits and number of allowed electrodes in the smallest channel system.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0008.tif"/>
</fig>
<fig id="F9" position="float">
<label>Figure 9</label>
<caption><p>Mean performance on the channel system with 129 electrodes, as a function of number of montage splits and number of allowed electrodes in the smallest channel system.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0009.tif"/>
</fig>
<p><xref ref-type="fig" rid="F10">Figure 10</xref> compares the performance of using RBP, spherical spline interpolation, and zero-filling. The RBP model selected used ROCKET channel attention as pooling mechanism, with the number of montage splits set to 25, and <italic>min</italic>_<italic>nodes</italic> set to 1. The model selection was based on the mean validation performance on 5-fold cross validation and maximizing the mean performance on the three channel systems. The selected models using spherical spline interpolation and zero-filling used batch sizes of 32 and 8, respectively, following the same model selection procedure as for RBP. For the 32-channel system version, the mean AUC values were as follows: RBP (93.34%), spherical spline interpolation (93.36%), and zero-filling (76.82%). On the 65-channel system version, the performances were RBP (93.66%), spherical spline interpolation (93.50%), and zero-filling (85.58%). Finally, the 129-channel system version produced the following results: RBP (94.68%), spherical spline interpolation (93.86%), and zero-filling (91.92%).</p>
<fig id="F10" position="float">
<label>Figure 10</label>
<caption><p>Results of sex prediction using the Inception network in combination with RBP (blue), spherical spline interpolation (orange), and zero-filling (green). The splitting into 5 folds was identical for the different methods, and only the five performance estimates from the test sets are plotted. For the channel system with <italic>c</italic> &#x0003D; 129, interpolation and zero-filling are technically the same, as there are no channels to interpolate nor zero-fill. The model selection procedure, however, selected different batch sizes, and the performance differences are therefore attributed to both the model selection and differences in initialization of weights.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fninf-17-1272791-g0010.tif"/>
</fig></sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<sec>
<title>4.1 RBP for handling a varied number of channels</title>
<p>RBP shows highly similar performance to spherical spline interpolation for all channel systems, as seen in <xref ref-type="fig" rid="F10">Figure 10</xref>. Both RBP and spherical spline interpolation demonstrate robustness in handling a varied number of channels, as indicated by the minor performance degradation observed on the down-sampled channel systems. A potential decrease in performance when reducing the number of channels should not necessarily be interpreted as a weakness of these methods but may be due to a loss of information when removing channels. The objective of the methods is to handle the channel down-sampling with the smallest possible reduction in performance, although no method can restore the fully lost information. In contrast to RBP and spherical spline interpolation, zero-filling missing channels vastly reduces the performance on the lower resolution channel systems. Zero-filling is therefore not a recommended approach for handling missing channels, despite its use in, e.g., the official preprocessed version of the EEG data of the Child Mind Institute (Alexander et al., <xref ref-type="bibr" rid="B3">2017</xref>).</p>
<p>The results from the grid searches on the different pooling mechanisms indicate that the selection of pooling mechanism was unimportant for the selected task and dataset, except for continuous channel attention with 25 montage splits. However, the batch size for that configuration was set to 1 due to memory constraints, which is not optimal for training, and thus a strong confounder. More research is therefore needed to assess if a high number of montage splits failed in continuous channel attention due to inadequacy of the pooling mechanism or if it is solely due to the batch size. No pooling mechanism was superior to the others for all hyperparameters. A consistent trend appears to be that RBP benefits from smaller regions, as the performance, especially on the channel systems with 32 and 65 channels, seems to increase when the stopping criterion <italic>min</italic>_<italic>nodes</italic> decreases. This is not an unexpected finding as using smaller regions increases the spatial resolution per montage split. The current results further suggest that solely increasing the number of montage splits is insufficient when the regions are excessively large. However, as future work may include even smaller channel systems, larger regions may be beneficial from a practical point of view. Finding the optimal balance between compatibility with low-resolution channel systems and model performance may therefore be important for future research. However, as the model was trained only on 129 channels, the performance on the low-resolution channel systems may be increased by including them in the training data as well. For extension to the large-scale setting with multiple datasets, this is likely to be a feasible approach. Furthermore, it may be used as a data augmentation technique, in particular when the high-resolution channel system has low-resolution equivalents.</p>
<p>This study proposed an algorithm for splitting the EEG montage into regions although no optimization of montage splits was performed. It is likely that different EEG related problems may benefit from different montage splits. This is because the important spatial features may be task related and require higher or lower resolution of some areas. Furthermore, as only one algorithm for splitting the EEG montage into regions was tested, future work could benefit from exploring and evaluating alternative methods. Note that with the current use of regions having defined boundaries, where an electrode is either inside or not inside a region, optimizing montage splits by gradient based methods cannot work directly. This is because an infinitely small change to the boundaries of the region will either cause zero change in output or an output change of fixed size (not infinitely small, as required). The gradients would thus be either zero or infinite, making gradient based learning infeasible. Two potential solutions are further discussed in Section 4.4.2.</p></sec>
<sec>
<title>4.2 Related work</title>
<p>As discussed in Wei et al. (<xref ref-type="bibr" rid="B37">2022</xref>), few studies have focused on generalizing DL models to handle the cross-dataset setting and a varied number of channels. A desired outcome of the BEETL competition was to develop transfer learning techniques in the cross-dataset setting (Wei et al., <xref ref-type="bibr" rid="B37">2022</xref>). However, the top three entries selected simple methods to handle a varied number of channels and the difference in channel locations; channel removal, dataset removal, or both. Furthermore, to handle a varied number of channels in the pre-training and downstream training, Kostas et al. (<xref ref-type="bibr" rid="B22">2021</xref>) mapped all datasets to 19 channels, and in that process, sacrificed a considerable part of the data for several of the datasets used for downstream training. However, research from clinical neurology suggests that certain characteristics require high-density EEG with an increased number of channels (Kuhnke et al., <xref ref-type="bibr" rid="B23">2018</xref>; Hatlestad-Hall et al., <xref ref-type="bibr" rid="B14">2023</xref>). The feasibility of downsampling the spatial resolution may therefore be limited to only a subset of EEG-related tasks.</p>
<p>Li and Metsis (<xref ref-type="bibr" rid="B25">2022</xref>) developed SPP-EEGNET, an architecture that is designed for inter-dataset transfer learning and is compatible with a varied number of channels. However, SPP-EEGNET pools the feature maps by spatial pyramid pooling (SPP) (He et al., <xref ref-type="bibr" rid="B15">2014</xref>) after convolutions have been applied channel-wise. Cross-channel patterns can therefore not be extracted by the convolutional module of SPP-EEGNET as the receptive fields of the feature maps are bounded to their respective single channel. Such cross-channel patterns may only be extracted by the fully connected module, after applying the SPP layer. As the success of signal processing is mostly attributed to the convolutional module, this approach may be sub-optimal. Furthermore, many existing DL architectures for EEG data apply 1D convolutions across channels, hindering the application of the SPP-EEGNET approach to many of the currently existing architectures. This contrasts with RBP, which is compatible with any DL model for multivariate time series classification/regression.<xref ref-type="fn" rid="fn0003"><sup>3</sup></xref> This is beneficial, as the current high-performing models from literature may apply RBP with ease (simply use RBP as the initial layer), meaning the accumulated research and development on DL architectures over time is respected. Furthermore, it offers a simple solution for working on the cross-dataset and cross-channel system setting in the future. Note also that although this study represented the EEG data as time series, using other representations such as power spectral density or operating on wavelet transformed images are popular choices of input to DL models. RBP is indeed compatible with such representations, although the pooling mechanisms must be tailored to fit the input domain.
Finally, the pooling in RBP is performed based on the spatial positions of the electrodes, whereas SPP-EEGNET does not precisely specify how the feature maps of the different channels were merged. If the pooling is made only by the data matrix <bold>X</bold> [as if it was an image, following the original SPP-net (He et al., <xref ref-type="bibr" rid="B15">2014</xref>)], then inconsistency in which channels end up in which spatial region will occur.</p></sec>
<sec>
<title>4.3 Limitations of the study</title>
<p>A limitation of this study is its reliance on a single dataset and classification problem, which may restrict the generalizability of the findings. In particular, the size of the dataset was larger than what is commonly available for EEG datasets with more clinically relevant labels. When the total number of region representations exceeds the number of channels in a given channel system, RBP effectively expands the dimensionality of the data. This is especially the case when the regions are small, and the number of montage splits is large. For smaller datasets in particular, this may lead to an increased risk of overfitting. The generalizability of the results to smaller datasets, and in particular, the effect of the hyperparameters <italic>min</italic>_<italic>nodes</italic> and <italic>num</italic>_<italic>montage</italic>_<italic>splits</italic> is therefore poorly investigated. While testing the methods on sex classification allowed for a large dataset with low chance of false labeling, its clinical utility is low. Thus, classification/regression problems with higher clinical relevance should be considered in the future. Furthermore, only a single model (Inception network) was used in combination with the three different methods for handling a varied number of electrodes. Although Inception network is an effective DL model for multivariate time series analysis, generalization to other models was not assessed. This is needed due to the high number of DL models used for EEG analysis. Finally, hardware limitations prevented training all RBP models with the same batch size, potentially reducing the performance of the models with smaller batch size. By testing with more models, datasets, and classification/regression problems, the relevance of the methods will thus be better addressed.
In particular, to fully explore the potential and relevance of the investigated methods, experiments including datasets with even smaller numbers of EEG channels, such as 19 or 25, are required.</p></sec>
<sec>
<title>4.4 Future work</title>
<sec>
<title>4.4.1 Pooling mechanisms and hyperparameters</title>
<p>The use of features as computed in ROCKET, and a single linear layer to compute the importance score of a channel, provides a light-weight method for computing channel attention. It was selected based on its light-weightedness as the sole purpose of the pooling mechanism is to compute coefficients of a linear combination. Furthermore, the extracted ROCKET features could be pre-computed prior to training, making it a pragmatic choice for run-time efficiency. Using more powerful DL models was hypothesized to be unnecessary and overpowered for such a task, although in the absence of proper experimental results in this regard, final conclusions cannot be drawn. Using pooling mechanisms that select not only the channels of interest but also the frequency bands of interest is a possible future direction.</p>
<p>All pooling mechanisms used in the experiments were compatible with a single channel per region. This is the case, e.g., for computing channel attention using ROCKET features, as the function <italic>g</italic> for computing the importance score of a channel only uses the features of that very channel. Future work may attempt to define pooling mechanisms which require more than one channel per region. This may be accomplished by e.g. extending the input domain and output range of <italic>g</italic> to <inline-formula><mml:math id="M43"><mml:mi>g</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02192;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula>, where <italic>p</italic><sub><italic>in</italic></sub> is the lower bound of accepted number of channels in a region, and <italic>p</italic><sub><italic>out</italic></sub> is the number of output features per application of <italic>g</italic>. However, as this may either require larger regions (which by the current results does not appear to be favorable) or lead to incompatibility with the low-resolution channel systems, it is important to determine if the potential benefits outweigh the drawbacks in future research.</p>
<p>All experiments in this study merged the different montage splits by concatenation directly after the pooling was made. Another approach could be to apply convolutional modules separately on the montage splits, prior to merging them. Furthermore, other approaches such as summation, averaging, or alternating between applying convolution and adding a montage split such as skip connections, are examples of other possible pooling strategies. In particular, merging montage splits by skip connections and using dynamic neural networks (Han et al., <xref ref-type="bibr" rid="B13">2022</xref>) to e.g. perform a sample or channel system conditioned number of montage splits by early exiting or layer skipping is a possible future direction. By using dynamic architectures, more montage splits could be used on the high-resolution channel systems, and fewer montage splits could be used on the low-resolution channel systems. Furthermore, montage splits with small regions could be used on high-resolution channel systems only, possibly alleviating the here observed trade-off between performance and low-resolution compatibility.</p></sec>
<sec>
<title>4.4.2 Splitting into regions</title>
<p>While the current study did not perform any optimization of the splitting of the EEG montage into regions, two possible solutions which may be explored in the future are (1) using other techniques for optimizing, where one approach could be to generate many splits and apply sparsity, and (2) introducing soft regions, where each electrode is assigned a non-binary weight for its presence in the region. A region could e.g. be represented as a Gaussian, where the mean and standard deviation are treated as trainable parameters. The influence of a specific channel on a region representation would be determined by both an importance score calculated from a function <italic>g</italic> operating on the time series, and its spatial importance given the properties of the region (e.g., mean and standard deviation).</p></sec>
<sec>
<title>4.4.3 Training strategies with large amounts of data</title>
<p>A major motivation behind RBP is to enable the use of multiple and heterogeneous datasets with a varied number of channels for different training strategies. Large-scale use of multiple datasets should be tested for methods such as pre-training (e.g., transfer learning or self-supervised learning), representation learning (e.g., self-supervised or unsupervised learning), and simply using more datasets if the same targets are available. Fixing different electrode arrays and using spherical spline interpolation in the case of varied channel systems across the datasets should be used as baselines.</p>
<p>For the AI-Mind project, this may be of high relevance for both improving the DL model performance and generalization. While the project aims at collecting a dataset comprised of 1,000 participants and possibly expanding this with synthetic data, this is not guaranteed to be sufficient for DL models. Improving data efficiency and model performance by the abovementioned training strategies may be enhanced by enabling them in the cross-channel system setting. Furthermore, data collection from four different countries and five different clinical sites is likely to mitigate bias to some extent. However, its sufficiency is difficult to address a priori. Two arguments against this are that (1) all clinical sites are situated in European countries, and (2) the hardware for EEG recordings is the same. Thus, by applying the abovementioned training strategies to heterogeneous datasets, the ability of the DL models to generalize across populations and hardware may be improved.</p></sec></sec></sec>
<sec sec-type="conclusions" id="s5">
<title>5 Conclusion</title>
<p>Region based pooling was introduced for deep learning models to handle a varied number of EEG channels. Furthermore, its adequacy in maintaining performance when downsampling the channel system was experimentally demonstrated. Grid search was used to assess the effect of two new hyperparameters, which relate to the size of the regions and the number of montage splits. Several pooling mechanisms were introduced and tested, yielding highly similar results. Region based pooling obtained similar results to spherical spline interpolation, and superior results to zero-filling missing channels when downsampling the channel system to 65 and 32 channels. Zero-filling missing channels is therefore not a recommended method for handling a varied number of channels. Future work includes applying region based pooling on multiple and heterogeneous datasets with different EEG channel systems. In particular, large-scale pre-training and representation learning in combination with region based pooling will be investigated.</p></sec>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p></sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements. Written informed consent to participate in this study was not required from the participants or the participants&#x00027; legal guardians/next of kin in accordance with the national legislation and the institutional requirements.</p></sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>TT: Conceptualization, Data curation, Formal analysis, Methodology, Visualization, Writing&#x02014;original draft, Writing&#x02014;review &#x00026; editing. MT: Data curation, Validation, Writing&#x02014;review &#x00026; editing. AP: Writing&#x02014;review &#x00026; editing. CH-H: Data curation, Supervision, Writing&#x02014;review &#x00026; editing, Project administration. AY: Supervision, Writing&#x02014;review &#x00026; editing. HH: Methodology, Supervision, Validation, Writing&#x02014;review &#x00026; editing. IH: Funding acquisition, Project administration, Resources, Supervision, Writing&#x02014;review &#x00026; editing.</p></sec>
</body>
<back>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This project has received funding from the European Union&#x00027;s Horizon 2020 research and innovation programme under grant agreement No 964220.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest. The author(s) declared that they were an editorial board member of Frontiers, at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Author disclaimer</title>
<p>This publication reflects views of the author and the European Commission is not responsible for any use that may be made of the information it contains.</p>
</sec>
<fn-group>
<fn id="fn0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/hatlestad-hall/prep-childmind-eeg">https://github.com/hatlestad-hall/prep-childmind-eeg</ext-link></p></fn>
<fn id="fn0002"><p><sup>2</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/thomastveitstol/RegionBasedPoolingEEG">https://github.com/thomastveitstol/RegionBasedPoolingEEG</ext-link></p></fn>
<fn id="fn0003"><p><sup>3</sup>Although one requirement is differentiability, if the pooling mechanism has parameters to be optimized as part of the gradient based learning.</p></fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Abo Alzahab</surname> <given-names>N.</given-names></name> <name><surname>Apollonio</surname> <given-names>L.</given-names></name> <name><surname>Di Iorio</surname> <given-names>A.</given-names></name> <name><surname>Alshalak</surname> <given-names>M.</given-names></name> <name><surname>Iarlori</surname> <given-names>S.</given-names></name> <name><surname>Ferracuti</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Hybrid deep learning (hdl)-based brain-computer interface (bci) systems: a systematic review</article-title>. <source>Brain Sci</source>. <volume>11</volume>, <fpage>75</fpage>. <pub-id pub-id-type="doi">10.3390/brainsci11010075</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ahmad</surname> <given-names>I.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Zhu</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Pi</surname> <given-names>Y.</given-names></name> <name><surname>Khan</surname> <given-names>J. A.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Eeg-based epileptic seizure detection via machine/deep learning approaches: a systematic review</article-title>. <source>Intell. Neurosci</source>. <volume>2022</volume>, <fpage>6486570</fpage>. <pub-id pub-id-type="doi">10.1155/2022/6486570</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alexander</surname> <given-names>L. M.</given-names></name> <name><surname>Escalera</surname> <given-names>J.</given-names></name> <name><surname>Ai</surname> <given-names>L.</given-names></name> <name><surname>Andreotti</surname> <given-names>C.</given-names></name> <name><surname>Febre</surname> <given-names>K.</given-names></name> <name><surname>Mangone</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>An open resource for transdiagnostic research in pediatric mental health and learning disorders</article-title>. <source>Sci. Data</source> <volume>4</volume>, <fpage>170181</fpage>. <pub-id pub-id-type="doi">10.1038/sdata.2017.181</pub-id><pub-id pub-id-type="pmid">29257126</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Banville</surname> <given-names>H.</given-names></name> <name><surname>Chehab</surname> <given-names>O.</given-names></name> <name><surname>Hyvarinen</surname> <given-names>A.</given-names></name> <name><surname>Engemann</surname> <given-names>D.-A.</given-names></name> <name><surname>Gramfort</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>Uncovering the structure of clinical EEG signals with self-supervised learning</article-title>. <source>J. Neural Eng</source>. <volume>18</volume>, <fpage>46020</fpage>. <pub-id pub-id-type="doi">10.1088/1741-2552/abca18</pub-id><pub-id pub-id-type="pmid">33181507</pub-id></citation></ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>R. J.</given-names></name> <name><surname>Wang</surname> <given-names>J. J.</given-names></name> <name><surname>Williamson</surname> <given-names>D. F. K.</given-names></name> <name><surname>Chen</surname> <given-names>T. Y.</given-names></name> <name><surname>Lipkova</surname> <given-names>J.</given-names></name> <name><surname>Lu</surname> <given-names>M. Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Algorithmic fairness in artificial intelligence for medicine and healthcare</article-title>. <source>Nat. Biomed. Eng</source>. <volume>7</volume>, <fpage>719</fpage>&#x02013;<lpage>742</lpage>. <pub-id pub-id-type="doi">10.1038/s41551-023-01056-8</pub-id><pub-id pub-id-type="pmid">37380750</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Bardeci</surname> <given-names>M.</given-names></name> <name><surname>Ip</surname> <given-names>C. T.</given-names></name> <name><surname>Olbrich</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>Deep learning applied to electroencephalogram data in mental disorders: a systematic review</article-title>. <source>Biol. Psychol</source>. <volume>162</volume>, <fpage>108117</fpage>. <pub-id pub-id-type="doi">10.1016/j.biopsycho.2021.108117</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Cheveign&#x000E9;</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Zapline: a simple and effective method to remove power line artifacts</article-title>. <source>Neuroimage</source> <volume>207</volume>, <fpage>116356</fpage>. <pub-id pub-id-type="doi">10.1016/j.neuroimage.2019.116356</pub-id><pub-id pub-id-type="pmid">31786167</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delorme</surname> <given-names>A.</given-names></name> <name><surname>Makeig</surname> <given-names>S.</given-names></name></person-group> (<year>2004</year>). <article-title>Eeglab: an open source toolbox for analysis of single-trial eeg dynamics including independent component analysis</article-title>. <source>J. Neurosci. Methods</source> <volume>134</volume>, <fpage>9</fpage>&#x02013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1016/j.jneumeth.2003.10.009</pub-id><pub-id pub-id-type="pmid">15102499</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dempster</surname> <given-names>A.</given-names></name> <name><surname>Petitjean</surname> <given-names>F.</given-names></name> <name><surname>Webb</surname> <given-names>G. I.</given-names></name></person-group> (<year>2020</year>). <article-title>ROCKET: exceptionally fast and accurate time series classification using random convolutional kernels</article-title>. <source>Data Min. Knowl. Discov</source>. <volume>34</volume>, <fpage>1454</fpage>&#x02013;<lpage>1495</lpage>. <pub-id pub-id-type="doi">10.1007/s10618-020-00701-z</pub-id></citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Farsi</surname> <given-names>L.</given-names></name> <name><surname>Siuly</surname> <given-names>S.</given-names></name> <name><surname>Kabir</surname> <given-names>E.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>Classification of alcoholic eeg signals using a deep learning method</article-title>. <source>IEEE Sens. J</source>. <volume>21</volume>, <fpage>3552</fpage>&#x02013;<lpage>3560</lpage>. <pub-id pub-id-type="doi">10.1109/JSEN.2020.3026830</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Goodfellow</surname> <given-names>I.</given-names></name> <name><surname>Pouget-Abadie</surname> <given-names>J.</given-names></name> <name><surname>Mirza</surname> <given-names>M.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <name><surname>Warde-Farley</surname> <given-names>D.</given-names></name> <name><surname>Ozair</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>&#x0201C;Generative adversarial nets,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems, Vol. 27</source>, eds Z. Ghahramani, M. Welling, C. Cortes, N. Lawrence, and K. Weinberger (Curran Associates, Inc.).</citation>
</ref>
<ref id="B12">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Grieves</surname> <given-names>M.</given-names></name> <name><surname>Vickers</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <source>Digital Twin: Mitigating Unpredictable, Undesirable Emergent Behavior in Complex Systems</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>85</fpage>&#x02013;<lpage>113</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>G.</given-names></name> <name><surname>Song</surname> <given-names>S.</given-names></name> <name><surname>Yang</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Dynamic neural networks: a survey</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>44</volume>, <fpage>7436</fpage>&#x02013;<lpage>7456</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2021.3117837</pub-id><pub-id pub-id-type="pmid">34613907</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hatlestad-Hall</surname> <given-names>C.</given-names></name> <name><surname>Bru&#x000F1;a</surname> <given-names>R.</given-names></name> <name><surname>Liljestr&#x000F6;m</surname> <given-names>M.</given-names></name> <name><surname>Renvall</surname> <given-names>H.</given-names></name> <name><surname>Heuser</surname> <given-names>K.</given-names></name> <name><surname>Taub&#x000F8;ll</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Reliable evaluation of functional connectivity and graph theory measures in source-level eeg: how many electrodes are enough?</article-title> <source>Clin. Neurophysiol</source>. <volume>150</volume>, <fpage>1</fpage>&#x02013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1016/j.clinph.2023.03.002</pub-id><pub-id pub-id-type="pmid">36972647</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Spatial pyramid pooling in deep convolutional networks for visual recognition,&#x0201D;</article-title> in <source>Computer Vision-ECCV 2014</source>, eds D. Fleet, T. Pajdla, B. Schiele, and T. Tuytelaars (Cham: Springer International Publishing), <fpage>346</fpage>&#x02013;<lpage>361</lpage>.<pub-id pub-id-type="pmid">26353135</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hendrycks</surname> <given-names>D.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Mazeika</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Using pre-training can improve model robustness and uncertainty,&#x0201D;</article-title> in <source>Proceedings of the 36th International Conference on Machine Learning, volume 97 of Proceedings of Machine Learning Research</source>, eds K. Chaudhuri, and R. Salakhutdinov (PMLR), <fpage>2712</fpage>&#x02013;<lpage>2721</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>Deep learning&#x02014;a technology with the potential to transform health care</article-title>. <source>JAMA</source> <volume>320</volume>, <fpage>1101</fpage>&#x02013;<lpage>1102</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2018.11100</pub-id><pub-id pub-id-type="pmid">30178065</pub-id></citation></ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Houssein</surname> <given-names>E.</given-names></name> <name><surname>Hamad</surname> <given-names>A.</given-names></name> <name><surname>Ali</surname> <given-names>A.</given-names></name></person-group> (<year>2022</year>). <article-title>Human emotion recognition from eeg-based brain&#x02013;computer interface using machine learning: a comprehensive review</article-title>. <source>Neural Comp. Appl</source>. <volume>34</volume>, <fpage>12527</fpage>&#x02013;<lpage>12557</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-022-07292-4</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ismail Fawaz</surname> <given-names>H.</given-names></name> <name><surname>Lucas</surname> <given-names>B.</given-names></name> <name><surname>Forestier</surname> <given-names>G.</given-names></name> <name><surname>Pelletier</surname> <given-names>C.</given-names></name> <name><surname>Schmidt</surname> <given-names>D. F.</given-names></name> <name><surname>Weber</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Inceptiontime: Finding alexnet for time series classification</article-title>. <source>Data Min. Knowl. Discov</source>. <volume>34</volume>, <fpage>1936</fpage>&#x02013;<lpage>1962</lpage>. <pub-id pub-id-type="doi">10.1007/s10618-020-00710-y</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kelly</surname> <given-names>C. J.</given-names></name> <name><surname>Karthikesalingam</surname> <given-names>A.</given-names></name> <name><surname>Suleyman</surname> <given-names>M.</given-names></name> <name><surname>Corrado</surname> <given-names>G.</given-names></name> <name><surname>King</surname> <given-names>D.</given-names></name></person-group> (<year>2019</year>). <article-title>Key challenges for delivering clinical impact with artificial intelligence</article-title>. <source>BMC Med</source>. <volume>17</volume>, <fpage>195</fpage>. <pub-id pub-id-type="doi">10.1186/s12916-019-1426-2</pub-id><pub-id pub-id-type="pmid">31665002</pub-id></citation></ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Adam: a method for stochastic optimization,&#x0201D;</article-title> in <source>3rd International Conference on Learning Representations, ICLR 2015, Conference Track Proceedings</source>, eds Y. Bengio, and Y. LeCun (San Diego, CA), <fpage>7</fpage>&#x02013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kostas</surname> <given-names>D.</given-names></name> <name><surname>Aroca-Ouellette</surname> <given-names>S.</given-names></name> <name><surname>Rudzicz</surname> <given-names>F.</given-names></name></person-group> (<year>2021</year>). <article-title>BENDR: using transformers and a contrastive self-supervised learning task to learn from massive amounts of EEG data</article-title>. <source>Front. Hum. Neurosci</source>. <volume>15</volume>, <fpage>653659</fpage>. <pub-id pub-id-type="doi">10.3389/fnhum.2021.653659</pub-id><pub-id pub-id-type="pmid">34248521</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kuhnke</surname> <given-names>N.</given-names></name> <name><surname>Schwind</surname> <given-names>J.</given-names></name> <name><surname>D&#x000FC;mpelmann</surname> <given-names>M.</given-names></name> <name><surname>Mader</surname> <given-names>M.</given-names></name> <name><surname>Schulze-Bonhage</surname> <given-names>A.</given-names></name> <name><surname>Jacobs</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>High frequency oscillations in the ripple band (80&#x02013;250 Hz) in scalp EEG: Higher density of electrodes allows for better localization of the seizure onset zone</article-title>. <source>Brain Topogr</source>. <volume>31</volume>, <fpage>1059</fpage>&#x02013;<lpage>1072</lpage>. <pub-id pub-id-type="doi">10.1007/s10548-018-0658-3</pub-id><pub-id pub-id-type="pmid">29980967</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep learning</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>436</fpage>&#x02013;<lpage>444</lpage>. <pub-id pub-id-type="doi">10.1038/nature14539</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Metsis</surname> <given-names>V.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;SPP-EEGNET: an input-agnostic self-supervised EEG representation model for inter-dataset transfer learning,&#x0201D;</article-title> in <source>Proceedings of the 18th International Conference on Computing and Information Technology (IC2IT 2022)</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>173</fpage>&#x02013;<lpage>182</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-99948-3_17</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lotte</surname> <given-names>F.</given-names></name> <name><surname>Bougrain</surname> <given-names>L.</given-names></name> <name><surname>Cichocki</surname> <given-names>A.</given-names></name> <name><surname>Clerc</surname> <given-names>M.</given-names></name> <name><surname>Congedo</surname> <given-names>M.</given-names></name> <name><surname>Rakotomamonjy</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>A review of classification algorithms for EEG-based brain&#x02013;computer interfaces: a 10 year update</article-title>. <source>J. Neural Eng</source>. <volume>15</volume>, <fpage>031005</fpage>. <pub-id pub-id-type="doi">10.1088/1741-2552/aab2f2</pub-id><pub-id pub-id-type="pmid">29488902</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mohammed</surname> <given-names>R.</given-names></name> <name><surname>Miften</surname> <given-names>F.</given-names></name> <name><surname>George</surname> <given-names>L.</given-names></name></person-group> (<year>2022</year>). <article-title>Driver drowsiness detection methods using EEG signals: a systematic review</article-title>. <source>Comp. Methods Biomech. Biomed. Eng</source>. <volume>26</volume>, <fpage>1</fpage>&#x02013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1080/10255842.2022.2112574</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oh</surname> <given-names>S. L.</given-names></name> <name><surname>Vicnesh</surname> <given-names>J.</given-names></name> <name><surname>Ciaccio</surname> <given-names>E. J.</given-names></name> <name><surname>Yuvaraj</surname> <given-names>R.</given-names></name> <name><surname>Acharya</surname> <given-names>U. R.</given-names></name></person-group> (<year>2019</year>). <article-title>Deep convolutional neural network model for automated diagnosis of schizophrenia using EEG signals</article-title>. <source>Appl. Sci</source>. <volume>9</volume>, <fpage>2870</fpage>. <pub-id pub-id-type="doi">10.3390/app9142870</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Paszke</surname> <given-names>A.</given-names></name> <name><surname>Gross</surname> <given-names>S.</given-names></name> <name><surname>Massa</surname> <given-names>F.</given-names></name> <name><surname>Lerer</surname> <given-names>A.</given-names></name> <name><surname>Bradbury</surname> <given-names>J.</given-names></name> <name><surname>Chanan</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2019</year>). <source>PyTorch: An Imperative Style, High-Performance Deep Learning Library</source>. <publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc</publisher-name>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Perrin</surname> <given-names>F.</given-names></name> <name><surname>Pernier</surname> <given-names>J.</given-names></name> <name><surname>Bertrand</surname> <given-names>O.</given-names></name> <name><surname>Echallier</surname> <given-names>J.</given-names></name></person-group> (<year>1989</year>). <article-title>Spherical splines for scalp potential and current density mapping</article-title>. <source>Electroencephalogr. Clin. Neurophysiol</source>. <volume>72</volume>, <fpage>184</fpage>&#x02013;<lpage>187</lpage>. <pub-id pub-id-type="doi">10.1016/0013-4694(89)90180-6</pub-id><pub-id pub-id-type="pmid">2464490</pub-id></citation></ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rajpurkar</surname> <given-names>P.</given-names></name> <name><surname>Chen</surname> <given-names>E.</given-names></name> <name><surname>Banerjee</surname> <given-names>O.</given-names></name> <name><surname>Topol</surname> <given-names>E. J.</given-names></name></person-group> (<year>2022</year>). <article-title>AI in health and medicine</article-title>. <source>Nat. Med</source>. <volume>28</volume>, <fpage>31</fpage>&#x02013;<lpage>38</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-021-01614-0</pub-id></citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roy</surname> <given-names>Y.</given-names></name> <name><surname>Banville</surname> <given-names>H.</given-names></name> <name><surname>Albuquerque</surname> <given-names>I.</given-names></name> <name><surname>Gramfort</surname> <given-names>A.</given-names></name> <name><surname>Falk</surname> <given-names>T. H.</given-names></name> <name><surname>Faubert</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>Deep learning-based electroencephalography analysis: a systematic review</article-title>. <source>J. Neural Eng</source>. <volume>16</volume>, <fpage>051001</fpage>. <pub-id pub-id-type="doi">10.1088/1741-2552/ab260c</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ruiz</surname> <given-names>A.</given-names></name> <name><surname>Flynn</surname> <given-names>M.</given-names></name> <name><surname>Large</surname> <given-names>J.</given-names></name> <name><surname>Middlehurst</surname> <given-names>M.</given-names></name> <name><surname>Bagnall</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>The great multivariate time series classification bake off: a review and experimental evaluation of recent algorithmic advances</article-title>. <source>Data Min. Knowl. Discov</source>. <volume>35</volume>, <fpage>1</fpage>&#x02013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.1007/s10618-020-00727-3</pub-id><pub-id pub-id-type="pmid">33679210</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stam</surname> <given-names>C.</given-names></name> <name><surname>Jones</surname> <given-names>B.</given-names></name> <name><surname>Nolte</surname> <given-names>G.</given-names></name> <name><surname>Breakspear</surname> <given-names>M.</given-names></name> <name><surname>Scheltens</surname> <given-names>P.</given-names></name></person-group> (<year>2006</year>). <article-title>Small-world networks and functional connectivity in Alzheimer&#x00027;s disease</article-title>. <source>Cereb. Cortex</source> <volume>17</volume>, <fpage>92</fpage>&#x02013;<lpage>99</lpage>. <pub-id pub-id-type="doi">10.1093/cercor/bhj127</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stancin</surname> <given-names>I.</given-names></name> <name><surname>Cifrek</surname> <given-names>M.</given-names></name> <name><surname>Jovic</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>A review of EEG signal features and their application in driver drowsiness detection systems</article-title>. <source>Sensors</source> <volume>21</volume>, <fpage>3786</fpage>. <pub-id pub-id-type="doi">10.3390/s21113786</pub-id><pub-id pub-id-type="pmid">34070732</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stephan</surname> <given-names>B. C. M.</given-names></name> <name><surname>Pakpahan</surname> <given-names>E.</given-names></name> <name><surname>Siervo</surname> <given-names>M.</given-names></name> <name><surname>Licher</surname> <given-names>S.</given-names></name> <name><surname>Muniz-Terrera</surname> <given-names>G.</given-names></name> <name><surname>Mohan</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Prediction of dementia risk in low-income and middle-income countries (the 10/66 study): an independent external validation of existing models</article-title>. <source>Lancet Global Health</source> <volume>8</volume>, <fpage>e524</fpage>&#x02013;<lpage>e535</lpage>. <pub-id pub-id-type="doi">10.1016/S2214-109X(20)30062-0</pub-id><pub-id pub-id-type="pmid">32199121</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>X.</given-names></name> <name><surname>Faisal</surname> <given-names>A. A.</given-names></name> <name><surname>Grosse-Wentrup</surname> <given-names>M.</given-names></name> <name><surname>Gramfort</surname> <given-names>A.</given-names></name> <name><surname>Chevallier</surname> <given-names>S.</given-names></name> <name><surname>Jayaram</surname> <given-names>V.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;2021 BEETL competition: advancing transfer learning for subject independence &#x00026; heterogenous EEG data sets,&#x0201D;</article-title> in <source>Proceedings of the NeurIPS 2021 Competitions and Demonstrations Track, volume 176 of Proceedings of Machine Learning Research</source>, eds D. Kiela, M. Ciccone, and B. Caputo (PMLR), <fpage>205</fpage>&#x02013;<lpage>219</lpage>.</citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yasin</surname> <given-names>S.</given-names></name> <name><surname>Hussain</surname> <given-names>S. A.</given-names></name> <name><surname>Aslan</surname> <given-names>S.</given-names></name> <name><surname>Raza</surname> <given-names>I.</given-names></name> <name><surname>Muzammel</surname> <given-names>M.</given-names></name> <name><surname>Othmani</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>EEG based major depressive disorder and bipolar disorder detection using neural networks: a review</article-title>. <source>Comput. Methods Programs Biomed</source>. <volume>202</volume>, <fpage>106007</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2021.106007</pub-id></citation>
</ref>
</ref-list>
</back>
</article>