<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Trop. Dis</journal-id>
<journal-title>Frontiers in Tropical Diseases</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Trop. Dis</abbrev-journal-title>
<issn pub-type="epub">2673-7515</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fitd.2021.769968</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Tropical Diseases</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A Comparative Study of Machine Learning Techniques for Multi-Class Classification of Arboviral Diseases</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Tabosa de Oliveira</surname>
<given-names>Thom&#xe1;s</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1584853"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>da Silva Neto</surname>
<given-names>Sebasti&#xe3;o Rog&#xe9;rio</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1321384"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Teixeira</surname>
<given-names>Igor Vitor</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1614438"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Aguiar de Oliveira</surname>
<given-names>Samuel Benjamin</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1467407"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>de Almeida Rodrigues</surname>
<given-names>Maria Gabriela</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1554270"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sampaio</surname>
<given-names>Vanderson Souza</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1146163"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Endo</surname>
<given-names>Patricia Takako</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1322427"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Programa de P&#xf3;s-Gradua&#xe7;&#xe3;o em Engenharia da Computa&#xe7;&#xe3;o, Universidade de&#xa0;Pernambuco</institution>, <addr-line>Recife</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Programa de&#xa0;P&#xf3;s-Gradua&#xe7;&#xe3;o em Medicina Tropical, Universidade do Estado do Amazonas</institution>, <addr-line>Manaus</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Funda&#xe7;&#xe3;o de Medicina Tropical Dr. Heitor Vieira Dourado</institution>, <addr-line>Manaus</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Funda&#xe7;&#xe3;o de Vigil&#xe2;ncia em Sa&#xfa;de Dra. Rosemary Costa Pinto</institution>, <addr-line>Manaus</addr-line>, <country>Brazil</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Manoel Barral-Netto, Gon&#xe7;alo Moniz Institute (IGM), Brazil</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Rajnikant Dixit, National Institute of Malaria Research (ICMR), India; Ricardo Khouri, Oswaldo Cruz Foundation (Fiocruz), Brazil</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Patricia Takako Endo, <email xlink:href="mailto:patricia.endo@upe.br">patricia.endo@upe.br</email>
</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to Major Tropical Diseases, a section of the journal Frontiers in Tropical Diseases</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>18</day>
<month>02</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>2</volume>
<elocation-id>769968</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>09</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>12</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Tabosa de Oliveira, da Silva Neto, Teixeira, Aguiar de Oliveira, de Almeida Rodrigues, Sampaio and Endo</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Tabosa de Oliveira, da Silva Neto, Teixeira, Aguiar de Oliveira, de Almeida Rodrigues, Sampaio and Endo</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Among the neglected tropical diseases (NTDs), arboviral diseases present a significant number of cases worldwide. Their correct classification is a complex process due to the similarity of symptoms, and the lack of tests in the Brazilian countryside is a big challenge to be overcome. Given this context, this paper proposes a comparative study of machine learning techniques for multi-class classification of arboviral diseases, which considers three classes: DENGUE, CHIKUNGUNYA and OTHERS, and uses clinical and socio-demographic data from patients. Feature selection techniques were also used for selecting the best subset of attributes for each model. Gradient boosting machines presented the best result in the metrics and a good subset of attributes for daily usage by the physicians that resulted in a 76.58% recall on the CHIKUNGUNYA class.</p>
</abstract>
<kwd-group>
<kwd>arboviral diseases</kwd>
<kwd>neglected tropical disease (NTD)</kwd>
<kwd>machine learning</kwd>
<kwd>multi-class classification</kwd>
<kwd>dengue (DENV)</kwd>
<kwd>Chikungunya (CHIKV)</kwd>
</kwd-group>
<contract-num rid="cn001">062.00249/2020</contract-num>
<contract-sponsor id="cn001">Funda&#xe7;&#xe3;o de Amparo &#xe0; Pesquisa do Estado do Amazonas<named-content content-type="fundref-id">10.13039/501100004916</named-content>
</contract-sponsor>
<counts>
<fig-count count="2"/>
<table-count count="11"/>
<equation-count count="0"/>
<ref-count count="31"/>
<page-count count="10"/>
<word-count count="6510"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>In 2015, the 2030 Agenda<sup>
<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>
</sup> was conceived by representatives of the member states of the United Nations (UN), and its main purpose is focused on eradicating poverty in all forms and dimensions <italic>via</italic> the implementation of sustainable development around the world. To achieve this major objective, 17 sustainable development goals (SDGs) were developed. Among them, Goal 3 (health and well-being) seeks to promote well-being for all, at all ages. Target 3.3 aims to end epidemics of AIDS, tuberculosis, malaria, and neglected tropical diseases (NTD), as well as combating hepatitis, waterborne diseases and other communicable diseases by the year 2030.</p>
<p>Arboviral diseases are NTDs caused by viruses and are transmitted by mosquitoes as their vector. Currently, there are about 545 known species of arboviruses, of which about 150 cause diseases in humans (<xref ref-type="bibr" rid="B1">1</xref>). In addition to Dengue virus (DENV), in the last 10 years, the emergence of other arboviruses, such as Chikungunya virus (CHIKV), Zika virus (ZIKV) and West Nile virus (WNV), has been observed. According to Lima-Camara (2016), disorganised urban growth and the modification of the environment by human actions are some of the reasons that influenced the increase in this type of disease (<xref ref-type="bibr" rid="B2">2</xref>).</p>
<p>According to reports released by the Pan American Health Organization (PAHO)<sup>
<xref ref-type="fn" rid="fn2">
<sup>2</sup>
</xref>
</sup>,<sup>
<xref ref-type="fn" rid="fn3">
<sup>3</sup>
</xref>
</sup> in 2020, together Dengue and Chikungunya accounted for a total of 2,402,128 cases in the Americas. However, most of these cases were classified as suspected cases due to the difficulty involved in their confirmation. For example, only 43.81% of reported Dengue cases (1,007,939 cases) were actually confirmed, and for Chikungunya, as few as 39% (39,619 cases) were confirmed. The low proportion of confirmed cases is due to the high complexity in the classification of these diseases in terms of their signs and symptoms. According to the Health Library of Primary Health Care (from Portuguese <italic>Biblioteca Virtual em Sa&#xfa;de da Aten&#xe7;&#xe3;o Prim&#xe1;ria &#xe0; Sa&#xfa;de</italic>) (BVS APS)<sup>
<xref ref-type="fn" rid="fn4">
<sup>4</sup>
</xref>
</sup>, the diagnosis of most cases is limited to the patients&#x2019; signs and symptoms and the local epidemiological status. In addition, rapid tests available at primary healthcare centers have low accuracy. Although (<xref ref-type="bibr" rid="B3">3</xref>) states that &#x201c;<italic>cross-reactions with DENV or ZIKV infections are unlikely, because CHIKV is an alphavirus, while DENV and ZIKV are antigenically unrelated flaviviruses</italic>&#x201d;, cross-reactivity can still be a concern. In fact, cross-reactivity is one of the issues that pose barriers to the correct diagnosis of all arboviral diseases at low-level health units. However, the lack of tests is also a major issue in the Amazon countryside. Moreover, accurate testing requires specific equipment and time, which also entails operational costs.</p>
<p>As a tropical country, Brazil has a huge diversity of both flora and fauna, and this includes mosquitos, which play an important role as vectors of illnesses such as arboviral diseases (<xref ref-type="bibr" rid="B4">4</xref>). According to PAHO, Brazil had the highest number of Dengue cases in the Americas in 2020, with 1,040,481 cases (65% of the total). Clinical classification of an arboviral disease is a particularly complex task in Brazil because of concomitant circulation of other arboviruses, such as Mayaro virus (MAYV), Venezuelan equine encephalitis virus (VEEV), Eastern equine encephalitis virus (EEEV), and Rocio virus (ROCV), which present a similar clinical profile (<xref ref-type="bibr" rid="B2">2</xref>). Besides the difficulty in clinical classification, cross-reaction is an issue for the current rapid tests that are available and this reduces their accuracy (<xref ref-type="bibr" rid="B2">2</xref>). Although high lethality has not been evidenced so far, the occurrence of coinfection with several arboviruses or concomitant circulation is cause for concern.</p>
<p>The Brazilian Unified Health System (from Portuguese, <italic>Sistema &#xda;nico de Sa&#xfa;de</italic>, SUS) has suffered over the years from a reduction in funding and this imposes an additional barrier to expanding quality diagnostic testing and presents a major public health challenge, highlighting the need for a low-cost diagnostic approach. The use of Machine Learning (ML) techniques becomes an interesting alternative, as they are able to recognise and develop a classification without the need for immediate laboratory tests. This would avoid the costs of collecting samples and running these tests. As stated by Bulbul and Unsal, &#x201c;<italic>compared to classical methods, the process of obtaining information is much more accurate and faster with data mining and ML</italic>&#x201d; (<xref ref-type="bibr" rid="B5">5</xref>). ML models estimate results by learning from previously entered information. In addition, these models do not require high computational power and can be executed on tablets or cell phones.</p>
<p>Most studies that deal with this problem have proposed models for diagnosing Dengue (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>); Chikungunya (<xref ref-type="bibr" rid="B8">8</xref>); or Zika (<xref ref-type="bibr" rid="B9">9</xref>) individually; and, to the best of our knowledge, only one study has provided a model for distinguishing between two arboviral diseases (Dengue and Chikungunya) (<xref ref-type="bibr" rid="B10">10</xref>); however, that study also used laboratory data to perform the classification. Although laboratory data improve the results, we do not employ these types of data, as they, in addition to needing adequate equipment, would prevent the ML model from being used for a quick diagnosis at the time of the patient&#x2019;s arrival at the health unit. Furthermore, most of the existing works did not present a clear methodology that describes the pre-processing of data, hyperparameter optimization techniques, or feature selection. In our work, the entire data pre-processing and balancing are systematically presented, as well as a comparison of feature selection techniques with grid search. We present not only the best attributes for each model, but also the best configuration for each scenario. We also provide a discussion regarding the model that was trained with the best features selected by the sequential feature algorithm (SFA) techniques and a model designed with features selected by health specialists.</p>
<p>The present work proposes different ML models and compares them for multi-class classification of Dengue, Chikungunya and other diseases, using the clinical and socio-demographic data of the patients. The objective is to assist the physician in a rapid diagnosis at the time of arrival of the patient at the health unit by providing an auxiliary tool for decision making.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<title>2 Materials and Methods</title>
<sec id="s2_1">
<title>2.1 Feature Selection</title>
<p>Feature selection is a technique that is used to reduce the dimensionality of the data set, which leads to better learning performance and/or lower computational cost. This technique selects the most relevant attributes in the data set by removing noisy, irrelevant and redundant features (<xref ref-type="bibr" rid="B11">11</xref>). Different feature selection techniques can be found in the literature, and can be categorised according to the search strategy. There are three main approaches: filter, wrapper, and embedded (<xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>In this work, the wrapper approach is used, since it makes use of a learning algorithm to determine the best subset of attributes, called features, where an evaluation is usually made in terms of predictive accuracy. Due to the use of and dependence on a learning model, this type of approach can become computationally expensive, though the selection of irrelevant features is less likely (<xref ref-type="bibr" rid="B12">12</xref>). Among the wrapper techniques, we used the SFA. This technique has four different types, and each type differs in the way it selects or removes features from the data set: sequential forward selection (SFS), sequential backward selection (SBS), sequential forward floating selection (SFFS) and sequential backward floating selection (SBFS).</p>
</sec>
<sec id="s2_2">
<title>2.2 Grid Search</title>
<p>Grid search is an exhaustive search technique for setting hyperparameters of a given model. With it, it is possible to analyse the results of an ML model, and then decide which configuration best fits the target problem. According to Bergstra and Bengio (<xref ref-type="bibr" rid="B13">13</xref>), despite having limitations, this technique is widely used along with the manual search technique.</p>
</sec>
<sec id="s2_3">
<title>2.3 Machine Learning Techniques</title>
<p>ML is a branch of artificial intelligence that is composed of several techniques that have been widely used for pattern learning (<xref ref-type="bibr" rid="B8">8</xref>, <xref ref-type="bibr" rid="B14">14</xref>&#x2013;<xref ref-type="bibr" rid="B18">18</xref>). The ML models used in this work are Random Forest (RF), Adaptive Boosting (Adaboost), Gradient Boosting Machines (GBM), eXtreme Gradient Boosting (Xgboost), k-Nearest Neighbours (KNN), Naive Bayes (NB) and Multilayer Perceptron (MLP).</p>
</sec>
<sec id="s2_4">
<title>2.4 Evaluation Metrics</title>
<p>The following metrics are used: accuracy, precision, sensitivity and F1-Score. For all metrics except accuracy, both the value of the metric in each class and its macro average are also analysed.</p>
</sec>
<sec id="s2_5">
<title>2.5 Data Set</title>
<p>In this work, data regarding Dengue and Chikungunya notifications from the state of Amazonas and the city of Recife, Pernambuco from 2015 to 2020 are used. Regarding the state of Amazonas, data were retrieved from the Health Problem and Notification Information System, from Portuguese <italic>Sistema de Informa&#xe7;&#xe3;o de Agravo de Notifica&#xe7;&#xe3;o</italic> (SINAN)<sup>
<xref ref-type="fn" rid="fn5">
<sup>5</sup>
</xref>
</sup>. SINAN is the official system for disease reporting in Brazil. Diseases from the national list of compulsory notification must be reported, and this list includes Dengue and Chikungunya. This data set contains 57,445 entries and 146 variables and hereafter is referred to as &#x201c;SINAN-db&#x201d;.</p>
<p>The data set for Recife was retrieved from an open data set named Portal de Dados Abertos do Recife (<xref ref-type="bibr" rid="B19">19</xref>), maintained by the Recife Health Department, whose primary source is also the SINAN, and therefore it follows the same dictionary pattern, and allows integration without further issues. This data set contains 83,073 registers and 124 variables and is referred to as &#x201c;Recife-db&#x201d; in this work.</p>
<p>
<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref> illustrates the steps taken during the pre-processing of the data set. First, both data sets were integrated. Variables available in only one of the data sets were disregarded. The resulting data set from the integration of SINAN-db and Recife-db has 140,518 registers and 120 variables.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Data set pre-processing steps.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fitd-02-769968-g001.tif"/>
</fig>
<p>The output classes were grouped into three distinct classes:</p>
<list list-type="bullet">
<list-item>
<p>DENGUE: Patients with confirmed Dengue;</p>
</list-item>
<list-item>
<p>CHIKUNGUNYA: Patients with confirmed Chikungunya; and</p>
</list-item>
<list-item>
<p>OTHERS: Patients classified as &#x201c;inconclusive&#x201d; or &#x201c;negative&#x201d; for both Dengue and Chikungunya.</p>
</list-item>
</list>
<p>Only records confirmed or denied by clinical diagnoses were selected. Registers that did not report any signs or symptoms were discarded since they are the most important information for classification models. Moreover, variables with more than 50% of data missing were also removed. Besides the original variables, a new one (DIAS) was created so that the time (in days) from onset of these symptoms to the date of notification could be added to the models. For the selection of attributes, specialists were consulted. After coding variables as numbers, duplicates were removed, and missing values were replaced by &#x201c;not informed&#x201d; for each variable. Registers with missing values for all variables were also removed. Finally, the clean data set consisted of 17,948 registers in the DENGUE class, 5,724 in the CHIKUNGUNYA class and 16,704 in the OTHERS class, totalling 40,376 registers with 27 variables. In data science, a higher number of registers of a specific class compared to another in the same data set is known as imbalance, and it can bias the ML model towards the class that has the largest number of registers (<xref ref-type="bibr" rid="B20">20</xref>).</p>
<p>In order to balance the data set, the random undersampling technique was performed. In this technique, the class with the fewest registers defines the number of registers kept for the other classes, so that all classes have the same number of registers. After balancing, the data set still had 27 attributes and 17,172 records, with 5,724 for each of the three classes. The 27 variables resulting from the pre-processing are described in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. The data set can be accessed in Mendeley Data (<xref ref-type="bibr" rid="B21">21</xref>).</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Database attributes after pre-processing.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Attribute </th>
<th valign="top" align="center">Description </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">NU_IDADE_N</td>
<td valign="top" align="left">Patient age</td>
</tr>
<tr>
<td valign="top" align="left">CS_SEXO</td>
<td valign="top" align="left">Patient sex</td>
</tr>
<tr>
<td valign="top" align="left">CS_GESTANT</td>
<td valign="top" align="left">Gestational Age of the Patient (Trimester), in case CS_SEXO=F</td>
</tr>
<tr>
<td valign="top" align="left">CS_RACA</td>
<td valign="top" align="left">Patient Race</td>
</tr>
<tr>
<td valign="top" align="left">CS_ZONA</td>
<td valign="top" align="left">Residence area</td>
</tr>
<tr>
<td valign="top" align="left">FEBRE</td>
<td valign="top" align="left">Symptom - Fever</td>
</tr>
<tr>
<td valign="top" align="left">MIALGIA</td>
<td valign="top" align="left">Symptom - Myalgia</td>
</tr>
<tr>
<td valign="top" align="left">CEFALEIA</td>
<td valign="top" align="left">Symptom - Headache</td>
</tr>
<tr>
<td valign="top" align="left">EXANTEMA</td>
<td valign="top" align="left">Symptom - Rash</td>
</tr>
<tr>
<td valign="top" align="left">VOMITO</td>
<td valign="top" align="left">Symptom - Vomiting</td>
</tr>
<tr>
<td valign="top" align="left">NAUSEA</td>
<td valign="top" align="left">Symptom - Nausea</td>
</tr>
<tr>
<td valign="top" align="left">DOR_COSTAS</td>
<td valign="top" align="left">Symptom - Back Pain</td>
</tr>
<tr>
<td valign="top" align="left">CONJUNTVIT</td>
<td valign="top" align="left">Symptom - Conjunctivitis</td>
</tr>
<tr>
<td valign="top" align="left">ARTRITE</td>
<td valign="top" align="left">Symptom - Arthritis</td>
</tr>
<tr>
<td valign="top" align="left">ARTRALGIA</td>
<td valign="top" align="left">Symptom - Arthralgia</td>
</tr>
<tr>
<td valign="top" align="left">PETEQUIA_N</td>
<td valign="top" align="left">Symptom - Petechiae</td>
</tr>
<tr>
<td valign="top" align="left">LACO</td>
<td valign="top" align="left">Symptom - Tourniquet test</td>
</tr>
<tr>
<td valign="top" align="left">DOR_RETRO</td>
<td valign="top" align="left">Symptom - Eye pain</td>
</tr>
<tr>
<td valign="top" align="left">DIABETES</td>
<td valign="top" align="left">Pre-existing disease - Diabetes</td>
</tr>
<tr>
<td valign="top" align="left">HEMATOLOG</td>
<td valign="top" align="left">Pre-existing disease - Haematological diseases</td>
</tr>
<tr>
<td valign="top" align="left">HEPATOPAT</td>
<td valign="top" align="left">Pre-existing disease - Liver diseases</td>
</tr>
<tr>
<td valign="top" align="left">RENAL</td>
<td valign="top" align="left">Pre-existing disease - Kidney disease</td>
</tr>
<tr>
<td valign="top" align="left">HIPERTENSA</td>
<td valign="top" align="left">Pre-existing disease - Hypertension</td>
</tr>
<tr>
<td valign="top" align="left">ACIDO_PEPT</td>
<td valign="top" align="left">Pre-existing disease - Peptic acid disease</td>
</tr>
<tr>
<td valign="top" align="left">AUTO_IMUNE</td>
<td valign="top" align="left">Pre-existing disease - autoimmune disease</td>
</tr>
<tr>
<td valign="top" align="left">DIAS</td>
<td valign="top" align="left">Days that the patient is feeling the symptoms</td>
</tr>
<tr>
<td valign="top" align="left">CLASSI_FIN</td>
<td valign="top" align="left">Final patient classification</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_6">
<title>2.6 Experiments</title>
<p>The experiment is divided into three main steps: <italic>(a)</italic> optimisation of hyperparameters and attribute selection, using Grid Search and SFA; <italic>(b)</italic> evaluation of models performance; and <italic>(c)</italic> specialist evaluation.</p>
<sec id="s2_6_1">
<title>2.6.1 Optimisation of Hyperparameters and Attribute Selection</title>
<p>The grid search technique was performed for each model individually and, on each model, not only were the combinations of the hyperparameters tested, but we also defined which SFA technique offers the best subset of attributes.</p>
<p>
<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref> illustrates how the grid search process was executed considering the model&#x2019;s hyperparameters together with the SFA techniques. We used the Python library sklearn GridSearchCV<sup>
<xref ref-type="fn" rid="fn6">
<sup>6</sup>
</xref>
</sup>, using the training set (70% of the data set). The cross-validation technique (<xref ref-type="bibr" rid="B22">22</xref>) with k=10 was used. At the end of the grid search of each model, the result was the best combination of model hyperparameters and the best subset of data set attributes for the same configuration.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Grid Search flowchart with SFA.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fitd-02-769968-g002.tif"/>
</fig>
<p>
<xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref> shows the hyperparameters of each model that were tested in the grid search and their respective value ranges. All&#xa0;models, except Xgboost, were executed using the Python library sklearn.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Parameters used in Grid Search.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">Values</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="left">learning_rate</td>
<td valign="top" align="left">[0.36, 1, 1.5]</td>
</tr>
<tr>
<td valign="top" align="left">n_estimators</td>
<td valign="top" align="left">[25, 50, 100]</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="left">criterion</td>
<td valign="top" align="left">[gini, entropy]</td>
</tr>
<tr>
<td valign="top" align="left">n_estimators</td>
<td valign="top" align="left">[50, 100, 200]</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="left">max_depth</td>
<td valign="top" align="left">[1, 3, 5]</td>
</tr>
<tr>
<td valign="top" align="left">n_estimators</td>
<td valign="top" align="left">[50, 100, 200]</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="left">eta</td>
<td valign="top" align="left">[0.3, 0.5]</td>
</tr>
<tr>
<td valign="top" align="left">max_depth</td>
<td valign="top" align="left">[2, 6]</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="left">metric</td>
<td valign="top" align="left">[euclidean, manhattan]</td>
</tr>
<tr>
<td valign="top" align="left">n_neighbors</td>
<td valign="top" align="left">[2, 5, 10]</td>
</tr>
<tr>
<td valign="top" align="left">weights</td>
<td valign="top" align="left">[uniform, distance]</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="left">hidden_layer_sizes</td>
<td valign="top" align="left">[(100), (100,100), (100,100,100)]</td>
</tr>
<tr>
<td valign="top" align="left">learning_rate_init</td>
<td valign="top" align="left">[0.001, 0.01, 0.1]</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The Adaboost was executed with the AdaBoostClassifier<sup>
<xref ref-type="fn" rid="fn7">
<sup>7</sup>
</xref>
</sup> and two hyperparameters were tested: <italic>learning_rate</italic> and <italic>n_estimators</italic>. <italic>n_estimators</italic> is the maximum number of stumps that the model will produce in the training, and <italic>learning_rate</italic> is a weight applied to each stump at each iteration. A higher <italic>learning_rate</italic> increases the contribution of each classifier. The higher the <italic>learning_rate</italic>, the greater the contribution of stumps during training. Low values decrease correct classification, while high values are associated with model instability (<xref ref-type="bibr" rid="B23">23</xref>).</p>
<p>The RF was executed with the RandomForestClassifier<sup>
<xref ref-type="fn" rid="fn8">
<sup>8</sup>
</xref>
</sup> and two hyperparameters were tested: <italic>criterion</italic> and <italic>n_estimators</italic>. <italic>n_estimators</italic>, as in Adaboost, is the maximum number of Decision Trees (DTs) that the model produces and <italic>criterion</italic> is the function that determines which are the best splits in each node.</p>
<p>The GBM was executed with the GradientBoostingClassifier<sup>
<xref ref-type="fn" rid="fn9">
<sup>9</sup>
</xref>
</sup>, and two hyperparameters were tested, <italic>max_depth</italic> and <italic>n_estimators</italic>. <italic>max_depth</italic> is the level of depth that each DT within the model has. The higher the level, consequently, the more nodes the DT has. <italic>n_estimators</italic>, as in Adaboost and RF, is the maximum number of DTs that the model produces.</p>
<p>The Xgboost was executed with the Python library XGBoost<sup>
<xref ref-type="fn" rid="fn10">
<sup>10</sup>
</xref>
</sup> and two hyperparameters were tested, <italic>max_depth</italic> and <italic>eta</italic>. <italic>max_depth</italic>, as in GBM, is the level of depth that each DT within the model has; and <italic>eta</italic>, also known as learning rate, is the shrinkage in update to prevent overfitting.</p>
<p>The KNN was executed with the KNeighborsClassifier<sup>
<xref ref-type="fn" rid="fn11">
<sup>11</sup>
</xref>
</sup> and three hyperparameters were tested, namely, <italic>metric</italic>, <italic>n_neighbors</italic> and <italic>weights</italic>. <italic>n_neighbors</italic> is the number of neighbours that is used in the training. <italic>weights</italic> contains the function that determines the weights each neighbour has in the training, and <italic>metric</italic> is the function used to calculate the distance to each neighbour.</p>
<p>The MLP was executed with the MLPClassifier<sup>
<xref ref-type="fn" rid="fn12">
<sup>12</sup>
</xref>
</sup> and two hyperparameters were tested, in this case, <italic>hidden_layer_sizes</italic> and <italic>learning_rate_init</italic>. <italic>hidden_layer_sizes</italic> defines the number of hidden layers and the number of neurons that each layer has. <italic>learning_rate_init</italic> is the initial learning rate, which controls the step size used when the weights of each layer are updated during training.</p>
<p>Lastly, the NB<sup>
<xref ref-type="fn" rid="fn13">
<sup>13</sup>
</xref>
</sup> was executed with the GaussianNB. As NB does not have hyperparameters, the Grid Search of this model was executed only with SFA techniques.</p>
</sec>
<sec id="s2_6_2">
<title>2.6.2 Evaluation of Models</title>
<p>After the execution of the grid search, the models were evaluated using the remaining 30% of the data set that was not part of the training, which was called the test set. The models were evaluated using the metrics described in subsection 2.4. The tests were executed 30 times and the metrics were averaged in order to be compared. The model chosen was the one that best fitted the needs of the experiment. After that, the model was submitted to specialists so that the application in the health care routine could be assessed.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<title>3 Results</title>
<p>The scenario of low-income countries and limited-resource settings requires physicians to make a diagnosis often using only clinical parameters and without laboratory data support. ML techniques can aid in the classification of arboviral diseases using only these clinical parameters. Therefore, this work evaluated seven ML techniques using only clinical and socio-demographic features.</p>
<p>Overall and per-disease baseline characteristics are presented in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. Baseline characteristics show an overall mean age of 32.6 (SD 20.1) years and a predominance of men and of urban residents in each class. Fever (85.3%), headache (60.6%), myalgia (58.4%), and arthralgia (51.1%) were the most frequent symptoms.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Clinical and socio-demographic findings of patients at baseline.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" rowspan="2" align="left">Variables</th>
<th valign="top" align="center">Total</th>
<th valign="top" align="center">Dengue</th>
<th valign="top" align="center">Chikungunya</th>
<th valign="top" align="center">Others</th>
</tr>
<tr>
<th valign="top" align="center">N=17172</th>
<th valign="top" align="center">N=5724</th>
<th valign="top" align="center">N=5724</th>
<th valign="top" align="center">N=5724</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Gender Women, %</td>
<td valign="top" align="center">7267/17172 (42.3)</td>
<td valign="top" align="center">2540/5724 (44.4)</td>
<td valign="top" align="center">2200/5724 (38.4)</td>
<td valign="top" align="center">2527/5724 (44.1)</td>
</tr>
<tr>
<td valign="top" align="left">Age, Mean (SD)</td>
<td valign="top" align="center">32.6 (20.1)</td>
<td valign="top" align="center">31.0 (19.8)</td>
<td valign="top" align="center">36.6 (20.0)</td>
<td valign="top" align="center">30.1 (19.9)</td>
</tr>
<tr>
<td valign="top" align="left">Race, %</td>
<td valign="top" colspan="4" align="center"/>
</tr>
<tr>
<td valign="top" align="left">&#x2003;White</td>
<td valign="top" align="center">690/17172 (4.0)</td>
<td valign="top" align="center">223/5724 (3.9)</td>
<td valign="top" align="center">203/5724 (3.5)</td>
<td valign="top" align="center">264/5724 (4.6)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Black</td>
<td valign="top" align="center">156/17172 (0.9)</td>
<td valign="top" align="center">53/5724 (0.9)</td>
<td valign="top" align="center">56/5724 (1.0)</td>
<td valign="top" align="center">47/5724 (0.8)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Yellow</td>
<td valign="top" align="center">34/17172 (0.2)</td>
<td valign="top" align="center">10/5724 (0.2)</td>
<td valign="top" align="center">11/5724 (0.2)</td>
<td valign="top" align="center">13/5724 (0.2)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Admixed</td>
<td valign="top" align="center">5292/17172 (30.8)</td>
<td valign="top" align="center">1806/5724 (31.6)</td>
<td valign="top" align="center">954/5724 (16.7)</td>
<td valign="top" align="center">2532/5724 (44.2)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Indigenous</td>
<td valign="top" align="center">176/17172 (1.0)</td>
<td valign="top" align="center">104/5724 (1.8)</td>
<td valign="top" align="center">22/5724 (0.4)</td>
<td valign="top" align="center">50/5724 (0.9)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Missing</td>
<td valign="top" align="center">10824/17172 (63.0)</td>
<td valign="top" align="center">3528/5724 (61.6)</td>
<td valign="top" align="center">4478/5724 (78.2)</td>
<td valign="top" align="center">2818/5724 (49.2)</td>
</tr>
<tr>
<td valign="top" align="left">Pregnant, %</td>
<td valign="top" colspan="4" align="center"/>
</tr>
<tr>
<td valign="top" align="left">&#x2003;1st Trimester</td>
<td valign="top" align="center">53/17172 (0.3)</td>
<td valign="top" align="center">9/5724 (0.2)</td>
<td valign="top" align="center">13/5724 (0.2)</td>
<td valign="top" align="center">31/5724 (0.5)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;2nd Trimester</td>
<td valign="top" align="center">77/17172 (0.4)</td>
<td valign="top" align="center">25/5724 (0.4)</td>
<td valign="top" align="center">22/5724 (0.4)</td>
<td valign="top" align="center">30/5724 (0.5)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;3rd Trimester</td>
<td valign="top" align="center">75/17172 (0.4)</td>
<td valign="top" align="center">17/5724 (0.3)</td>
<td valign="top" align="center">27/5724 (0.5)</td>
<td valign="top" align="center">31/5724 (0.5)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Unknown gestational age</td>
<td valign="top" align="center">19/17172 (0.1)</td>
<td valign="top" align="center">4/5724 (0.1)</td>
<td valign="top" align="center">7/5724 (0.1)</td>
<td valign="top" align="center">8/5724 (0.1)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Missing</td>
<td valign="top" align="center">16948/17172 (98.7)</td>
<td valign="top" align="center">5669/5724 (99.0)</td>
<td valign="top" align="center">5655/5724 (98.8)</td>
<td valign="top" align="center">5624/5724 (98.3)</td>
</tr>
<tr>
<td valign="top" align="left">Residence area, %</td>
<td valign="top" colspan="4" align="center"/>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Urban</td>
<td valign="top" align="center">14658/17172 (85.4)</td>
<td valign="top" align="center">4775/5724 (83.4)</td>
<td valign="top" align="center">5187/5724 (90.6)</td>
<td valign="top" align="center">4696/5724 (82.0)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Rural</td>
<td valign="top" align="center">175/17172 (1.0)</td>
<td valign="top" align="center">27/5724 (0.5)</td>
<td valign="top" align="center">9/5724 (0.2)</td>
<td valign="top" align="center">139/5724 (2.4)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Periurban</td>
<td valign="top" align="center">5/17172 (0.0)</td>
<td valign="top" align="center">2/5724 (0.0)</td>
<td valign="top" align="center">2/5724 (0.0)</td>
<td valign="top" align="center">1/5724 (0.0)</td>
</tr>
<tr>
<td valign="top" align="left">&#x2003;Missing</td>
<td valign="top" align="center">2334/17172 (13.6)</td>
<td valign="top" align="center">920/5724 (16.1)</td>
<td valign="top" align="center">526/5724 (9.2)</td>
<td valign="top" align="center">888/5724 (15.5)</td>
</tr>
<tr>
<td valign="top" align="left">Fever, %</td>
<td valign="top" align="center">14647/17172 (85.3)</td>
<td valign="top" align="center">5190/5724 (90.7)</td>
<td valign="top" align="center">5300/5724 (92.6)</td>
<td valign="top" align="center">4157/5724 (72.6)</td>
</tr>
<tr>
<td valign="top" align="left">Myalgia, %</td>
<td valign="top" align="center">10029/17172 (58.4)</td>
<td valign="top" align="center">3948/5724 (69.0)</td>
<td valign="top" align="center">3364/5724 (58.8)</td>
<td valign="top" align="center">2717/5724 (47.5)</td>
</tr>
<tr>
<td valign="top" align="left">Headache, %</td>
<td valign="top" align="center">10406/17172 (60.6)</td>
<td valign="top" align="center">4020/5724 (70.2)</td>
<td valign="top" align="center">3316/5724 (57.9)</td>
<td valign="top" align="center">3070/5724 (53.6)</td>
</tr>
<tr>
<td valign="top" align="left">Rash, %</td>
<td valign="top" align="center">4395/17172 (25.6)</td>
<td valign="top" align="center">1765/5724 (30.8)</td>
<td valign="top" align="center">1637/5724 (28.6)</td>
<td valign="top" align="center">993/5724 (17.3)</td>
</tr>
<tr>
<td valign="top" align="left">Vomit, %</td>
<td valign="top" align="center">3312/17172 (19.3)</td>
<td valign="top" align="center">1440/5724 (25.2)</td>
<td valign="top" align="center">992/5724 (17.3)</td>
<td valign="top" align="center">880/5724 (15.4)</td>
</tr>
<tr>
<td valign="top" align="left">Nausea, %</td>
<td valign="top" align="center">3517/17172 (20.5)</td>
<td valign="top" align="center">1610/5724 (28.1)</td>
<td valign="top" align="center">1076/5724 (18.8)</td>
<td valign="top" align="center">831/5724 (14.5)</td>
</tr>
<tr>
<td valign="top" align="left">Back pain, %</td>
<td valign="top" align="center">2612/17172 (15.2)</td>
<td valign="top" align="center">1088/5724 (19.0)</td>
<td valign="top" align="center">877/5724 (15.3)</td>
<td valign="top" align="center">647/5724 (11.3)</td>
</tr>
<tr>
<td valign="top" align="left">Conjunctivitis, %</td>
<td valign="top" align="center">678/17172 (3.9)</td>
<td valign="top" align="center">297/5724 (5.2)</td>
<td valign="top" align="center">222/5724 (3.9)</td>
<td valign="top" align="center">159/5724 (2.8)</td>
</tr>
<tr>
<td valign="top" align="left">Arthritis, %</td>
<td valign="top" align="center">1641/17172 (9.6)</td>
<td valign="top" align="center">638/5724 (11.1)</td>
<td valign="top" align="center">715/5724 (12.5)</td>
<td valign="top" align="center">288/5724 (5.0)</td>
</tr>
<tr>
<td valign="top" align="left">Arthralgia, %</td>
<td valign="top" align="center">8770/17172 (51.1)</td>
<td valign="top" align="center">2394/5724 (41.8)</td>
<td valign="top" align="center">4890/5724 (85.4)</td>
<td valign="top" align="center">1486/5724 (26.0)</td>
</tr>
<tr>
<td valign="top" align="left">Petechiae, %</td>
<td valign="top" align="center">802/17172 (4.7)</td>
<td valign="top" align="center">421/5724 (7.4)</td>
<td valign="top" align="center">211/5724 (3.7)</td>
<td valign="top" align="center">170/5724 (3.0)</td>
</tr>
<tr>
<td valign="top" align="left">Tourniquet test, %</td>
<td valign="top" align="center">290/17172 (1.7)</td>
<td valign="top" align="center">207/5724 (3.6)</td>
<td valign="top" align="center">38/5724 (0.7)</td>
<td valign="top" align="center">45/5724 (0.8)</td>
</tr>
<tr>
<td valign="top" align="left">Retroorbital pain, %</td>
<td valign="top" align="center">2555/17172 (14.9)</td>
<td valign="top" align="center">1407/5724 (24.6)</td>
<td valign="top" align="center">622/5724 (10.9)</td>
<td valign="top" align="center">526/5724 (9.2)</td>
</tr>
<tr>
<td valign="top" align="left">Diabetes, %</td>
<td valign="top" align="center">216/17172 (1.3)</td>
<td valign="top" align="center">57/5724 (1.0)</td>
<td valign="top" align="center">103/5724 (1.8)</td>
<td valign="top" align="center">56/5724 (1.0)</td>
</tr>
<tr>
<td valign="top" align="left">Haematological diseases, %</td>
<td valign="top" align="center">58/17172 (0.3)</td>
<td valign="top" align="center">22/5724 (0.4)</td>
<td valign="top" align="center">16/5724 (0.3)</td>
<td valign="top" align="center">20/5724 (0.3)</td>
</tr>
<tr>
<td valign="top" align="left">Liver diseases, %</td>
<td valign="top" align="center">72/17172 (0.4)</td>
<td valign="top" align="center">21/5724 (0.4)</td>
<td valign="top" align="center">25/5724 (0.4)</td>
<td valign="top" align="center">26/5724 (0.5)</td>
</tr>
<tr>
<td valign="top" align="left">Kidney disease, %</td>
<td valign="top" align="center">50/17172 (0.3)</td>
<td valign="top" align="center">10/5724 (0.2)</td>
<td valign="top" align="center">20/5724 (0.3)</td>
<td valign="top" align="center">20/5724 (0.3)</td>
</tr>
<tr>
<td valign="top" align="left">Hypertension, %</td>
<td valign="top" align="center">454/17172 (2.6)</td>
<td valign="top" align="center">128/5724 (2.2)</td>
<td valign="top" align="center">191/5724 (3.3)</td>
<td valign="top" align="center">135/5724 (2.4)</td>
</tr>
<tr>
<td valign="top" align="left">Acid peptic disease, %</td>
<td valign="top" align="center">97/17172 (0.6)</td>
<td valign="top" align="center">27/5724 (0.5)</td>
<td valign="top" align="center">28/5724 (0.5)</td>
<td valign="top" align="center">42/5724 (0.7)</td>
</tr>
<tr>
<td valign="top" align="left">Autoimmune disease, %</td>
<td valign="top" align="center">42/17172 (0.2)</td>
<td valign="top" align="center">10/5724 (0.2)</td>
<td valign="top" align="center">16/5724 (0.3)</td>
<td valign="top" align="center">16/5724 (0.3)</td>
</tr>
<tr>
<td valign="top" align="left">Symptom time in days, Mean (SD)</td>
<td valign="top" align="center">21.0 (217.3)</td>
<td valign="top" align="center">17.0 (32.8)</td>
<td valign="top" align="center">22.6 (58.2)</td>
<td valign="top" align="center">23.3 (370.5)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Our results are presented in three parts: <italic>(a)</italic> the results obtained from each model using grid search; <italic>(b)</italic> evaluation of the models using the configurations found by the grid search; and <italic>(c)</italic> comparison of the best model with a model designed with features selected by health specialists.</p>
<sec id="s3_1">
<title>3.1 Grid Search</title>
<p>
<xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref> presents the results from the Grid Search technique of the seven models: Adaboost, RF, GBM, Xgboost, KNN, MLP and NB.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Results from Grid Search.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Hyperparameters</th>
<th valign="top" align="center">No. of attributes</th>
<th valign="top" align="center">SFA</th>
<th valign="top" align="center">Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="left">learning_rate: 0.36</td>
<td valign="top" rowspan="2" align="center">10</td>
<td valign="top" rowspan="2" align="center">SBS</td>
<td valign="top" rowspan="2" align="center">0.5972</td>
</tr>
<tr>
<td valign="top" align="left">n_estimators: 25</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="left">criterion: gini</td>
<td valign="top" rowspan="2" align="center">16</td>
<td valign="top" rowspan="2" align="center">SFFS</td>
<td valign="top" rowspan="2" align="center">0.6061</td>
</tr>
<tr>
<td valign="top" align="left">n_estimators: 200</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="left">max_depth: 3</td>
<td valign="top" rowspan="2" align="center">18</td>
<td valign="top" rowspan="2" align="center">SFFS</td>
<td valign="top" rowspan="2" align="center">0.6218</td>
</tr>
<tr>
<td valign="top" align="left">n_estimators: 200</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="left">eta: 0.3</td>
<td valign="top" rowspan="2" align="center">20</td>
<td valign="top" rowspan="2" align="center">SFFS</td>
<td valign="top" rowspan="2" align="center">0.6230</td>
</tr>
<tr>
<td valign="top" align="left">max_depth: 2</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="left">metric: euclidean</td>
<td valign="top" rowspan="3" align="center">19</td>
<td valign="top" rowspan="3" align="center">SBS</td>
<td valign="top" rowspan="3" align="center">0.5739</td>
</tr>
<tr>
<td valign="top" align="left">n_neighbors: 2</td>
</tr>
<tr>
<td valign="top" align="left">weights: uniform</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="left">hidden_layer_sizes: (100,)</td>
<td valign="top" rowspan="2" align="center">15</td>
<td valign="top" rowspan="2" align="center">SFFS</td>
<td valign="top" rowspan="2" align="center">0.6153</td>
</tr>
<tr>
<td valign="top" align="left">learning_rate_init: 0.1</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>NB</bold>
</td>
<td valign="top" align="left">&#x2013;</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">SBFS</td>
<td valign="top" align="center">0.585</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Regarding SFA, the techniques that presented the best performance were SFFS and SBS. The size of the subset of attributes ranged from 10 to 20 attributes, and the most common attributes were CS_RACA, CS_ZONA, FEBRE, EXANTEMA, NAUSEA, ARTRALGIA and DOR_RETRO, each of which appeared in at least six of the seven subsets. <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> shows the attributes selected by the SFA techniques for each model.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Attributes selected by the SFA techniques for each model.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model</th>
<th valign="top" align="center">Attributes </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="left">NU_IDADE_N, CS_RACA, CS_ZONA, FEBRE, CEFALEIA</td>
</tr>
<tr>
<td valign="top" align="left">EXANTEMA, NAUSEA, ARTRALGIA, LACO, DOR_RETRO</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="left">CS_RACA, CS_ZONA, FEBRE, MIALGIA, CEFALEIA, EXANTEMA,</td>
</tr>
<tr>
<td valign="top" align="left">NAUSEA, ARTRITE, ARTRALGIA, PETEQUIA_N, DOR_RETRO,</td>
</tr>
<tr>
<td valign="top" align="left">DIABETES, HEMATOLOG, HEPATOPAT, RENAL, AUTO_IMUNE</td>
</tr>
<tr>
<td valign="top" rowspan="4" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="left">CS_RACA, CS_ZONA, FEBRE, MIALGIA, CEFALEIA, EXANTEMA,</td>
</tr>
<tr>
<td valign="top" align="left">NAUSEA, DOR_COSTAS, CONJUNTVIT, ARTRITE, ARTRALGIA,</td>
</tr>
<tr>
<td valign="top" align="left">PETEQUIA_N, DOR_RETRO, DIABETES, HIPERTENSA,</td>
</tr>
<tr>
<td valign="top" align="left">ACIDO_PEPT, AUTO_IMUNE, DIAS</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="left">NU_IDADE_N, CS_RACA, CS_ZONA, FEBRE, MIALGIA, CEFALEIA,</td>
</tr>
<tr>
<td valign="top" align="left">EXANTEMA, VOMITO, NAUSEA, DOR_COSTAS, CONJUNTVIT, ARTRITE, ARTRALGIA, PETEQUIA_N, DOR_RETRO, DIABETES, HEMATOLOG, HIPERTENSA, ACIDO_PEPT, DIAS</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="left">CS_GESTANT, CS_RACA, CS_ZONA, FEBRE, MIALGIA, CEFALEIA,</td>
</tr>
<tr>
<td valign="top" align="left">VOMITO, NAUSEA, DOR_COSTAS, CONJUNTVIT, ARTRITE, ARTRALGIA, PETEQUIA_N, LACO, DOR_RETRO, DIABETES, HEMATOLOG, HIPERTENSA, ACIDO_PEPT</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="left">CS_SEXO, CS_RACA, FEBRE, MIALGIA, CEFALEIA, EXANTEMA, VOMITO, NAUSEA, ARTRALGIA, PETEQUIA_N, LACO, DOR_RETRO, DIABETES, HEMATOLOG, HEPATOPAT</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>NB</bold>
</td>
<td valign="top" align="left">CS_RACA, CS_ZONA, FEBRE, MIALGIA, EXANTEMA, NAUSEA, ARTRALGIA, LACO, DOR_RETRO, ACIDO_PEPT</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The best-performing model was the Xgboost model, using the SFFS technique with 20 attributes (the largest subset size in this experiment), <italic>eta</italic> = 0.3 and <italic>max_depth</italic> = 2, which obtained 62.3% accuracy. On the other hand, the KNN model with 19 attributes, selected by the SBS technique, <italic>metric</italic> = <italic>euclidean</italic>, <italic>n_neighbors</italic> = 2 and <italic>weights</italic> = <italic>uniform</italic>, was the worst model in the grid search, with 57.39% accuracy.</p>
</sec>
<sec id="s3_2">
<title>3.2 Evaluation of Models</title>
<p>
<xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref> presents the results of accuracy and macro medians from recall, precision and F1-score. The GBM model outperformed all the other models. It is interesting to note that the MLP model showed poor performance in comparison with the result it presented in the grid search. This difference may indicate that the MLP model failed to generalize the data during training and underfitting probably occurred and, as a consequence, the MLP model did not perform well when using the test set.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>The result from accuracy and macro median of recall, precision, and F1-score.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model </th>
<th valign="top" align="center">Accuracy </th>
<th valign="top" align="center">Recall </th>
<th valign="top" align="center">Precision </th>
<th valign="top" align="center">F1-score </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="center">0.5879</td>
<td valign="top" align="center">0.5903</td>
<td valign="top" align="center">0.5837</td>
<td valign="top" align="center">0.5782</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="center">0.6011</td>
<td valign="top" align="center">0.6033</td>
<td valign="top" align="center">0.5965</td>
<td valign="top" align="center">0.5949</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="center">
<bold>0.6240</bold>
</td>
<td valign="top" align="center">
<bold>0.6257</bold>
</td>
<td valign="top" align="center">
<bold>0.6205</bold>
</td>
<td valign="top" align="center">
<bold>0.6196</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="center">0.6153</td>
<td valign="top" align="center">0.6173</td>
<td valign="top" align="center">0.6116</td>
<td valign="top" align="center">0.6093</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="center">0.5411</td>
<td valign="top" align="center">0.5410</td>
<td valign="top" align="center">0.5519</td>
<td valign="top" align="center">0.5222</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="center">0.5380</td>
<td valign="top" align="center">0.5424</td>
<td valign="top" align="center">0.5569</td>
<td valign="top" align="center">0.4967</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>NB</bold>
</td>
<td valign="top" align="center">0.5798</td>
<td valign="top" align="center">0.5833</td>
<td valign="top" align="center">0.5782</td>
<td valign="top" align="center">0.5704</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>In bold: These were the results that stood out, i.e., the highest value of each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The results of the CHIKUNGUNYA class are presented in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>. This class obtained the highest results, achieving more than 80% recall in KNN and MLP, although those same models demonstrated the lowest values of precision. The other models had a better balance between these two metrics. For the F1-score metric, the situation is very similar, with the only difference being that the MLP outperformed the NB. In general, the GBM model obtained the best results.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>The result from recall, precision, and F1-score for CHIKUNGUNYA class.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model </th>
<th valign="top" align="center">Recall </th>
<th valign="top" align="center">Precision </th>
<th valign="top" align="center">F1-score </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="center">0.7992</td>
<td valign="top" align="center">0.6045</td>
<td valign="top" align="center">0.6884</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="center">0.7667</td>
<td valign="top" align="center">0.6360</td>
<td valign="top" align="center">0.6943</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="center">0.7787</td>
<td valign="top" align="center">
<bold>0.6561</bold>
</td>
<td valign="top" align="center">
<bold>0.7122</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="center">0.7881</td>
<td valign="top" align="center">0.6382</td>
<td valign="top" align="center">0.7053</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="center">
<bold>0.8396</bold>
</td>
<td valign="top" align="center">0.5365</td>
<td valign="top" align="center">0.6546</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="center">0.8100</td>
<td valign="top" align="center">0.5902</td>
<td valign="top" align="center">0.6745</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>NB</bold>
</td>
<td valign="top" align="center">0.7190</td>
<td valign="top" align="center">0.6272</td>
<td valign="top" align="center">0.6699</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>In bold: These were the results that stood out, i.e., the highest value of each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The results for DENGUE class are presented in <xref ref-type="table" rid="T8">
<bold>Table&#xa0;8</bold>
</xref>. Recall values were below 50% for all models, which were the lowest values. The results of the precision and F1-score were not much better either, i.e., below 60%. Overall, the GBM model obtained the best results again.</p>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>The result from recall, precision, and F1-score for DENGUE class.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model </th>
<th valign="top" align="center">Recall </th>
<th valign="top" align="center">Precision </th>
<th valign="top" align="center">F1-score </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="center">0.4020</td>
<td valign="top" align="center">0.5582</td>
<td valign="top" align="center">0.4674</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="center">0.4355</td>
<td valign="top" align="center">0.5638</td>
<td valign="top" align="center">0.4919</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="center">
<bold>0.4870</bold>
</td>
<td valign="top" align="center">
<bold>0.5949</bold>
</td>
<td valign="top" align="center">
<bold>0.5356</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="center">0.4589</td>
<td valign="top" align="center">0.5842</td>
<td valign="top" align="center">0.5140</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="center">0.4352</td>
<td valign="top" align="center">0.5033</td>
<td valign="top" align="center">0.4668</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="center">0.2202</td>
<td valign="top" align="center">0.4843</td>
<td valign="top" align="center">0.2902</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>NB</bold>
</td>
<td valign="top" align="center">0.3637</td>
<td valign="top" align="center">0.5642</td>
<td valign="top" align="center">0.4423</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>In bold: These were the results that stood out, i.e., the highest value of each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>
<xref ref-type="table" rid="T9">
<bold>Table&#xa0;9</bold>
</xref> presents the results of recall, precision and F1-score for the OTHERS class. Results were similar and were around 60%, with the exception of the KNN model, which had a considerable drop in recall (34.83%), and the MLP and NB models, which also showed poor performance regarding the precision metric (51.49% and 54.32%, respectively). In this class, none of the models stood out, and the NB, Xgboost and GBM models obtained the best values for recall, precision, and F1-score, respectively. The GBM and Xgboost models were considered the best for classifying this class.</p>
<table-wrap id="T9" position="float">
<label>Table&#xa0;9</label>
<caption>
<p>The result from recall, precision, and F1-score for OTHERS class.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Model </th>
<th valign="top" align="center">Recall </th>
<th valign="top" align="center">Precision </th>
<th valign="top" align="center">F1-score </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">
<bold>Adaboost</bold>
</td>
<td valign="top" align="center">0.5695</td>
<td valign="top" align="center">0.5882</td>
<td valign="top" align="center">0.5787</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>RF</bold>
</td>
<td valign="top" align="center">0.6085</td>
<td valign="top" align="center">0.5881</td>
<td valign="top" align="center">0.5982</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>GBM</bold>
</td>
<td valign="top" align="center">0.6115</td>
<td valign="top" align="center">0.6104</td>
<td valign="top" align="center">
<bold>0.6110</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>Xgboost</bold>
</td>
<td valign="top" align="center">0.6049</td>
<td valign="top" align="center">
<bold>0.6123</bold>
</td>
<td valign="top" align="center">0.6086</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>KNN</bold>
</td>
<td valign="top" align="center">0.3483</td>
<td valign="top" align="center">0.6161</td>
<td valign="top" align="center">0.4450</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>MLP</bold>
</td>
<td valign="top" align="center">0.6463</td>
<td valign="top" align="center">0.5149</td>
<td valign="top" align="center">0.5714</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>NB</bold>
</td>
<td valign="top" align="center">
<bold>0.6673</bold>
</td>
<td valign="top" align="center">0.5432</td>
<td valign="top" align="center">0.5989</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>In bold: These were the results that stood out, i.e., the highest value of each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<sec id="s3_2_1">
<title>3.2.1 Specialist Evaluation</title>
<p>The specialists analysed the attributes used in the GBM model, called GBM-SFA, and requested the removal of four attributes: CS_RACA, CS_ZONA, ACIDO_PEPT and AUTO_IMUNE. As a result, the remaining attributes (<xref ref-type="table" rid="T10">
<bold>Table&#xa0;10</bold>
</xref>) were used as input for training a new GBM model, called GBM-Specialist. In order to achieve maximum performance, another grid search was executed, with the same GBM hyperparameters that are presented in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. The best configuration for hyperparameters was <italic>max_depth</italic> = 5, higher than that of the GBM-SFA, and <italic>n_estimators</italic> = 100, half that of the GBM-SFA, with a validation accuracy of 60.15%.</p>
<table-wrap id="T10" position="float">
<label>Table&#xa0;10</label>
<caption>
<p>Attributes selected by the specialist.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Attribute </th>
<th valign="top" align="center">Description </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FEBRE</td>
<td valign="top" align="left">Symptom - Fever</td>
</tr>
<tr>
<td valign="top" align="left">MIALGIA</td>
<td valign="top" align="left">Symptom - Myalgia</td>
</tr>
<tr>
<td valign="top" align="left">CEFALEIA</td>
<td valign="top" align="left">Symptom - Headache</td>
</tr>
<tr>
<td valign="top" align="left">EXANTEMA</td>
<td valign="top" align="left">Symptom - Rash</td>
</tr>
<tr>
<td valign="top" align="left">NAUSEA</td>
<td valign="top" align="left">Symptom - Nausea</td>
</tr>
<tr>
<td valign="top" align="left">DOR_COSTAS</td>
<td valign="top" align="left">Symptom - Back Pain</td>
</tr>
<tr>
<td valign="top" align="left">CONJUNTVIT</td>
<td valign="top" align="left">Symptom - Conjunctivitis</td>
</tr>
<tr>
<td valign="top" align="left">ARTRITE</td>
<td valign="top" align="left">Symptom - Arthritis</td>
</tr>
<tr>
<td valign="top" align="left">ARTRALGIA</td>
<td valign="top" align="left">Symptom - Arthralgia</td>
</tr>
<tr>
<td valign="top" align="left">PETEQUIA_N</td>
<td valign="top" align="left">Symptom - Petechiae</td>
</tr>
<tr>
<td valign="top" align="left">DOR_RETRO</td>
<td valign="top" align="left">Symptom - Eye pain</td>
</tr>
<tr>
<td valign="top" align="left">DIABETES</td>
<td valign="top" align="left">Pre-existing disease - Diabetes</td>
</tr>
<tr>
<td valign="top" align="left">HIPERTENSA</td>
<td valign="top" align="left">Pre-existing disease - Hypertension</td>
</tr>
<tr>
<td valign="top" align="left">DIAS</td>
<td valign="top" align="left">Days that the patient is feeling the symptoms</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>In bold: These were the results that stood out, i.e., the highest value of each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>
<xref ref-type="table" rid="T11">
<bold>Table&#xa0;11</bold>
</xref> shows the results of all metrics for the GBM-SFA and GBM-Specialist models. The GBM-SFA presented the best performance for all metrics, except in the precision of the DENGUE class, though with only a very small difference.</p>
<table-wrap id="T11" position="float">
<label>Table&#xa0;11</label>
<caption>
<p>Comparison between the GBM model with SFA attributes and GBM model with specialist attributes.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Classes</th>
<th valign="top" align="center">Metrics </th>
<th valign="top" align="center">GBM-SFA </th>
<th valign="top" align="center">GBM-Specialist </th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="4" align="left">
<bold>Macro</bold>
</td>
<td valign="top" align="left">Accuracy</td>
<td valign="top" align="center">
<bold>0.6240</bold>
</td>
<td valign="top" align="center">0.6075</td>
</tr>
<tr>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">
<bold>0.6257</bold>
</td>
<td valign="top" align="center">0.6094</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">
<bold>0.6205</bold>
</td>
<td valign="top" align="center">0.6053</td>
</tr>
<tr>
<td valign="top" align="left">F1-score</td>
<td valign="top" align="center">
<bold>0.6196</bold>
</td>
<td valign="top" align="center">0.6021</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">
<bold>DENGUE</bold>
</td>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">
<bold>0.4870</bold>
</td>
<td valign="top" align="center">0.4600</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">0.5949</td>
<td valign="top" align="center">
<bold>0.5993</bold>
</td>
</tr>
<tr>
<td valign="top" align="left">F1-score</td>
<td valign="top" align="center">
<bold>0.5356</bold>
</td>
<td valign="top" align="center">0.5204</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">
<bold>CHIKUNGUNYA</bold>
</td>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">
<bold>0.7787</bold>
</td>
<td valign="top" align="center">0.7658</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">
<bold>0.6561</bold>
</td>
<td valign="top" align="center">0.6313</td>
</tr>
<tr>
<td valign="top" align="left">F1-score</td>
<td valign="top" align="center">
<bold>0.7122</bold>
</td>
<td valign="top" align="center">0.6921</td>
</tr>
<tr>
<td valign="top" rowspan="3" align="left">
<bold>OTHERS</bold>
</td>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">
<bold>0.6115</bold>
</td>
<td valign="top" align="center">0.6025</td>
</tr>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">
<bold>0.6104</bold>
</td>
<td valign="top" align="center">0.5860</td>
</tr>
<tr>
<td valign="top" align="left">F1-score</td>
<td valign="top" align="center">
<bold>0.6110</bold>
</td>
<td valign="top" align="center">0.5941</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>In bold: These were the results that stood out, i.e., the highest value of each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s3_3">
<title>3.3 Discussions</title>
<p>The grid search results did not present a large variation, ranging from 57% to 62% accuracy. In addition, none of the models presented accuracy above 70%, which shows the difficulty involved in classifying arboviral diseases using only clinical and socio-demographic data.</p>
<p>It was possible to observe that the DENGUE class was the class with the lowest performance, thus highlighting the difficulty in classifying this disease with the data used. However, there are some reasons that may explain this: <italic>(a)</italic> the classification is a multi-class task, which makes it difficult to generalize the three classes; <italic>(b)</italic> the application of the undersampling technique to balance the data may also have affected the DENGUE class, as at the end of the pre-processing there were almost 18,000 Dengue registers that could have been used, but this number had to be reduced to almost 6,000 due to the low number of Chikungunya registers; <italic>(c)</italic> the lack of laboratory attributes often associated with Dengue, such as leucocyte count, haematocrit or thrombocytes.</p>
<p>Any model intended to predict Dengue would need to be applied earlier in the illness to identify who must be closely monitored for plasma leakage (<xref ref-type="bibr" rid="B24">24</xref>). Yet, in early phases, Dengue is often indistinguishable from other arboviral diseases (<xref ref-type="bibr" rid="B25">25</xref>). Using only clinical and socio-demographic data, as proposed in our work, may be a limitation in Dengue diagnosis. However, in remote areas that lack human and laboratory resources, the models can play an essential role in surveillance by identifying possible epidemics.</p>
<p>The OTHERS class performed a little better than the DENGUE class, but still showed weak performance. In this case, the great diversification may be the main cause, as this class includes all patients who were admitted with a suspected case of arbovirus, but were classified as inconclusive and discarded, so here the patients may include a wide variety of diseases.</p>
<p>The CHIKUNGUNYA class presented the best results in this work, particularly with regard to the recall metric, with over 80%. These results show that, despite the difficulty, it is possible to&#xa0;make a good classification using only clinical and socio-demographic data. As such, our models can be used as a low-cost and rapid alternative, which would be useful in a resource-limited scenario (<xref ref-type="bibr" rid="B10">10</xref>). Note that arthralgia is considered a very common presentation in Chikungunya fever. Together with high fever, it has a specificity of 99.6% and a positive predictive value of 84.6% for infection classification. Since other arbovirus infections such as ZIKV, DENV and other alphaviruses also present with arthralgia, the epidemiological scenario must be considered before classifying every case as CHIKV infection based only on this symptom (<xref ref-type="bibr" rid="B26">26</xref>).</p>
<p>The tree-based models (Adaboost, RF, GBM and Xgboost) presented the best overall results. Tree-based models are generally the best models for problems that use tabular data. The MLP model presented the worst results, and a drop in performance was observed compared to training with grid search, thus indicating a possible underfitting.</p>
<p>The GBM model obtained the best results, and its attributes were analysed by the specialists and a new GBM model was designed. Despite obtaining slightly inferior results, it did have better interpretability for physicians and these results show that the GBM-SFA model is the most accurate model. On the other hand, the GBM-Specialist model is more interpretable and consequently would be more accepted for use by physicians. According to Ozaydin et&#xa0;al. (2021), &#x201c;<italic>interpretability and accuracy may often have to be sacrificed for each other</italic>&#x201d; (
<xref ref-type="bibr" rid="B27">27</xref>). It is useless for a model to be precise if physicians cannot use it daily because they do not trust the model or do not understand the attributes used, for example. In this sense, the GBM-Specialist has a big advantage over the GBM-SFA model and, despite achieving lower performance, the difference in results is around 2% for each metric.</p>
<p>Although multi-class classification better represents real-world problems, most classification techniques are focused on binary classification (<xref ref-type="bibr" rid="B28">28</xref>). This happens, among other factors, due to the high complexity of training a model to be able to generalize more than two classes. This greater complexity contributes to multi-class models having lower performance when compared to binary models.</p>
<p>As future work, we plan to make an ensemble of two binary models, one trained to classify Dengue and another trained to classify Chikungunya. In this way, we can use more data from Dengue notifications, and we believe that with more data for training it is possible to improve the results of the DENGUE class.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Conclusions</title>
<p>Arboviruses are diseases that have similar symptoms, which makes it difficult to make decisions regarding their treatment. For this reason, the correct classification of arboviral diseases when the patient arrives for treatment becomes a very useful tool in the daily life of hospitals. To help solve this problem, ML models were proposed for multi-class classification of Dengue, Chikungunya, and other common illnesses in Brazil, using only clinical and socio-demographic data.</p>
<p>In this work, seven ML models were evaluated: Adaboost, RF, GBM, Xgboost, KNN, MLP and NB. A grid search was executed for each model along with a SFA technique for optimization of the hyperparameters and attribute selection. The tree-based models (Adaboost, RF, GBM and Xgboost) presented the best overall results. The MLP model presented the worst results, and a drop in performance was observed compared to training with grid search, indicating a possible underfitting. The GBM model, named GBM-SFA, obtained the best results and its attributes were analysed by the specialists and a new GBM model was designed and named the GBM-specialist model.</p>
<p>When comparing the metrics of the GBM-SFA and GBM-specialist models for classification of both classes, the GBM-SFA outperformed the GBM-specialist model, showing that despite professionals being specialists in the field of infectious diseases, the difficulty and limitations of human clinical diagnosis of these arboviruses are real, as the signs and symptoms are very similar and arboviruses circulate concomitantly in Brazil (<xref ref-type="bibr" rid="B29">29</xref>&#x2013;<xref ref-type="bibr" rid="B31">31</xref>).</p>
<p>The models evaluated in this work showed high sensitivity rates in relation to the CHIKUNGUNYA class. However, more sensitive ML models could aid in the identification and classification of arbovirus cases, and provide clinicians with a diagnostic tool based on real data that would complement clinical judgment, as well as being an effective surveillance tool in a pre-epidemic period. More specific models should be explored to identify laboratory-confirmed arbovirus cases during peak and post-peak periods, as the number of cases increases dramatically during these periods (<xref ref-type="bibr" rid="B17">17</xref>).</p>
<p>Our results showed that, despite the difficulty, it is possible to make a good classification using only clinical and socio-demographic data. Our models can be used as a low-cost and quick alternative, and would be useful in a scenario of limited resources in which only information from the patient that is obtained at the health unit is available.</p>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data Availability Statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <uri xlink:href="https://data.mendeley.com/datasets/bv26kznkjs/1">https://data.mendeley.com/datasets/bv26kznkjs/1</uri>.</p>
</sec>
<sec id="s6" sec-type="ethics-statement">
<title>Ethics Statement</title>
<p>All methods were performed in accordance with the Brazilian regulations that do not require consent for studies using unidentified data from the Brazilian data health systems.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author Contributions</title>
<p>TT, SN, IT, and PE conceived the methodology and the design of the experiments of the work. TT, SN, and IT performed the pre-processing of the data set and the experiments with the ML models. SO, MA, and VS performed the statistical analysis. PE analysed the experiments and the statistical analysis. SO, MA, and VS acted as health specialists. All authors contributed to the writing and review of the manuscript, read and approved the submitted version.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>VS has a grant (062.00249/2020 [EDITAL N. 006/2019 - UNIVERSAL AMAZONAS]) from Funda&#xe7;&#xe3;o de Amparo &#xe0; Pesquisa do Estado do Amazonas (FAPEAM) (<uri xlink:href="http://www.fapeam.am.gov.br/">http://www.fapeam.am.gov.br/</uri>). The sponsor had no role in the study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p>
</sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<ack>
<title>Acknowledgments</title>
<p>The authors would like to thank Conselho Nacional de Desenvolvimento Cient&#xed;fico e Tecnol&#xf3;gico (CNPq); Funda&#xe7;&#xe3;o de Amparo &#xe0; Pesquisa do Estado do Amazonas (FAPEAM); Funda&#xe7;&#xe3;o de Vigil&#xe2;ncia em Sa&#xfa;de Dra. Rosemary Costa Pinto; Funda&#xe7;&#xe3;o de Amparo a Ci&#xea;ncia e Tecnologia do Estado de Pernambuco (FACEPE); and Universidade de Pernambuco (UPE), an entity of the Government of the State of Pernambuco focused on the promotion of Teaching, Research and Extension.</p>
</ack>
<sec id="s11" sec-type="supplementary-material">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fitd.2021.769968/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fitd.2021.769968/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="DataSheet_1.pdf" id="SM1" mimetype="application/pdf"/>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>
<uri xlink:href="http://www.agenda2030.com.br">http://www.agenda2030.com.br</uri>
</p>
</fn>
<fn id="fn2">
<label>2</label>
<p>
<uri xlink:href="https://www3.paho.org/data/index.php/es/temas/indicadores-dengue.html">https://www3.paho.org/data/index.php/es/temas/indicadores-dengue.html</uri>, accessed Nov 11, 2020</p>
</fn>
<fn id="fn3">
<label>3</label>
<p>
<uri xlink:href="https://www3.paho.org/data/index.php/es/temas/chikv-es.html">https://www3.paho.org/data/index.php/es/temas/chikv-es.html</uri>, accessed Nov 11, 2020</p>
</fn>
<fn id="fn4">
<label>4</label>
<p>
<uri xlink:href="https://aps.bvs.br/aps/qual-a-especificidade-e-sensibilidade-do-teste-rapido-da-dengue-e-que-tipos-existem/">https://aps.bvs.br/aps/qual-a-especificidade-e-sensibilidade-do-teste-rapido-da-dengue-e-que-tipos-existem/</uri>
</p>
</fn>
<fn id="fn5">
<label>5</label>
<p>
<uri xlink:href="http://portalsinan.saude.gov.br/">http://portalsinan.saude.gov.br/</uri>
</p>
</fn>
<fn id="fn6">
<label>6</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html">https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html</uri>
</p>
</fn>
<fn id="fn7">
<label>7</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html">https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html</uri>
</p>
</fn>
<fn id="fn8">
<label>8</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html">https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html</uri>
</p>
</fn>
<fn id="fn9">
<label>9</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html">https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html</uri>
</p>
</fn>
<fn id="fn10">
<label>10</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html">https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html</uri>
</p>
</fn>
<fn id="fn11">
<label>11</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html">https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html</uri>
</p>
</fn>
<fn id="fn12">
<label>12</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html">https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html</uri>
</p>
</fn>
<fn id="fn13">
<label>13</label>
<p>
<uri xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html">https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html</uri>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lopes</surname> <given-names>N</given-names>
</name>
<name>
<surname>Nozawa</surname> <given-names>C</given-names>
</name>
<name>
<surname>Linhares</surname> <given-names>REC</given-names>
</name>
</person-group>. <article-title>Caracter&#xed;sticas Gerais E Epidemiologia Dos Arbov&#xed;rus Emergentes No Brasil</article-title>. <source>Rev Pan-Amaz&#xf4;nica Sa&#xfa;de</source> (<year>2014</year>) <volume>5</volume>:<page-range>10&#x2013;0</page-range>. doi: <pub-id pub-id-type="doi">10.5123/S2176-62232014000300007</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lima-Camara</surname> <given-names>TN</given-names>
</name>
</person-group>. <article-title>Emerging Arboviruses and Public Health Challenges in Brazil</article-title>. <source>Rev Saude Publica</source> (<year>2016</year>) <volume>50</volume>:<fpage>36</fpage>. doi: <pub-id pub-id-type="doi">10.1590/S1518-8787.2016050006791</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kikuti</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tauro</surname> <given-names>LB</given-names>
</name>
<name>
<surname>Moreira</surname> <given-names>PS</given-names>
</name>
<name>
<surname>Nascimento</surname> <given-names>LCJ</given-names>
</name>
<name>
<surname>Portilho</surname> <given-names>MM</given-names>
</name>
<name>
<surname>Soares</surname> <given-names>GC</given-names>
</name>
<etal/>
</person-group>. <article-title>Evaluation of Two Commercially Available Chikungunya Virus Igm Enzyme-Linked Immunoassays (Elisa) in a Setting of Concomitant Transmission of Chikungunya, Dengue and Zika Viruses</article-title>. <source>Int J Infect Dis</source> (<year>2020</year>) <volume>91</volume>:<fpage>38</fpage>&#x2013;<lpage>43</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijid.2019.11.001</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Figueiredo</surname> <given-names>LTM</given-names>
</name>
</person-group>. <article-title>Emergent Arboviruses in Brazil</article-title>. <source>Rev da Sociedade Bras Med Trop</source> (<year>2007</year>) <volume>40</volume>:<page-range>224&#x2013;9</page-range>. doi: <pub-id pub-id-type="doi">10.1590/S0037-86822007000200016</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bulbul</surname> <given-names>HI</given-names>
</name>
<name>
<surname>Unsal</surname> <given-names>&#xd6;</given-names>
</name>
</person-group>. <article-title>Comparison of Classification Techniques Used in Machine Learning as Applied on Vocational Guidance Data</article-title>. In: <source>2011 10th International Conference on Machine Learning and Applications and Workshops</source>, vol. <volume>2</volume>. <publisher-loc>Ankara, Turkey</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2011</year>). p. <fpage>298</fpage>&#x2013;<lpage>301</lpage>.</citation>
</ref>
<ref id="B6">
<label>6</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fahmi</surname> <given-names>A</given-names>
</name>
<name>
<surname>Purwitasari</surname> <given-names>D</given-names>
</name>
<name>
<surname>Sumpeno</surname> <given-names>S</given-names>
</name>
<name>
<surname>Purnomo</surname> <given-names>MH</given-names>
</name>
</person-group>. <article-title>Performance Evaluation of Classifiers for Predicting Infection Cases of Dengue Virus Based on Clinical Diagnosis Criteria</article-title>. In: <source>2020 International Electronics Symposium (IES)</source>. <publisher-loc>Surabaya, Indonesia</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2020</year>). p. <page-range>456&#x2013;62</page-range>.</citation>
</ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Thitiprayoonwongse</surname> <given-names>D</given-names>
</name>
<name>
<surname>Suriyaphol</surname> <given-names>P</given-names>
</name>
<name>
<surname>Soonthornphisaj</surname> <given-names>N</given-names>
</name>
</person-group>. <article-title>Data Mining of Dengue Infection Using Decision Tree</article-title>. <source>Entropy</source> (<year>2012</year>) <volume>2</volume>:<fpage>2</fpage>.</citation>
</ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hossain</surname> <given-names>MS</given-names>
</name>
<name>
<surname>Sultana</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Nahar</surname> <given-names>L</given-names>
</name>
<name>
<surname>Andersson</surname> <given-names>K</given-names>
</name>
</person-group>. <article-title>An Intelligent System to Diagnose Chikungunya Under Uncertainty</article-title>. <source>J Wireless Mobile Netw Ubiquitous Comput Dependable Appl</source> (<year>2019</year>) <volume>10</volume>:<fpage>37</fpage>&#x2013;<lpage>54</lpage>. doi: <pub-id pub-id-type="doi">10.22667/JOWUA.2019.06.30.037</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Veiga</surname> <given-names>RV</given-names>
</name>
<name>
<surname>Schuler-Faccini</surname> <given-names>L</given-names>
</name>
<name>
<surname>Fran&#xe7;a</surname> <given-names>GV</given-names>
</name>
<name>
<surname>Andrade</surname> <given-names>RF</given-names>
</name>
<name>
<surname>Teixeira</surname> <given-names>MG</given-names>
</name>
<name>
<surname>Costa</surname> <given-names>LC</given-names>
</name>
<etal/>
</person-group>. <article-title>Classification Algorithm for Congenital Zika Syndrome: Characterizations, Diagnosis and Validation</article-title>. <source>Sci Rep</source> (<year>2021</year>) <volume>11</volume>:<fpage>1</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-021-86361-5</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>VJ</given-names>
</name>
<name>
<surname>Chow</surname> <given-names>A</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>X</given-names>
</name>
<name>
<surname>Carrasco</surname> <given-names>LR</given-names>
</name>
<name>
<surname>Cook</surname> <given-names>AR</given-names>
</name>
<name>
<surname>Lye</surname> <given-names>DC</given-names>
</name>
<etal/>
</person-group>. <article-title>Simple Clinical and Laboratory Predictors of Chikungunya Versus Dengue Infections in Adults</article-title>. <source>PloS Negl Trop Dis</source> (<year>2012</year>) <volume>6</volume>:<fpage>e1786</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pntd.0001786</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Niu</surname> <given-names>L</given-names>
</name>
</person-group>. <article-title>A Survey on Feature Selection</article-title>. <source>Proc Comput Sci</source> (<year>2016</year>) <volume>91</volume>:<page-range>919&#x2013;26</page-range>. doi: <pub-id pub-id-type="doi">10.1016/j.procs.2016.07.111</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Suto</surname> <given-names>J</given-names>
</name>
<name>
<surname>Oniga</surname> <given-names>S</given-names>
</name>
<name>
<surname>Sitar</surname> <given-names>PP</given-names>
</name>
</person-group>. <article-title>Comparison of Wrapper and Filter Feature Selection Algorithms on Human Activity Recognition</article-title>. In: <source>2016 6th International Conference on Computers Communications and Control (ICCCC)</source>. <publisher-loc>Oradea, Romania</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2016</year>). p. <page-range>124&#x2013;9</page-range>.</citation>
</ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bergstra</surname> <given-names>J</given-names>
</name>
<name>
<surname>Bengio</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>Random Search for Hyper-Parameter Optimization</article-title>. <source>J&#xa0;Mach Learn Res</source> (<year>2012</year>) <volume>13</volume>:<fpage>281</fpage>&#x2013;<lpage>305</lpage>.</citation>
</ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname> <given-names>J</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>FR</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>T</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>R</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C</given-names>
</name>
<etal/>
</person-group>. <article-title>A Survey of Machine Learning Techniques Applied to Software Defined Networking (Sdn): Research Issues and Challenges</article-title>. <source>IEEE Commun Surveys Tutorials</source> (<year>2018</year>) <volume>21</volume>:<fpage>393</fpage>&#x2013;<lpage>430</lpage>. doi: <pub-id pub-id-type="doi">10.1109/COMST.2018.2866942</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Potts</surname> <given-names>JA</given-names>
</name>
<name>
<surname>Gibbons</surname> <given-names>RV</given-names>
</name>
<name>
<surname>Rothman</surname> <given-names>AL</given-names>
</name>
<name>
<surname>Srikiatkhachorn</surname> <given-names>A</given-names>
</name>
<name>
<surname>Thomas</surname> <given-names>SJ</given-names>
</name>
<name>
<surname>Supradish</surname> <given-names>P</given-names>
</name>
<etal/>
</person-group>. <article-title>Prediction of Dengue Disease Severity Among Pediatric Thai Patients Using Early Clinical Laboratory Indicators</article-title>. <source>PloS Negl Trop Dis</source> (<year>2010</year>) <volume>4</volume>:<fpage>e769</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pntd.0000769</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gambhir</surname> <given-names>S</given-names>
</name>
<name>
<surname>Malik</surname> <given-names>SK</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>Y</given-names>
</name>
</person-group>. <article-title>The Diagnosis of Dengue Disease: An Evaluation of Three Machine Learning Approaches</article-title>. <source>Int J Healthcare Inf Syst Inf (IJHISI)</source> (<year>2018</year>) <volume>13</volume>:<fpage>1</fpage>&#x2013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.4018/IJHISI.2018070101</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ho</surname> <given-names>TS</given-names>
</name>
<name>
<surname>Weng</surname> <given-names>TC</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>JD</given-names>
</name>
<name>
<surname>Han</surname> <given-names>HC</given-names>
</name>
<name>
<surname>Cheng</surname> <given-names>HC</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>CC</given-names>
</name>
<etal/>
</person-group>. <article-title>Comparing Machine Learning With Case-Control Models to Identify Confirmed Dengue Cases</article-title>. <source>PLoS Negl Trop Dis</source> (<year>2020</year>) <volume>14</volume>:<fpage>e0008843</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pntd.0008843</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Faisal</surname> <given-names>T</given-names>
</name>
<name>
<surname>Ibrahim</surname> <given-names>F</given-names>
</name>
<name>
<surname>Taib</surname> <given-names>MN</given-names>
</name>
</person-group>. <article-title>A Noninvasive Intelligent Approach for Predicting the Risk in Dengue Patients</article-title>. <source>Expert Syst Appl</source> (<year>2010</year>) <volume>37</volume>:<page-range>2175&#x2013;81</page-range>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2009.07.060</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<collab>Recife Health Department</collab>
</person-group>. <article-title>Cases of Dengue, Zika and Chikungunya</article-title>. (<year>2011</year>). Available at: <uri xlink:href="http://dados.recife.pe.gov.br/dataset/casos-de-dengue-zika-e-chikungunya">http://dados.recife.pe.gov.br/dataset/casos-de-dengue-zika-e-chikungunya</uri>.</citation>
</ref>
<ref id="B20">
<label>20</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Prusa</surname> <given-names>J</given-names>
</name>
<name>
<surname>Khoshgoftaar</surname> <given-names>TM</given-names>
</name>
<name>
<surname>Dittman</surname> <given-names>DJ</given-names>
</name>
<name>
<surname>Napolitano</surname> <given-names>A</given-names>
</name>
</person-group>. <article-title>Using Random Undersampling to Alleviate Class Imbalance on Tweet Sentiment Data</article-title>. In: <source>2015 IEEE International Conference on Information Reuse and Integration</source>. <publisher-loc>San Francisco, USA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2015</year>). p. <fpage>197</fpage>&#x2013;<lpage>202</lpage>.</citation>
</ref>
<ref id="B21">
<label>21</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tabosa</surname> <given-names>T</given-names>
</name>
<name>
<surname>Neto</surname> <given-names>SS</given-names>
</name>
<name>
<surname>Teixeira</surname> <given-names>I</given-names>
</name>
<name>
<surname>Oliveira</surname> <given-names>S</given-names>
</name>
<name>
<surname>Rodrigues</surname> <given-names>MG</given-names>
</name>
<name>
<surname>Sampaio</surname> <given-names>V</given-names>
</name>
<etal/>
</person-group>. <article-title>Clinical Cases of Dengue and Chikungunya</article-title>. (<year>2021</year>). doi:&#xa0;<pub-id pub-id-type="doi">10.17632/bv26kznkjs.1</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Berrar</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>Cross-Validation</article-title>. In: <person-group person-group-type="editor">
<name>
<surname>Ranganathan</surname> <given-names>S</given-names>
</name>
<name>
<surname>Gribskov</surname> <given-names>M</given-names>
</name>
<name>
<surname>Nakai</surname> <given-names>K</given-names>
</name>
<name>
<surname>Sch&#xf6;nbach</surname> <given-names>C</given-names>
</name>
</person-group>, editors. <source>Encyclopedia of Bioinformatics and Computational Biology</source>. <publisher-loc>Oxford</publisher-loc>: <publisher-name>Academic Press</publisher-name> (<year>2019</year>). p. <page-range>542&#x2013;5</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/B978-0-12-809633-8.20349-X</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kavitha</surname> <given-names>V</given-names>
</name>
<name>
<surname>Chetan</surname> <given-names>H</given-names>
</name>
</person-group>. <article-title>Performance Dependency of Facial Emotion Recognition System on Dropout and Learning Rate</article-title>. In: <source>2020 3rd International Conference on Intelligent Sustainable Systems (ICISS)</source>. <publisher-loc>Thoothukudi, India</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2020</year>). p. <fpage>71</fpage>&#x2013;<lpage>81</lpage>.</citation>
</ref>
<ref id="B24">
<label>24</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Potts</surname> <given-names>JA</given-names>
</name>
<name>
<surname>Rothman</surname> <given-names>AL</given-names>
</name>
</person-group>. <article-title>Clinical and Laboratory Features That Distinguish Dengue From Other Febrile Illnesses in Endemic Populations</article-title>. <source>Trop Med Int Health</source> (<year>2008</year>) <volume>13</volume>:<page-range>1328&#x2013;40</page-range>. doi: <pub-id pub-id-type="doi">10.1111/j.1365-3156.2008.02151.x</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25</label>
<citation citation-type="book">
<person-group person-group-type="author">
<collab>World Health Organization, Special Programme for Research and Training in Tropical Diseases, World Health Organization Department of Control of Neglected Tropical Diseases, World Health Organization Epidemic and Pandemic Alert</collab>
</person-group>. <source>Dengue: Guidelines for Diagnosis, Treatment, Prevention and Control</source>. <publisher-loc>Geneva, Switzerland</publisher-loc>: <publisher-name>World Health Organization</publisher-name> (<year>2009</year>).</citation>
</ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goupil</surname> <given-names>BA</given-names>
</name>
<name>
<surname>Mores</surname> <given-names>CN</given-names>
</name>
</person-group>. <article-title>A Review of Chikungunya Virus-Induced Arthralgia: Clinical Manifestations, Therapeutics, and Pathogenesis</article-title>. <source>Open Rheumatol J</source> (<year>2016</year>) <volume>10</volume>:<fpage>129</fpage>. doi: <pub-id pub-id-type="doi">10.2174/1874312901610010129</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ozaydin</surname> <given-names>B</given-names>
</name>
<name>
<surname>Berner</surname> <given-names>ES</given-names>
</name>
<name>
<surname>Cimino</surname> <given-names>JJ</given-names>
</name>
</person-group>. <article-title>Appropriate Use of Machine Learning in Healthcare</article-title>. <source>Intelligence-Based Med</source> (<year>2021</year>) <volume>5</volume>:<elocation-id>100041</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ibmed.2021.100041</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cruz</surname> <given-names>EAS</given-names>
</name>
<name>
<surname>Franco</surname> <given-names>CHE</given-names>
</name>
</person-group>. <article-title>Challenges of Multivariable and Multiclass Classification Problems</article-title>. (<year>2013</year>). Available at: <uri xlink:href="https://www.udi.edu.co/congreso/historial/congreso_2016/ponencias/Ingenieria_Electronica/Challenges_of_multivariable_and_multiclass.pdf">https://www.udi.edu.co/congreso/historial/congreso_2016/ponencias/Ingenieria_Electronica/Challenges_of_multivariable_and_multiclass.pdf</uri>.</citation>
</ref>
<ref id="B29">
<label>29</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paix&#xe3;o</surname> <given-names>ES</given-names>
</name>
<name>
<surname>Teixeira</surname> <given-names>MG</given-names>
</name>
<name>
<surname>Rodrigues</surname> <given-names>LC</given-names>
</name>
</person-group>. <article-title>Zika, Chikungunya and Dengue: The Causes and Threats of New and Re-Emerging Arboviral Diseases</article-title>. <source>BMJ Global Health</source> (<year>2018</year>) <volume>3</volume>:<elocation-id>e000530</elocation-id>. doi: <pub-id pub-id-type="doi">10.1136/bmjgh-2017-000530</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Donalisio</surname> <given-names>MR</given-names>
</name>
<name>
<surname>Freitas</surname> <given-names>ARR</given-names>
</name>
<name>
<surname>Zuben</surname> <given-names>APBV</given-names>
</name>
</person-group>. <article-title>Arboviroses Emergentes No Brasil: Desafios Para a Cl&#xed;nica E Implica&#xe7;&#xf5;es Para a Sa&#xfa;de P&#xfa;blica</article-title>. <source>Rev Sa&#xfa;de P&#xfa;blica</source> (<year>2017</year>) <volume>51</volume>. doi: <pub-id pub-id-type="doi">10.1590/S1518-8787.2017051006889</pub-id>
</citation>
</ref>
<ref id="B31">
<label>31</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vasconcelos</surname> <given-names>PF</given-names>
</name>
<name>
<surname>Calisher</surname> <given-names>CH</given-names>
</name>
</person-group>. <article-title>Emergence of Human Arboviral Diseases in the Americas, 2000&#x2013;2016</article-title>. <source>Vector-Borne Zoonotic Dis</source> (<year>2016</year>) <volume>16</volume>:<fpage>295</fpage>&#x2013;<lpage>301</lpage>. doi: <pub-id pub-id-type="doi">10.1089/vbz.2016.1952</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>