<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurol.</journal-id>
<journal-title>Frontiers in Neurology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurol.</abbrev-journal-title>
<issn pub-type="epub">1664-2295</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fneur.2021.649521</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neurology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Prediction of 30-Day Readmission After Stroke Using Machine Learning and Natural Language Processing</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Lineback</surname> <given-names>Christina M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1189430/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Garg</surname> <given-names>Ravi</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Oh</surname> <given-names>Elissa</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Naidech</surname> <given-names>Andrew M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/803285/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Holl</surname> <given-names>Jane L.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Prabhakaran</surname> <given-names>Shyam</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/19602/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Neurology, Feinberg School of Medicine, Northwestern University</institution>, <addr-line>Chicago, IL</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Neurology, Biological Sciences, Division and Center for Healthcare Delivery Science and Innovation, University of Chicago</institution>, <addr-line>Chicago, IL</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Neurology, University of Chicago</institution>, <addr-line>Chicago, IL</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Thanh G. Phan, Monash Health, Australia</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Seana Gall, University of Tasmania, Australia; Aladine Elsamadicy, Yale University, United States</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Shyam Prabhakaran <email>shyam1&#x00040;neurology.bsd.uchicago.edu</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Stroke, a section of the journal Frontiers in Neurology</p></fn></author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>07</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>649521</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>01</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>06</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2021 Lineback, Garg, Oh, Naidech, Holl and Prabhakaran.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Lineback, Garg, Oh, Naidech, Holl and Prabhakaran</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract><p><bold>Background and Purpose:</bold> This study aims to determine whether machine learning (ML) and natural language processing (NLP) from electronic health records (EHR) improve the prediction of 30-day readmission after stroke.</p>
<p><bold>Methods:</bold> Among index stroke admissions between 2011 and 2016 at an academic medical center, we abstracted discrete data from the EHR on demographics, risk factors, medications, hospital complications, and discharge destination and unstructured textual data from clinician notes. Readmission was defined as any unplanned hospital admission within 30 days of discharge. We developed models to predict two separate outcomes, as follows: (1) 30-day all-cause readmission and (2) 30-day stroke readmission. We compared the performance of logistic regression with advanced ML algorithms. We used several NLP methods to generate additional features from unstructured textual reports. We evaluated the performance of prediction models using five-fold cross-validation and tested the best model in a held-out test dataset. Areas under the curve (AUCs) were used to compare discrimination of each model.</p>
<p><bold>Results:</bold> In a held-out test dataset, advanced ML methods along with NLP features outperformed logistic regression for all-cause readmission (AUC, 0.64 vs. 0.58; <italic>p</italic> &#x0003C; 0.001) and stroke readmission prediction (AUC, 0.62 vs. 0.52; <italic>p</italic> &#x0003C; 0.001).</p>
<p><bold>Conclusion:</bold> NLP-enhanced machine learning models potentially advance our ability to predict readmission after stroke. However, further improvement is necessary before being implemented in clinical practice given the weak discrimination.</p></abstract>
<kwd-group>
<kwd>stroke</kwd>
<kwd>readmission</kwd>
<kwd>machine learning</kwd>
<kwd>natural language processing</kwd>
<kwd>bioinformatics</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="34"/>
<page-count count="8"/>
<word-count count="5170"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>Introduction</title>
<p>Nearly 800,000 patients experience a stroke each year in the USA (<xref ref-type="bibr" rid="B1">1</xref>). The cost of initial admissions for stroke averages US$20,000 while readmissions cost on average US$10,000 (<xref ref-type="bibr" rid="B1">1</xref>&#x02013;<xref ref-type="bibr" rid="B3">3</xref>). Reduction in readmission is, thus, an important target to reduce healthcare costs and improve patient care. However, several studies have demonstrated that available prediction models for readmission perform modestly (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>). A better understanding of the causes leading to readmission and better prediction tools may allow hospital systems to better allocate resources to the patients who are most at risk for readmission (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>).</p>
<p>Prior efforts to stratify risk of readmission have utilized basic statistical models, such as logistic regression, with modest results (AUC range: 0.53&#x02013;0.67) (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>). However, these studies do not report results on a separate held-out dataset, thereby not addressing the generalizability of these results. Also, since these methods are trained and validated on the same datasets, the results are highly prone to being inflated due to overfitting. Furthermore, logistic regression-based models are incapable of properly weighing the interactions between the complex variables in additive analyses (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B9">9</xref>).</p>
<p>Machine learning (ML) (<xref ref-type="bibr" rid="B10">10</xref>) has emerged as a new statistical approach to overcome the limitation of non-linearity and improve predictive analysis in healthcare. Advanced ML methods have been shown to be superior for predicting readmission in patients with heart failure (<xref ref-type="bibr" rid="B11">11</xref>). Furthermore, natural language processing (NLP) methods can be utilized to automatically extract much of the rich but difficult-to-access medical information that is often buried in unstructured text notes within electronic health records (EHR). There has been widespread interest in using ML in conjunction with NLP to build clinical tools for cohort construction, clinical trials, and clinical decision support (<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B12">12</xref>). There has been, however, no study using NLP of clinical notes and ML to predict readmissions after stroke. We, therefore, sought to evaluate advanced ML algorithms that incorporate NLP features of textual data in the EHR to improve prediction of 30-day readmission after stroke. We also sought to evaluate our models on a separate held-out dataset in order to test the generalizability of our results.</p></sec>
<sec sec-type="methods" id="s2">
<title>Methods</title>
<sec>
<title>Cohort</title>
<p>Using the Northwestern Medicine Enterprise Data Warehouse (NM-EDW), a database that collects and integrates data from the EHR at Northwestern Medicine Healthcare (NMHC) system practice settings, we identified stroke patients hospitalized at Northwestern Memorial Hospital between January 1, 2011 and December 31, 2015. Inclusion criteria were age &#x0003E;18 years old. We defined stroke by ICD-9 codes 430&#x02013;436 for hemorrhagic and ischemic stroke, excluding 432.x, and 433.x0, and 435.x for transient ischemic attack or asymptomatic cerebrovascular conditions. We excluded patients who expired during index hospitalization and those with psychiatric admissions due to privacy restrictions on access to this type of data in the EDW.</p></sec>
<sec>
<title>Data Extraction</title>
<p>We obtained discrete structured variables and unstructured free-form text-based clinical notes from the EHR (Cerner, Kansas City, MO) pertaining to the index stroke hospitalization for all patients meeting study criteria from the EDW. The EDW currently contains clinical data on nearly 6.2 million patients dating back to the 1970s, which can be easily queried at the individual patient level or for aggregate data and can link laboratory tests, procedures, therapies, and clinical data with clinical outcomes at specific points in time.</p>
<p>For discrete variables, we recorded demographics (age, sex, race, ethnicity, insurance status, marital status, smoking status), comorbidities based on ICD-9/10 codes (prior stroke, prior transient ischemic attack (TIA), hypertension, diabetes, coronary artery disease, hyper/dyslipidemia, atrial fibrillation, chronic obstructive pulmonary disease, hypothyroidism, dementia, end stage renal disease, cancer, valvular heart disease, congestive heart disease, prior coronary stent or bypass), prior healthcare utilization (number of ED visits and number of hospitalizations in the preceding year), stroke type (hemorrhagic vs. ischemic), length of stay, index hospital stay complications (pneumonia, mechanical ventilation, and percutaneous gastrostomy tube placement), discharge disposition, and discharge medications (e.g., anticoagulants). For non-discrete variables (e.g., text), a data analyst extracted the notes from the EDW. We included only a small appropriate subset of report types to identify potential predictors of readmission: admission, progress, consultation, and discharge notes. We pre-processed them to make them usable for machine learning and combined the raw text data with the discrete data, linking by a common identifier.</p></sec>
<sec>
<title>Feature Selection</title>
<p>A feature is an individual measurable property or characteristic of a phenomenon being observed. We built different feature sets for our predictive models. First, we compiled discrete features, some of which were used previously in studies of readmission after stroke (<xref ref-type="table" rid="T1">Table 1</xref>). We then extracted these features from the structured data, when available, in the EDW. These 35 discrete features formed the first feature set. We ranked each feature based on its importance using feature importance methods. Specifically, we used xgboost in order to find out the importance of each feature.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>List of discrete features extracted from enterprise data warehouse.</p></caption>
<table frame="hsides" rules="groups">
<tbody><tr>
<td valign="top" align="left">Demographics</td>
<td valign="top" align="left">Age, gender, race, ethnicity, marital status, and insurance status</td>
</tr>
<tr>
<td valign="top" align="left">Risk factors</td>
<td valign="top" align="left">Hypertension, diabetes mellitus, atrial fibrillation, prior stroke, coronary artery disease, congestive heart failure, valvular heart disease, coronary artery bypass graft/stent, end-stage renal disease, hypothyroidism, dementia, cancer, chronic lung disease, and smoking status</td>
</tr>
<tr>
<td valign="top" align="left">Index stroke encounter characteristics</td>
<td valign="top" align="left">Primary stroke type, initial NIHSS score, initial GCS score, in-hospital pneumonia, medications (e.g., anticoagulants) at discharge, percutaneous endoscopic gastrostomy, mechanical ventilation, intensive care unit stay, and discharge destination</td>
</tr>
<tr>
<td valign="top" align="left">Other baseline factors</td>
<td valign="top" align="left">Miles from residence to hospital, frequency of hospital admissions in preceding year, and frequency of stroke admissions in preceding year</td>
</tr>
<tr>
<td/>
</tr>
</tbody>
</table>
</table-wrap>
<p>Next, we constructed three different types of NLP features from the unstructured clinical notes. To do that, we first pre-processed the notes to remove language abnormalities and make them usable for feature extraction. Specifically, we lowercased the text and removed punctuation, stop words, and non-alphanumeric words. We aggregated all the reports for each patient and then created a large corpus of all the aggregated reports from all the patients. We then created a token dictionary of all the unique important terms from the corpus. We experimented with unigrams, bigrams, trigrams, and noun phrases; however, we found the combination of unigrams and bigrams to work best. An <italic>n</italic>-gram is a sequence of <italic>n</italic> consecutive words within a given window (for example, <italic>n</italic> = 1 is a unigram, <italic>n</italic> = 2 is a bigram, <italic>n</italic> = 3 is a trigram, and so on).</p>
<p>For our first set of NLP features, using the token dictionary, we transformed the corpus to a patient-token matrix in which each token (unigram or bigram) is represented by term-frequency-inverse document frequency (tf-idf). Next, we used logistic regression with &#x0201C;l1&#x0201D; penalty (LASSO) to reduce the large dimensionality of features (<xref ref-type="bibr" rid="B13">13</xref>). The LASSO method puts a constraint on the sum of the parameter coefficient and applies shrinking (regularization) to penalize the coefficient of non-essential features to zero. We filtered all the non-zero coefficient features and used them as our second set of features.</p>
<p>For the second set of NLP features, on the patient-token matrix, we applied principal component analysis (PCA) (<xref ref-type="bibr" rid="B14">14</xref>) and constructed a graph of the variance by cumulative number of principal components. This graph provided us with the most effective number of principal components that explained the most variance in the data set. We then selected these principal components to form our third set of features.</p>
<p>For the final set of NLP features, we ran word2vec (<xref ref-type="bibr" rid="B15">15</xref>) on the text corpus to learn word vectors for each token in our dictionary. We used the gensim (<xref ref-type="bibr" rid="B16">16</xref>) package and the continuous bag-of-words approach with standard parameters for running the word2vec algorithm. Next, to construct a patient vector, we summed all the individual token vectors for each token present in each patient&#x00027;s report. Doing this, each patient is then represented by a single vector, which formed our fourth and final set of features.</p></sec>
<sec>
<title>Definition of Outcomes</title>
<p>Readmission was defined as any unplanned inpatient hospitalization for any cause after index stroke hospitalization discharge. We excluded planned or scheduled readmissions, emergency department visits without admission, and observation visits. Using the date of index stroke hospital discharge and date of readmission, we identified unplanned readmissions occurring within 30 days of hospital discharge.</p></sec>
<sec>
<title>Predictive Models</title>
<p>We developed models to predict two separate outcomes: (1) 30-day all-cause readmission and (2) 30-day stroke readmission. For each of these outcomes, we trained different predictive models and compared them with each other. In addition, we also used different types of features for each of the predictive models as discussed above. Thus, our study not only evaluates the performance of different predictive algorithms but also the added value of different types of features. We trained a number of different base predictive models as well as several hierarchical predictive models to enhance predictive performance. The base models included logistic regression (<xref ref-type="bibr" rid="B17">17</xref>), na&#x000EF;ve Bayes (<xref ref-type="bibr" rid="B18">18</xref>), support vector machines (<xref ref-type="bibr" rid="B19">19</xref>), random forests (<xref ref-type="bibr" rid="B18">18</xref>), gradient boosting machines (<xref ref-type="bibr" rid="B20">20</xref>), and finally extreme gradient boosting (XGBoost) (<xref ref-type="bibr" rid="B21">21</xref>). We trained each of these models for each of the feature types and compared the performance across multiple models.</p>
<p>For our first hierarchical model (<xref ref-type="fig" rid="F1">Figure 1</xref>), we combined all the features in the dataset to form a &#x0201C;super&#x0201D; feature set and then trained each of the base models on top of it. In addition, we combined the results from each of these base models and using those as features, we trained another meta-classifier model. We experimented with logistic regression as well as XGBoost for meta-classifier, but we found logistic regression to perform better. We designated this model a feature ensemble model.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Description of feature ensemble method.</p></caption>
<graphic xlink:href="fneur-12-649521-g0001.tif"/>
</fig>
<p>Next, for our final model (<xref ref-type="fig" rid="F2">Figure 2</xref>), instead of combining all the features, we concatenated results from the best performing model on individual features. We used the predictions from each of these models as features to train a meta-classifier. This technique is known as stacking (<xref ref-type="bibr" rid="B22">22</xref>) wherein outputs from base predictive models are combined to form a feature set which is then used to train another level 2 classifier. We designated this method a classifier ensemble model.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Description of classifier ensemble method.</p></caption>
<graphic xlink:href="fneur-12-649521-g0002.tif"/>
</fig></sec>
<sec>
<title>Validation and Evaluation</title>
<p>To avoid over-fitting, we performed five-fold cross-validation (<xref ref-type="bibr" rid="B23">23</xref>). Cross-validation, also called rotation estimation, is a technique to evaluate predictive models by partitioning the original sample into a training set to train the model and a validation set to evaluate it. In <italic>k</italic>-fold cross-validation, the original sample is randomly partitioned into <italic>k</italic> equal size subsamples. Of the <italic>k</italic> subsamples, a single subsample is retained as the validation data for testing the model, and the remaining <italic>k</italic>-1 subsamples are used as training data. The cross-validation process is then repeated <italic>k</italic> times (the folds), with each of the <italic>k</italic> subsamples used exactly once as the validation data. The results from the folds can then be averaged (or otherwise combined) to produce a single estimation. We also performed hyper-parameter tuning for our base model within each fold using &#x0201C;hyperopt&#x0201D; python package (<xref ref-type="bibr" rid="B24">24</xref>).</p>
<p>In order to test the true generalizability of our results, we obtained another dataset spanning from January 1, 2016 to December 31, 2016. We pre-processed it in the same way as the training data used for five-fold cross-validation. Next, we trained the best performing models for both outcomes on all the training data and applied the trained models to the test dataset to generate final predictions. We also bootstrapped the test dataset over 50 iterations to generate confidence intervals.</p>
<p>To evaluate the performance of each model, we estimated area under the curve or AUCs from receiver operating characteristic curve analysis. We also compared the best performing model with the baseline logistic regression model of discrete variables alone. <italic>p</italic>-values &#x0003C; 0.05 were considered significant in all analyses.</p></sec>
<sec>
<title>Interpretability of NLP Features</title>
<p>To evaluate which NLP-based features were helpful in the prediction model, we ranked the bag of words features according to the feature importance given by the model.</p></sec>
<sec>
<title>Standard Protocol Approvals, Registrations, and Patient Consents</title>
<p>This study was approved by the Institutional Review Board of Northwestern University. Informed consent was waived for this retrospective data analysis.</p></sec>
<sec>
<title>Data Availability</title>
<p>All data not presented in this paper will be made available in a trusted data repository or shared at the request of other investigators for purposes of replicating procedures and results.</p></sec></sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<p>After pre-processing and combining various data files, we had 2,305 patients for training and 550 patients for testing. The mean age for training cohort and testing cohort was 64.4 and 64.8 years, respectively. The training and testing datasets were similar except the testing set contained more Hispanic, government-insured, married, hypertensive, cardiac disease, and intracerebral hemorrhage patients with more ICU days; the testing set also contained more patients who required acute inpatient rehabilitation at discharge (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Baseline characteristics of the training cohort (<italic>n</italic> = 2,305) and testing cohort (<italic>n</italic> = 550).</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Characteristic</bold></th>
<th valign="top" align="center"><bold>Training cohort</bold></th>
<th valign="top" align="center"><bold>Testing cohort</bold></th>
<th valign="top" align="center"><bold><italic>P</italic>-value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Mean age in years (SD)</td>
<td valign="top" align="center">64.4 (16.4)</td>
<td valign="top" align="center">64.8 (15.1)</td>
<td valign="top" align="center">0.90</td>
</tr>
<tr>
<td valign="top" align="left">Male sex [<italic>n</italic> (%)]</td>
<td valign="top" align="center">1,156 (50.2)</td>
<td valign="top" align="center">297 (54)</td>
<td valign="top" align="center">0.11</td>
</tr>
<tr>
<td valign="top" align="left" colspan="4"><bold>Race [</bold><italic><bold>n</bold></italic> <bold>(%)]</bold></td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;White</td>
<td valign="top" align="center">1,156 (50.2)</td>
<td valign="top" align="center">284 (51.6)</td>
<td valign="top" align="center">0.09</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Black</td>
<td valign="top" align="center">613 (26.6)</td>
<td valign="top" align="center">138 (25)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Asian</td>
<td valign="top" align="center">78 (3.4)</td>
<td valign="top" align="center">13 (2.4)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;American Indian or Alaskan Native</td>
<td valign="top" align="center">4 (0.2)</td>
<td valign="top" align="center">4 (0.7)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Native Hawaiian/Pacific Islander</td>
<td valign="top" align="center">4 (0.2)</td>
<td valign="top" align="center">3 (0.5)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Declined, missing, or unknown</td>
<td valign="top" align="center">233 (10.1)</td>
<td valign="top" align="center">63 (11.4)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Other</td>
<td valign="top" align="center">217 (9.4)</td>
<td valign="top" align="center">45 (8.1)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Hispanic [<italic>n</italic> (%)]</td>
<td valign="top" align="center">164 (7.1)</td>
<td valign="top" align="center">63 (11.4)</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left" colspan="4"><bold>Marital status [</bold><italic><bold>n</bold></italic> <bold>(%)]</bold></td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Married</td>
<td valign="top" align="center">1,001 (43.4)</td>
<td valign="top" align="center">265 (48.1)</td>
<td valign="top" align="center">0.02</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Widowed</td>
<td valign="top" align="center">253 (11.0)</td>
<td valign="top" align="center">45 (8.1)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Single</td>
<td valign="top" align="center">759 (32.9)</td>
<td valign="top" align="center">157 (28.5)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Divorced</td>
<td valign="top" align="center">142 (6.2)</td>
<td valign="top" align="center">33 (6)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Separated</td>
<td valign="top" align="center">8 (0.3)</td>
<td valign="top" align="center">1 (0.2)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Unknown, other, or missing</td>
<td valign="top" align="center">142 (6.2)</td>
<td valign="top" align="center">49 (8.9)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left" colspan="4"><bold>Insurance status [</bold><italic><bold>n</bold></italic> <bold>(%)]</bold></td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Private</td>
<td valign="top" align="center">833 (36.1)</td>
<td valign="top" align="center">173 (31.5)</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Medicare</td>
<td valign="top" align="center">1,060 (46.0)</td>
<td valign="top" align="center">278 (50.5)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Medicaid</td>
<td valign="top" align="center">182 (7.9)</td>
<td valign="top" align="center">63 (11.5)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Other or self-pay</td>
<td valign="top" align="center">230 (10.0)</td>
<td valign="top" align="center">36 (6.5)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left" colspan="4"><bold>Primary index stroke diagnosis [</bold><italic><bold>n</bold></italic> <bold>(%)]</bold></td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Ischemic stroke</td>
<td valign="top" align="center">1,825 (79.1)</td>
<td valign="top" align="center">416 (75.6)</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Intracerebral hemorrhage</td>
<td valign="top" align="center">257 (11.1)</td>
<td valign="top" align="center">94 (17)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Subarachnoid hemorrhage</td>
<td valign="top" align="center">223 (9.7)</td>
<td valign="top" align="center">40 (7.3)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Hypertension [<italic>n</italic> (%)]</td>
<td valign="top" align="center">1,853 (78.8)</td>
<td valign="top" align="center">466 (84.7)</td>
<td valign="top" align="center">0.01</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Diabetes mellitus [<italic>n</italic> (%)]</td>
<td valign="top" align="center">629 (27.3)</td>
<td valign="top" align="center">179 (32.6)</td>
<td valign="top" align="center">0.13</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Atrial fibrillation [<italic>n</italic> (%)]</td>
<td valign="top" align="center">430 (18.7)</td>
<td valign="top" align="center">111 (20.2)</td>
<td valign="top" align="center">0.42</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Coronary artery disease [<italic>n</italic> (%)]</td>
<td valign="top" align="center">189 (8.2)</td>
<td valign="top" align="center">30 (5.5)</td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Congestive heart failure [<italic>n</italic> (%)]</td>
<td valign="top" align="center">232 (10.1)</td>
<td valign="top" align="center">67 (12.2)</td>
<td valign="top" align="center">0.15</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Valvular heart disease [<italic>n</italic> (%)]</td>
<td valign="top" align="center">42 (1.8)</td>
<td valign="top" align="center">36 (6.5)</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Prior stroke [<italic>n</italic> (%)]</td>
<td valign="top" align="center">218 (9.5)</td>
<td valign="top" align="center">57 (10.3)</td>
<td valign="top" align="center">0.57</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Chronic lung disease [<italic>n</italic> (%)]</td>
<td valign="top" align="center">236 (10.2)</td>
<td valign="top" align="center">48 (8.7)</td>
<td valign="top" align="center">0.29</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Dementia [<italic>n</italic> (%)]</td>
<td valign="top" align="center">149 (6.5)</td>
<td valign="top" align="center">37 (6.7)</td>
<td valign="top" align="center">0.87</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Cancer [<italic>n</italic> (%)]</td>
<td valign="top" align="center">180 (7.8)</td>
<td valign="top" align="center">45 (8.2)</td>
<td valign="top" align="center">0.75</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;End-stage renal disease [<italic>n</italic> (%)]</td>
<td valign="top" align="center">39 (1.7)</td>
<td valign="top" align="center">13 (2.3)</td>
<td valign="top" align="center">0.34</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Hypothyroidism [<italic>n</italic> (%)]</td>
<td valign="top" align="center">270 (11.7)</td>
<td valign="top" align="center">56 (10.2)</td>
<td valign="top" align="center">0.32</td>
</tr>
<tr>
<td valign="top" align="left" colspan="4"><bold>Smoking [</bold><italic><bold>n</bold></italic> <bold>(%)]</bold></td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Current</td>
<td valign="top" align="center">363 (15.7)</td>
<td valign="top" align="center">76 (13.8)</td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Former</td>
<td valign="top" align="center">595 (25.8)</td>
<td valign="top" align="center">115 (20.9)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Non-smoker</td>
<td valign="top" align="center">1,224 (53.1)</td>
<td valign="top" align="center">328 (59.6)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Missing or other</td>
<td valign="top" align="center">123 (5.3)</td>
<td valign="top" align="center">31 (5.6)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Any prior hospitalization [<italic>n</italic> (%)]</td>
<td valign="top" align="center">1,428 (61.0)</td>
<td valign="top" align="center">324 (58.9)</td>
<td valign="top" align="center">0.37</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Median initial NIHSS score (IQR)</td>
<td valign="top" align="center">2 (0&#x02013;6)</td>
<td valign="top" align="center">2 (0&#x02013;6)</td>
<td valign="top" align="center">0.09</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Median initial GCS (IQR)</td>
<td valign="top" align="center">15 (14&#x02013;15)</td>
<td valign="top" align="center">15 (14&#x02013;15)</td>
<td valign="top" align="center">0.10</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Missing [<italic>n</italic> (%)]</td>
<td valign="top" align="center">83 (3.6)</td>
<td valign="top" align="center">22 (4)</td>
<td valign="top" align="center">0.65</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Intensive care unit stay [<italic>n</italic> (%)]</td>
<td valign="top" align="center">1,166 (50.6)</td>
<td valign="top" align="center">306 (55.64)</td>
<td valign="top" align="center">0.04</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Inhospital pneumonia [<italic>n</italic> (%)]</td>
<td valign="top" align="center">108 (4.7)</td>
<td valign="top" align="center">24 (4.4)</td>
<td valign="top" align="center">0.76</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Mechanical ventilation [<italic>n</italic> (%)]</td>
<td valign="top" align="center">226 (9.8)</td>
<td valign="top" align="center">49 (8.9)</td>
<td valign="top" align="center">0.52</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Gastrostomy [<italic>n</italic> (%)]</td>
<td valign="top" align="center">153 (6.6)</td>
<td valign="top" align="center">35 (6.3)</td>
<td valign="top" align="center">0.80</td>
</tr>
<tr>
<td valign="top" align="left" colspan="4"><bold>Discharge destination [</bold><italic><bold>n</bold></italic> <bold>(%)]</bold></td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Home</td>
<td valign="top" align="center">1,659 (72.0)</td>
<td valign="top" align="center">350 (63.6)</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Acute inpatient rehabilitation</td>
<td valign="top" align="center">429 (18.6)</td>
<td valign="top" align="center">148 (26.9)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Skilled nursing facility or long-term facility</td>
<td valign="top" align="center">153 (6.6)</td>
<td valign="top" align="center">33 (6)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Other hospital or against medical advice</td>
<td valign="top" align="center">64 (2.8)</td>
<td valign="top" align="center">19 (3.45)</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Any unplanned readmission within 30 days [<italic>n</italic> (%)]</td>
<td valign="top" align="center">337 (14.6)</td>
<td valign="top" align="center">62 (11.5)</td>
<td valign="top" align="center">0.04</td>
</tr>
<tr>
<td valign="top" align="left">&#x000A0;&#x000A0;&#x000A0;Stroke readmission within 30 days [<italic>n</italic> (%)]</td>
<td valign="top" align="center">124 (5.4)</td>
<td valign="top" align="center">24 (4.5)</td>
<td valign="top" align="center">0.33</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In the training cohort, there were 337 patients (14.6%) with all-cause readmission within 30 days and 124 patients (5.4%) with stroke readmission within 30 days. In the testing cohort, there were 62 patients (11.3%) with all-cause readmission within 30 days and 24 patients (4.4%) with stroke readmission within 30 days. We collected &#x0007E;28,500 different patient reports for the training dataset and 6,606 reports for the test dataset. We extracted 35 discrete features, 250 principal component features, 400 word-vector features, and 200 bag of words features for all patients in both cohorts.</p>
<p>For all-cause readmission (<xref ref-type="fig" rid="F3">Figure 3</xref>), a logistic regression model using discrete features had an AUC of 0.58 (95% CI, 0.57&#x02013;0.59). In comparison, XGBoost outperformed logistic regression using the same discrete features with an AUC of 0.62 (95% CI, 0.61&#x02013;0.63). Using NLP-based features, we obtained similar results, with XGBoost performing best with bag of words features (AUC, 0.61; 95% CI, 0.60&#x02013;0.62), logistic regression performing best with PCA features (AUC, 0.61; 95% CI, 0.59&#x02013;0.62), and XGBoost performing best with word-vector-based features (AUC, 0.60; 95% CI, 0.59&#x02013;0.61). Ensemble models performed best, using either the feature ensemble method (AUC, 0.64; 95% CI, 0.62&#x02013;0.66) or the classifier ensemble method (AUC, 0.65; 95% CI, 0.62&#x02013;0.66). We applied the trained classifier ensemble model to the test dataset with bootstrapping over 50 iterations, which resulted in an AUC of 0.64 (95% CI, 0.63&#x02013;0.65).</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Comparison of models to predict 30-day all-cause readmissions.</p></caption>
<graphic xlink:href="fneur-12-649521-g0003.tif"/>
</fig>
<p>We obtained similar results for 30-day stroke readmissions (<xref ref-type="fig" rid="F4">Figure 4</xref>). Logistic regression with discrete features formed a modest baseline with an AUC of 0.52 (95% CI, 0.51&#x02013;0.53). XGBoost outperformed logistic regression using discrete features alone with an AUC of 0.58 (95% CI, 0.56&#x02013;0.59). The models using the best NLP-based features produced AUCs of 0.61 (95% CI, 0.59&#x02013;0.63), 0.60 (95% CI, 0.59&#x02013;0.62), and 0.58 (95% CI, 0.57&#x02013;0.59) for bag of words features, PCA features, and word-vector features, respectively. Ensemble methods were again the best performing models, with AUCs of 0.63 (95% CI, 0.60&#x02013;0.65) and 0.64 (95% CI, 0.62&#x02013;0.66) for the feature ensemble and classifier ensemble models, respectively. When applied to the test set, the classifier ensemble yielded an AUC of 0.62 (95% CI, 0.61&#x02013;0.63).</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Comparison of models to predict 30-day stroke readmissions.</p></caption>
<graphic xlink:href="fneur-12-649521-g0004.tif"/>
</fig>
<p>Some of the NLP features that were ranked higher in importance by the model were as follows: &#x0201C;stenosis,&#x0201D; &#x0201C;encephalomalacia,&#x0201D; &#x0201C;craniectomy,&#x0201D; &#x0201C;mild calcified atherosclerotic,&#x0201D; &#x0201C;hypoattenuation white matter,&#x0201D; and &#x0201C;chiari ii malformation.&#x0201D;</p></sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>Given the burden of readmission on the patient and the healthcare system, improving prediction of readmissions with a goal of preventing them is of major importance. A prior study estimated that the cost to Medicare of unplanned rehospitalizations in 2004 was $17.4 billion (<xref ref-type="bibr" rid="B25">25</xref>). Readmission to the hospital within 30 days after stroke is also associated with 1-year mortality and serves as a quality metric across specialties under the guidance of the Affordable Care Act (<xref ref-type="bibr" rid="B3">3</xref>, <xref ref-type="bibr" rid="B26">26</xref>).</p>
<p>Currently, clinician judgment and simple mathematical models are able to only modestly predict readmission after stroke. In our study, the baseline model that used logistic regression and discrete variables resulted in poor discrimination of 30-day readmission, a result that is consistent with prior studies (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B8">8</xref>). While NLP-enhanced ML models advance conventional approaches, further improvement is necessary before these predictive models can be implemented in practice given the weak discrimination. Our finding is similar to another study using machine learning in readmission after heart failure (<xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>Given the challenges in accurate prediction of 30-day readmission even using modern machine learning approaches, grading and penalizing hospitals on this metric may not be justifiable. Indeed, hospitals may be forced to &#x0201C;game&#x0201D; the system by increasing observation status visits to avoid penalties, at the cost of increased mortality, as a recent study in heart failure patients found (<xref ref-type="bibr" rid="B27">27</xref>). Therefore, the penalties facing hospitals seem misguided until such a time when readmission prediction is more robust.</p>
<p>Machine learning is able to weigh the interactions between complex variables in additive analysis to produce better prediction models. In addition, the use of NLP in medicine may be revolutionary. Untangling the complex data within clinical notes and other non-discrete and unstructured data could be valuable in tackling a myriad of research questions. Our advanced models could further ongoing machine learning efforts across specialties to better identify patients for clinical trials, radiologic findings in neurologic emergencies, dermatologic-related malignancies, automatic infectious disease prediction in the emergency room, and outcomes in psychiatric admissions (<xref ref-type="bibr" rid="B28">28</xref>&#x02013;<xref ref-type="bibr" rid="B32">32</xref>).</p>
<p>The strengths of our study include a five-fold cross-validation technique to avoid overfitting. The internal validity of our results was further tested by obtaining a second dataset not used in the derivation and validation steps. We also bootstrapped the test dataset over 50 iterations to generate confidence intervals. Our study, however, has limitations. ML algorithms are limited by the data that are fed into them, such that data not commonly reflected in the EHR, such as psychosocial factors, post-discharge care coordination, detailed social support post-hospitalization, and post-stroke rehabilitation care, are not accounted for in our study. Prior studies suggest that including post-acute care data improves prediction of readmission (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B33">33</xref>). Healthcare systems across the country are heterogeneous, and the variables we used may be non-uniformly available at other hospitals. External validation of our results is necessary. An additional limitation of a single-center cohort is the potential for incomplete follow-up (e.g., care fragmentation leading to admission at another hospital in the region), resulting in an underestimation of readmission rates. However, a recent Chicago multihospital study noted a low rate of care fragmentation (<xref ref-type="bibr" rid="B34">34</xref>). There are several differences between the two datasets: the training dataset, as it was later chronologically, reflected changes in the health system and stroke program. These differences may result in error in trained model validation. However, this temporal separation does provide some measure of external validation, as the model performed well. Nevertheless, formal external validation of the model is recommended. In addition, these algorithms require large-volume, structured pools of data. Approximately 80% of EHR data is composed of provider notes. Our use of NLP provided a tool for deconstructing these language blocks; however, sufficient time is required to design and train these programs (<xref ref-type="bibr" rid="B9">9</xref>). Lastly, these programs lack the clinical insight that is essential for unsupervised implementation, and with any &#x0201C;black box&#x0201D; program, results must be interpreted cautiously (<xref ref-type="bibr" rid="B11">11</xref>).</p></sec>
<sec id="s5">
<title>Summary</title>
<p>In summary, we demonstrated a modest added utility of NLP-enhanced ML algorithms to improve prediction of 30-day readmission after stroke hospitalization compared with conventional statistical approaches using discrete predictors alone. While these results are encouraging, further work is required. Given the challenges in predicting readmission after stroke even using the most advanced techniques, the current penalties applied to hospitals for unplanned readmissions should be reevaluated.</p></sec>
<sec sec-type="data-availability-statement" id="s6">
<title>Data Availability Statement</title>
<p>The datasets presented in this article are not readily available because the code and data contain Protected Health Information (PHI). Requests to access the datasets should be directed to the corresponding author.</p></sec>
<sec id="s7">
<title>Author Contributions</title>
<p>Statistical analysis was done by RG. All authors contributed to the article and approved the submitted version.</p></sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Benjamin</surname> <given-names>EJ</given-names></name> <name><surname>Virani</surname> <given-names>SS</given-names></name> <name><surname>Callaway</surname> <given-names>CW</given-names></name> <name><surname>Chamberlain</surname> <given-names>AM</given-names></name> <name><surname>Chang</surname> <given-names>AR</given-names></name> <name><surname>Cheng</surname> <given-names>S</given-names></name> <etal/></person-group>. <article-title>Heart disease and stroke statistics-2018 update: a report from the American Heart Association</article-title>. <source>Circulation</source>. (<year>2018</year>) <volume>137</volume>:<fpage>e67</fpage>&#x02013;<lpage>492</lpage>. <pub-id pub-id-type="doi">10.1161/CIR.0000000000000573</pub-id><pub-id pub-id-type="pmid">29555722</pub-id></citation></ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>G</given-names></name> <name><surname>Zhang</surname> <given-names>Z</given-names></name> <name><surname>Ayala</surname> <given-names>C</given-names></name> <name><surname>Dunet</surname> <given-names>DO</given-names></name> <name><surname>Fang</surname> <given-names>J</given-names></name> <name><surname>George</surname> <given-names>MG</given-names></name></person-group>. <article-title>Costs of hospitalization for stroke patients aged 18-64 years in the United States</article-title>. <source>J Stroke Cerebrovasc Dis</source>. (<year>2014</year>) <volume>23</volume>:<fpage>861</fpage>&#x02013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1016/j.jstrokecerebrovasdis.2013.07.017</pub-id><pub-id pub-id-type="pmid">23954598</pub-id></citation></ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kind</surname> <given-names>AJ</given-names></name> <name><surname>Smith</surname> <given-names>MA</given-names></name> <name><surname>Liou</surname> <given-names>JI</given-names></name> <name><surname>Pandhi</surname> <given-names>N</given-names></name> <name><surname>Frytak</surname> <given-names>JR</given-names></name> <name><surname>Finch</surname> <given-names>MD</given-names></name></person-group>. <article-title>The price of bouncing back: one-year mortality and payments for acute stroke patients with 30-day bounce-backs</article-title>. <source>J Am Geriatr Soc</source>. (<year>2008</year>) <volume>56</volume>:<fpage>999</fpage>&#x02013;<lpage>1005</lpage>. <pub-id pub-id-type="doi">10.1111/j.1532-5415.2008.01693.x</pub-id><pub-id pub-id-type="pmid">18422948</pub-id></citation></ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kansagara</surname> <given-names>D</given-names></name> <name><surname>Englander</surname> <given-names>H</given-names></name> <name><surname>Salanitro</surname> <given-names>A</given-names></name> <name><surname>Kagen</surname> <given-names>D</given-names></name> <name><surname>Theobald</surname> <given-names>C</given-names></name> <name><surname>Freeman</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Risk prediction models for hospital readmission: a systematic review</article-title>. <source>JAMA</source>. (<year>2011</year>) <volume>306</volume>:<fpage>1688</fpage>&#x02013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.1001/jama.2011.1515</pub-id><pub-id pub-id-type="pmid">32269037</pub-id></citation></ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fehnel</surname> <given-names>CR</given-names></name> <name><surname>Lee</surname> <given-names>Y</given-names></name> <name><surname>Wendell</surname> <given-names>LC</given-names></name> <name><surname>Thompson</surname> <given-names>BB</given-names></name> <name><surname>Potter</surname> <given-names>NS</given-names></name> <name><surname>Mor</surname> <given-names>V</given-names></name></person-group>. <article-title>Post-acute care data for predicting readmission after ischemic stroke: a nationwide cohort analysis using the minimum data set</article-title>. <source>J Am Heart Assoc</source>. (<year>2015</year>) <volume>4</volume>:<fpage>e002145</fpage>. <pub-id pub-id-type="doi">10.1161/JAHA.115.002145</pub-id><pub-id pub-id-type="pmid">26396202</pub-id></citation></ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Burke</surname> <given-names>JF</given-names></name> <name><surname>Skolarus</surname> <given-names>LE</given-names></name> <name><surname>Adelman</surname> <given-names>EE</given-names></name> <name><surname>Reeves</surname> <given-names>MJ</given-names></name> <name><surname>Brown</surname> <given-names>DL</given-names></name></person-group>. <article-title>Influence of hospital-level practices on readmission after ischemic stroke</article-title>. <source>Neurology</source>. (<year>2014</year>) <volume>82</volume>:<fpage>2196</fpage>&#x02013;<lpage>204</lpage>. <pub-id pub-id-type="doi">10.1212/WNL.0000000000000514</pub-id><pub-id pub-id-type="pmid">25601885</pub-id></citation></ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lichtman</surname> <given-names>JH</given-names></name> <name><surname>Leifheit-Limson</surname> <given-names>EC</given-names></name> <name><surname>Jones</surname> <given-names>SB</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Goldstein</surname> <given-names>LB</given-names></name></person-group>. <article-title>Preventable readmissions within 30 days of ischemic stroke among Medicare beneficiaries</article-title>. <source>Stroke</source>. (<year>2013</year>) <volume>44</volume>:<fpage>3429</fpage>&#x02013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1161/STROKEAHA.113.003165</pub-id><pub-id pub-id-type="pmid">24172581</pub-id></citation></ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fonarow</surname> <given-names>GC</given-names></name> <name><surname>Smith</surname> <given-names>EE</given-names></name> <name><surname>Reeves</surname> <given-names>MJ</given-names></name> <name><surname>Pan</surname> <given-names>W</given-names></name> <name><surname>Olson</surname> <given-names>D</given-names></name> <name><surname>Hernandez</surname> <given-names>AF</given-names></name> <etal/></person-group>. <article-title>Hospital-level variation in mortality and rehospitalization for Medicare beneficiaries with acute ischemic stroke</article-title>. <source>Stroke</source>. (<year>2011</year>) <volume>42</volume>:<fpage>159</fpage>&#x02013;<lpage>66</lpage>. <pub-id pub-id-type="doi">10.1161/STROKEAHA.110.601831</pub-id><pub-id pub-id-type="pmid">21566229</pub-id></citation></ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Artetxe</surname> <given-names>A</given-names></name> <name><surname>Beristain</surname> <given-names>A</given-names></name> <name><surname>Grana</surname> <given-names>M</given-names></name></person-group>. <article-title>Predictive models for hospital readmission risk: a systematic review of methods</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2018</year>) <volume>164</volume>:<fpage>149</fpage>&#x02013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2018.06.006</pub-id><pub-id pub-id-type="pmid">30195431</pub-id></citation></ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nasrabadi</surname> <given-names>NM</given-names></name></person-group>. <article-title>Pattern recognition and machine learning</article-title>. <source>J Electron Imaging</source>. (<year>2007</year>) <volume>16</volume>:<fpage>049901</fpage>. <pub-id pub-id-type="doi">10.1117/1.2819119</pub-id></citation></ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Golas</surname> <given-names>SB</given-names></name> <name><surname>Shibahara</surname> <given-names>T</given-names></name> <name><surname>Agboola</surname> <given-names>S</given-names></name> <name><surname>Otaki</surname> <given-names>H</given-names></name> <name><surname>Sato</surname> <given-names>J</given-names></name> <name><surname>Nakae</surname> <given-names>T</given-names></name> <etal/></person-group>. <article-title>A machine learning model to predict the risk of 30-day readmissions in patients with heart failure: a retrospective analysis of electronic medical records data</article-title>. <source>BMC Med Inform Decis Mak</source>. (<year>2018</year>) <volume>18</volume>:<fpage>44</fpage>. <pub-id pub-id-type="doi">10.1186/s12911-018-0620-z</pub-id><pub-id pub-id-type="pmid">29929496</pub-id></citation></ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shameer</surname> <given-names>K</given-names></name> <name><surname>Johnson</surname> <given-names>KW</given-names></name> <name><surname>Yahi</surname> <given-names>A</given-names></name> <name><surname>Miotto</surname> <given-names>R</given-names></name> <name><surname>Li</surname> <given-names>LI</given-names></name> <name><surname>Ricks</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Predictive modeling of hospital readmission rates using electronic medical record-wide machine learning: a case-study using Mount Sinai heart failure cohort</article-title>. <source>Biocomputing</source>. (<year>2017</year>) <volume>22</volume>:<fpage>276</fpage>&#x02013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1142/9789813207813_0027</pub-id><pub-id pub-id-type="pmid">27896982</pub-id></citation></ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guyon</surname> <given-names>I</given-names></name> <name><surname>Elisseeff</surname> <given-names>A</given-names></name></person-group>. <article-title>An introduction to variable and feature selection</article-title>. <source>J Mach Learning Res</source>. (<year>2003</year>) <volume>3</volume>:<fpage>1157</fpage>&#x02013;<lpage>82</lpage>.</citation></ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Abdi</surname> <given-names>H</given-names></name> <name><surname>Williams</surname> <given-names>LJ</given-names></name></person-group>. <article-title>Principal component analysis</article-title>. <source>Wiley Interdisc Rev Comput Stat</source>. (<year>2010</year>) <volume>2</volume>:<fpage>433</fpage>&#x02013;<lpage>59</lpage>. <pub-id pub-id-type="doi">10.1002/wics.101</pub-id></citation></ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mikolov</surname> <given-names>T</given-names></name> <name><surname>Sutskever</surname> <given-names>I</given-names></name> <name><surname>Chen</surname> <given-names>K</given-names></name> <name><surname>Corrado</surname> <given-names>GS</given-names></name> <name><surname>Dean</surname> <given-names>J</given-names></name></person-group>. <article-title>Distributed representations of words and phrases and their compositionality</article-title>. In: <source>Advances in Neural Information Processing Systems</source>. Lake Tahoe, NV (<year>2013</year>).</citation></ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rehurek</surname> <given-names>R</given-names></name> <name><surname>Sojka</surname> <given-names>P</given-names></name></person-group>. <article-title>Gensim-Statistical Semantics in Python</article-title>. 2011.</citation></ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hosmer</surname> <given-names>DW</given-names> <suffix>Jr.</suffix></name> <name><surname>Lemeshow</surname> <given-names>S</given-names></name> <name><surname>Sturdivant</surname> <given-names>RX</given-names></name></person-group>. <source>Applied Logistic Regression</source>. <publisher-loc>Hoboken, NJ</publisher-loc>: <publisher-name>John Wiley &#x00026; Sons</publisher-name> (<year>2013</year>).</citation></ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liaw</surname> <given-names>A</given-names></name> <name><surname>Wiener</surname> <given-names>M</given-names></name></person-group>. <article-title>Classification and regression by random forest</article-title>. <source>R News</source>. (<year>2002</year>) <volume>2</volume>:<fpage>18</fpage>&#x02013;<lpage>22</lpage>.</citation></ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cortes</surname> <given-names>C</given-names></name> <name><surname>Vapnik</surname> <given-names>V</given-names></name></person-group>. <article-title>Support-vector networks</article-title>. <source>Machine Learn</source>. (<year>1995</year>) <volume>20</volume>:<fpage>273</fpage>&#x02013;<lpage>97</lpage>. <pub-id pub-id-type="doi">10.1007/BF00994018</pub-id></citation></ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Friedman</surname> <given-names>JH</given-names></name></person-group>. <article-title>Greedy function approximation: a gradient boosting machine</article-title>. <source>Ann Stat</source>. (<year>2001</year>) <volume>29</volume>:<fpage>1189</fpage>&#x02013;<lpage>232</lpage>. <pub-id pub-id-type="doi">10.1214/aos/1013203451</pub-id></citation></ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T</given-names></name> <name><surname>Guestrin</surname> <given-names>C</given-names></name></person-group>. <article-title>Xgboost: a scalable tree boosting system</article-title>. In: <source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source>. San Francisco, CA (<year>2016</year>).</citation></ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wolpert</surname> <given-names>DH</given-names></name></person-group>. <article-title>Stacked generalization</article-title>. <source>Neural Netw</source>. (<year>1992</year>) <volume>5</volume>:<fpage>241</fpage>&#x02013;<lpage>59</lpage>. <pub-id pub-id-type="doi">10.1016/S0893-6080(05)80023-1</pub-id></citation></ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kohavi</surname> <given-names>R</given-names></name></person-group>. <article-title>A study of cross-validation and bootstrap for accuracy estimation and model selection</article-title>. In: <source>Ijcai</source>. San Francisco, CA (<year>1995</year>).</citation></ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bergstra</surname> <given-names>J</given-names></name> <name><surname>Yamins</surname> <given-names>D</given-names></name> <name><surname>Cox</surname> <given-names>DD</given-names></name></person-group>. <article-title>Hyperopt: a python library for optimizing the hyperparameters of machine learning algorithms</article-title>. In: <source>Proceedings of the 12th Python in Science Conference</source>. Austin, TX (<year>2013</year>).</citation></ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jencks</surname> <given-names>SF</given-names></name> <name><surname>Williams</surname> <given-names>MV</given-names></name> <name><surname>Coleman</surname> <given-names>EA</given-names></name></person-group>. <article-title>Rehospitalizations among patients in the Medicare fee-for-service program</article-title>. <source>N Engl J Med</source>. (<year>2009</year>) <volume>360</volume>:<fpage>1418</fpage>&#x02013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMsa0803563</pub-id><pub-id pub-id-type="pmid">19610166</pub-id></citation></ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="web"><person-group person-group-type="author"><collab>Centers for Medicare &#x00026; Medicaid Services</collab></person-group>. <source>Centers for Medicare &#x00026; Medicaid Services: Readmissions Reduction Program</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://www.cms.gov/Medicare/Medicare-Fee-for-Service-Payment/AcuteInpatientPPS/Readmissions-Reduction-Program.html">http://www.cms.gov/Medicare/Medicare-Fee-for-Service-Payment/AcuteInpatientPPS/Readmissions-Reduction-Program.html</ext-link> (accessed 2019).</citation></ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname> <given-names>A</given-names></name> <name><surname>Allen</surname> <given-names>LA</given-names></name> <name><surname>Bhatt</surname> <given-names>DL</given-names></name> <name><surname>Cox</surname> <given-names>M</given-names></name> <name><surname>DeVore</surname> <given-names>AD</given-names></name> <name><surname>Heidenreich</surname> <given-names>PA</given-names></name> <etal/></person-group>. <article-title>Association of the Hospital Readmissions Reduction Program Implementation With Readmission and Mortality Outcomes in Heart Failure</article-title>. <source>JAMA Cardiol</source>. (<year>2018</year>) <volume>3</volume>:<fpage>44</fpage>&#x02013;<lpage>53</lpage>. <pub-id pub-id-type="doi">10.1001/jamacardio.2017.4265</pub-id><pub-id pub-id-type="pmid">29128869</pub-id></citation></ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Miotto</surname> <given-names>R</given-names></name> <name><surname>Weng</surname> <given-names>C</given-names></name></person-group>. <article-title>Case-based reasoning using electronic health records efficiently identifies eligible patients for clinical trials</article-title>. <source>J Am Med Inform Assoc</source>. (<year>2015</year>) <volume>22</volume>:<fpage>e141</fpage>&#x02013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1093/jamia/ocu050</pub-id><pub-id pub-id-type="pmid">25769682</pub-id></citation></ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Titano</surname> <given-names>JJ</given-names></name> <name><surname>Badgeley</surname> <given-names>M</given-names></name> <name><surname>Schefflein</surname> <given-names>J</given-names></name> <name><surname>Pain</surname> <given-names>M</given-names></name> <name><surname>Su</surname> <given-names>A</given-names></name> <name><surname>Cai</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Automated deep-neural-network surveillance of cranial images for acute neurologic events</article-title>. <source>Nat Med</source>. (<year>2018</year>) <volume>24</volume>:<fpage>1337</fpage>&#x02013;<lpage>41</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-018-0147-y</pub-id><pub-id pub-id-type="pmid">30104767</pub-id></citation></ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Esteva</surname> <given-names>A</given-names></name> <name><surname>Kuprel</surname> <given-names>B</given-names></name> <name><surname>Novoa</surname> <given-names>RA</given-names></name> <name><surname>Ko</surname> <given-names>J</given-names></name> <name><surname>Swetter</surname> <given-names>SM</given-names></name> <name><surname>Blau</surname> <given-names>HM</given-names></name> <etal/></person-group>. <article-title>Dermatologist-level classification of skin cancer with deep neural networks</article-title>. <source>Nature</source>. (<year>2017</year>) <volume>542</volume>:<fpage>115</fpage>&#x02013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1038/nature21056</pub-id><pub-id pub-id-type="pmid">28658222</pub-id></citation></ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rumshisky</surname> <given-names>A</given-names></name> <name><surname>Ghassemi</surname> <given-names>M</given-names></name> <name><surname>Naumann</surname> <given-names>T</given-names></name> <name><surname>Szolovits</surname> <given-names>P</given-names></name> <name><surname>Castro</surname> <given-names>VM</given-names></name> <name><surname>McCoy</surname> <given-names>TH</given-names></name> <etal/></person-group>. <article-title>Predicting early psychiatric readmission with natural language processing of narrative discharge summaries</article-title>. <source>Transl Psychiatry</source>. (<year>2016</year>) <volume>6</volume>:<fpage>e921</fpage>. <pub-id pub-id-type="doi">10.1038/tp.2015.182</pub-id><pub-id pub-id-type="pmid">27754482</pub-id></citation></ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tou</surname> <given-names>H</given-names></name> <name><surname>Yao</surname> <given-names>L</given-names></name> <name><surname>Wei</surname> <given-names>Z</given-names></name> <name><surname>Zhuang</surname> <given-names>X</given-names></name> <name><surname>Zhang</surname> <given-names>B</given-names></name></person-group>. <article-title>Automatic infection detection based on electronic medical records</article-title>. <source>BMC Bioinform</source>. (<year>2018</year>) <volume>19</volume>:<fpage>117</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-018-2101-x</pub-id><pub-id pub-id-type="pmid">29671399</pub-id></citation></ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Slocum</surname> <given-names>C</given-names></name> <name><surname>Gerrard</surname> <given-names>P</given-names></name> <name><surname>Black-Schaffer</surname> <given-names>R</given-names></name> <name><surname>Goldstein</surname> <given-names>R</given-names></name> <name><surname>Singhal</surname> <given-names>A</given-names></name> <name><surname>DiVita</surname> <given-names>MA</given-names></name> <etal/></person-group>. <article-title>Functional status predicts acute care readmissions from inpatient rehabilitation in the stroke population</article-title>. <source>PLoS ONE</source>. (<year>2015</year>) <volume>10</volume>:<fpage>e0142180</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0142180</pub-id><pub-id pub-id-type="pmid">26599009</pub-id></citation></ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Galanter</surname> <given-names>WL</given-names></name> <name><surname>Applebaum</surname> <given-names>A</given-names></name> <name><surname>Boddipalli</surname> <given-names>V</given-names></name> <name><surname>Kho</surname> <given-names>A</given-names></name> <name><surname>Lin</surname> <given-names>M</given-names></name> <name><surname>Meltzer</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Migration of patients between five urban teaching hospitals in Chicago</article-title>. <source>J Med Syst</source>. (<year>2013</year>) <volume>37</volume>:<fpage>9930</fpage>. <pub-id pub-id-type="doi">10.1007/s10916-013-9930-y</pub-id><pub-id pub-id-type="pmid">23381645</pub-id></citation></ref>
</ref-list> 
</back>
</article>