<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">771435</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2021.771435</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Contextualizing Genes by Using Text-Mined Co-Occurrence Features for Cancer Gene Panel Discovery</article-title>
<alt-title alt-title-type="left-running-head">Chen et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Text Mining for Cancer Gene Panel</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Hui-O</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Lin</surname>
<given-names>Peng-Chan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/923314/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Chen-Ruei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Chi-Shiang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1475813/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chiang</surname>
<given-names>Jung-Hsien</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Department of Computer Science and Information Engineering, College of Electrical Engineering and Computer Science, National Cheng Kung University, <addr-line>Tainan</addr-line>, <country>Taiwan</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Institute of Medical Informatics, National Cheng Kung University, <addr-line>Tainan</addr-line>, <country>Taiwan</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Department of Oncology, National Cheng Kung University Hospital, College of Medicine, National Cheng Kung University, <addr-line>Tainan</addr-line>, <country>Taiwan</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>Department of Genomic Medicine, National Cheng Kung University Hospital, College of Medicine, National Cheng Kung University, <addr-line>Tainan</addr-line>, <country>Taiwan</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/865211/overview">Ying Li</ext-link>, Zhejiang University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/137287/overview">Rafael Rosell</ext-link>, Catalan Institute of Oncology, Spain</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1041828/overview">Ehsan Nazemalhosseini-Mojarad</ext-link>, Shahid Beheshti University of Medical Sciences,&#x20;Iran</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1502038/overview">Ruan Maomei</ext-link>, Shanghai Chest Hospital, Shanghai Jiaotong University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Peng-Chan Lin, <email>pengchanlin@gmail.com</email>; Jung-Hsien Chiang, <email>jchiang@mail.ncku.edu.tw</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Computational Genomics, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>25</day>
<month>10</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>771435</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>09</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>10</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Chen, Lin, Liu, Wang and Chiang.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Chen, Lin, Liu, Wang and Chiang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Developing a biomedical-explainable and validatable text mining pipeline can help in cancer gene panel discovery. We create a pipeline that can contextualize genes by using text-mined co-occurrence features. We apply Biomedical Natural Language Processing (BioNLP) techniques for literature mining in the cancer gene panel. A literature-derived 4,679 &#xd7; 4,630 gene term-feature matrix was built. The <italic>EGFR</italic> L858R and T790M, and <italic>BRAF</italic> V600E genetic variants are important mutation term features in text mining and are frequently mutated in cancer. We validate the cancer gene panel by the mutational landscape of different cancer types. The cosine similarity of gene frequency between text mining and a statistical result from clinical sequencing data is 80.8%. In different machine learning models, the best accuracy for the prediction of two different gene panels, MSK-IMPACT (Memorial Sloan Kettering-Integrated Mutation Profiling of Actionable Cancer Targets) and the Oncomine cancer gene panel, is 0.959 and 0.989, respectively. The receiver operating characteristic (ROC) curve analysis confirmed that the neural net model has a better prediction performance (Area under the ROC curve (AUC) &#x3d; 0.992). The use of text-mined co-occurrence features can contextualize each gene. We believe the approach can be used to evaluate several existing gene panels, and we show that part of a gene panel set can be used to predict the remaining genes for cancer discovery.</p>
</abstract>
<kwd-group>
<kwd>biomedical natural language processing</kwd>
<kwd>machine learning</kwd>
<kwd>topic modeling</kwd>
<kwd>cancer gene panel</kwd>
<kwd>text mining</kwd>
</kwd-group>
<contract-sponsor id="cn001">Ministry of Science and Technology, Taiwan<named-content content-type="fundref-id">10.13039/501100004663</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Ministry of Health and Family Welfare<named-content content-type="fundref-id">10.13039/501100007334</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Scientific articles provide text mining (TM) applications in cancer biology (<xref ref-type="bibr" rid="B47">Zhu et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B3">Azam et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B38">Wang et&#x20;al., 2020</xref>). Several solutions are currently available to meet the growing need for different cancer gene panels. Several commercial gene panels constitute a &#x201c;one-size-fits-all&#x201d; solution. In a clinical investigation, we need to design gene panels specifically tailored for particular questions or individual cancers (<xref ref-type="bibr" rid="B15">Hyman et&#x20;al., 2015</xref>). The precision of the designed panel for different tumors plays an important role. They rely on literature reviews and cancer genomics databases. The reason for selecting somatic and germline mutation profiling is also complicated. Emerging TM techniques such as Gene2Vec offer some answers to information interpreting problems. Gene2Vec is a study that explored the idea of gene embedding, distributed representation of genes, in the spirit of word embedding (<xref ref-type="bibr" rid="B9">Demeester et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B11">Du et&#x20;al., 2019</xref>). However, we cannot explain the biomedical meaning of the vector in the neural embedding model. The goal of explainability is very important and would be very useful. The ability to provide additional gene suggestions for a gene panel with an explanation would be hugely valuable but also really challenging. Therefore, we developed a biomedical-explainable and validatable text mining pipeline for cancer gene panel discovery.</p>
<p>Firstly, we find a system for predicting genes and interesting applications for a gene panel discovery. The use of text-mined co-occurrence features for each gene can contextualize each gene and serve as input for a machine learning system. We extract NER names mentioned in the literature, such as gene NER (<xref ref-type="bibr" rid="B18">Leaman et&#x20;al., 2013</xref>) and disease NER (<xref ref-type="bibr" rid="B41">Wei et&#x20;al., 2013</xref>). The use of PubTator (<xref ref-type="bibr" rid="B39">Wang et&#x20;al., 2016</xref>) along with MeSH (<xref ref-type="bibr" rid="B16">Ikonomakis et&#x20;al., 2005</xref>) is a good way of obtaining good enrichment for biomedically relevant terms. The term frequency-inverse document frequency (TF-IDF) was used to construct the document-term matrix (<xref ref-type="bibr" rid="B16">Ikonomakis et&#x20;al., 2005</xref>). Machine learning-based and biomedical-explainable approaches have recently become the most popular approaches in the study of the document-term matrix. For example, M. Ikonomakis et&#x20;al. introduced several machine learning (ML) algorithms applied to text classification such as na&#xef;ve-Bayes, decision trees, neural networks, nearest neighbors, and support vector machines (<xref ref-type="bibr" rid="B10">Devarajan et&#x20;al., 2015</xref>). Wei Xu et&#x20;al. proposed a novel document-clustering method based on non-negative matrix factorization (<xref ref-type="bibr" rid="B8">Choo et&#x20;al., 2013</xref>). Choo et&#x20;al. presented a user-driven topic modeling based on interactive non-negative matrix factorization capable of tuning the topic model result by integrating user interactions (<xref ref-type="bibr" rid="B26">Pedregosa et&#x20;al., 2011</xref>). Summarizing the abovementioned studies, we established a fully integrated text mining pipeline to find the gene term-feature, mutational landscape heatmap, and cancer information&#x20;topic.</p>
<p>With next-generation sequencing (NGS) technologies (<xref ref-type="bibr" rid="B33">Shabani Azim et&#x20;al., 2018</xref>), many targeted panels have been developed to detect hereditary cancer and monitor somatic mutation changes in progressive cancer (<xref ref-type="bibr" rid="B21">McCabe et&#x20;al., 2019</xref>). The Memorial Sloan Kettering Cancer Center has developed MSK-IMPACT (Memorial Sloan Kettering-Integrated Mutation Profiling of Actionable Cancer Targets), a hybridization capture-based next-generation sequencing assay for deep target sequencing of all exons and selected introns of 410 essential cancer genes in tumors (<xref ref-type="bibr" rid="B15">Hyman et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B7">Cheng et&#x20;al., 2015</xref>). The MSK-IMPACT panel performed well not only in the above study but also in a large-scale clinical sequencing project with more than 10,000 patients (<xref ref-type="bibr" rid="B45">Zehir et&#x20;al., 2017</xref>). They provided a comprehensive gene panel database including actionable drug targets, cancer susceptibility genes in hematological malignancies, and solid tumors. For solid tumors, the Oncomine Cancer Panel (OCP) is only used for the clinical screening of actionable genetic mutations in solid tumors (<xref ref-type="bibr" rid="B19">Luthra et&#x20;al., 2017</xref>). They significantly provide druggable target databases. We validate the biomedical literature mining through the MSK-IMPACT or OCP cancer gene panel NGS database.</p>
<p>We create a pipeline that can suggest additional genes for a gene panel given an existing set of genes. We also believe the approach can be used to evaluate several existing gene panels, and we show that part of a gene panel set can be used to predict the remaining&#x20;genes.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and Methods</title>
<sec id="s2-1">
<title>PUBMED</title>
<p>PubMed, a free database of more than 30 million literature citations for biomedical literature, includes the fields of biomedicine and health. We extracted the abstracts that mentioned genes related to human cancer from PubMed and took the gene&#x2019;s context by gene window.</p>
</sec>
<sec id="s2-2">
<title>Machine Learning Model and Analysis</title>
<p>K nearest neighbors, linear support vector machine (SVM), Gaussian process, decision tree, random forest, neural net, and naive Bayes were used to conduct supervised machine learning. All the models were built in Python with the scikit-learn package and evaluated using five-fold cross-validation (<xref ref-type="bibr" rid="B40">Wei et&#x20;al., 2015</xref>). The receiver operating characteristic (ROC) curve and the area under the ROC curve (AUC) were used to evaluate the model&#x2019;s performance.</p>
</sec>
<sec id="s2-3">
<title>Biomedical Term Tagging</title>
<sec id="s2-3-1">
<title>PubTator</title>
<p>PubTator (<xref ref-type="bibr" rid="B41">Wei et&#x20;al., 2013</xref>) is a web-based PubMed abstract biomedical named entity recognition (NER) system. PubTator can tag the gene, disease, chemical, species, and mutation in PubMed abstracts, and the tagging result could be accessed <italic>via</italic> the RESTful API. We used PubTator as a part of the biomedical term tagger.</p>
</sec>
<sec id="s2-3-2">
<title>Medical Subject Heading</title>
<p>MeSH is a hierarchically organized medical vocabulary thesaurus used for indexing articles for PubMed. PubMed articles curated by NLM are indexed with several related MeSH terms; every MeSH term has a unique ID and hierarchical categories. With these characteristics of MeSH terms and our tagging algorithm, we could tag biomedical terms that are not tagged by PubTator. Our algorithm started from the MeSH terms of each PubMed article. For each MeSH term in an article, we first created a MeSH term-mapping set that mapped a MeSH term to another set that contained itself and its lower hierarchy MeSH terms. Second, for each MeSH term in the MeSH term-mapping set, we tried matching all of the entry terms, i.e., the synonyms of a specific MeSH term, to every word in the article. If a word in the article matched any entry term of a MeSH term, we tagged that word as a biomedical term. This way, those terms having the same concepts could be merged and analyzed.</p>
</sec>
</sec>
<sec id="s2-4">
<title>Gene Term-Feature Term Frequency&#x2013;Inverse Document Frequency Matrix Construction</title>
<p>For a particular gene, considering all of its gene windows in the whole corpus, we calculated the frequency of the co-occurrence of the gene and features (terms) tagged by our algorithm in the window as the term frequency of the feature. The higher the term frequency is, the stronger the association of the gene and feature. In our study, term frequency (TF) was calculated using the following formula:<disp-formula id="equ1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>t</mml:mi>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>f</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>To calculate the inverse document frequency of each term feature, we simply count the occurrences of the term feature in all genes as document frequency. The inverse document frequency (IDF) was calculated using the following formula:<disp-formula id="equ2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>D</mml:mi>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mi>d</mml:mi>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The higher the IDF, the more specific the term feature is to a particular gene. Finally, by multiplying TF and IDF, the gene term-feature matrix was constructed.</p>
</sec>
<sec id="s2-5">
<title>Term Feature Selection by the Hypergeometric Test</title>
<p>We filtered out genes that had less than ten term features. We identified the critical term feature according to the gene panel using the <italic>p</italic>-values of hypergeometric tests as follows. We input the MSK-IMPACT (<xref ref-type="bibr" rid="B15">Hyman et&#x20;al., 2015</xref>) panel. Ns is the size of the MSK-IMPACT panel set S, <inline-formula id="inf1">
<mml:math id="m3">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> is the size of the set <inline-formula id="inf2">
<mml:math id="m4">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mo>&#x27;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which contains 500&#x20;non-MSK genes (randomly selected from the gene term-feature matrix) and all of the MSK genes, <inline-formula id="inf3">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of genes in the set <inline-formula id="inf4">
<mml:math id="m6">
<mml:mrow>
<mml:mtext>S&#x27;</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> that contain term feature t, and <inline-formula id="inf5">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of genes in the set <inline-formula id="inf6">
<mml:math id="m8">
<mml:mtext>S</mml:mtext>
</mml:math>
</inline-formula> containing <inline-formula id="inf7">
<mml:math id="m9">
<mml:mi>t</mml:mi>
</mml:math>
</inline-formula>. The random variable <inline-formula id="inf8">
<mml:math id="m10">
<mml:mtext>y</mml:mtext>
</mml:math>
</inline-formula> representing the number of genes containing the term feature in the set <inline-formula id="inf9">
<mml:math id="m11">
<mml:mtext>S</mml:mtext>
</mml:math>
</inline-formula> is a hypergeometric random variable with parameters <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>and <inline-formula id="inf11">
<mml:math id="m13">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B42">Westlake and Larson, 1970</xref>). The probability distribution of <inline-formula id="inf12">
<mml:math id="m14">
<mml:mtext>y</mml:mtext>
</mml:math>
</inline-formula> is shown as follows:<disp-formula id="equ3">
<mml:math id="m15">
<mml:mrow>
<mml:mtext>P</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mtext>y</mml:mtext>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mi>y</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mi>N</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>From <inline-formula id="inf13">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we compute the <italic>p</italic>-value, i.e., the probability of observing at least <inline-formula id="inf14">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> genes in the set S that contain the term feature, as follows:<disp-formula id="equ4">
<mml:math id="m18">
<mml:mrow>
<mml:mtext>Pvalue</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mtext>min</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:munderover>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The <italic>p</italic>-value reflects significant phrases in <inline-formula id="inf15">
<mml:math id="m19">
<mml:mtext>S</mml:mtext>
</mml:math>
</inline-formula> compared with all of the genes in the gene term-feature matrix. A low <italic>p</italic>-value indicates that we observe a rare event and that the observed term feature represents a statistical discovery, suggesting that it is essential in the MSK-IMPACT&#x20;panel.</p>
</sec>
<sec id="s2-6">
<title>Topic Modeling</title>
<p>Our topic modeling was based on the algorithms of non-negative matrix factorization (NMF) (<xref ref-type="bibr" rid="B43">Yeganova et&#x20;al., 2014</xref>). Given a nonnegative matrix <inline-formula id="inf16">
<mml:math id="m20">
<mml:mrow>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, when the desired lower dimension is k, the goal of NMF is to find two matrices, <inline-formula id="inf17">
<mml:math id="m21">
<mml:mrow>
<mml:mtext>W</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf18">
<mml:math id="m22">
<mml:mrow>
<mml:mtext>H</mml:mtext>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, having only non-negative entries such that <inline-formula id="inf19">
<mml:math id="m23">
<mml:mrow>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x2248;</mml:mo>
<mml:mtext>WH</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>The objective function is shown as the following formula:<disp-formula id="equ5">
<mml:math id="m24">
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>W</mml:mtext>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>H</mml:mtext>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:munder>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
<mml:msubsup>
<mml:mrow/>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The function is the most commonly used formulation based on the Frobenius norm. Here, k represents the number of topics we expected, <inline-formula id="inf20">
<mml:math id="m25">
<mml:mtext>X</mml:mtext>
</mml:math>
</inline-formula> represents the gene term-feature matrix, <inline-formula id="inf21">
<mml:math id="m26">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula> represents the gene-topic matrix, and <inline-formula id="inf22">
<mml:math id="m27">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> represents the topic text-feature matrix. Since the weights in <inline-formula id="inf23">
<mml:math id="m28">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula> and <inline-formula id="inf24">
<mml:math id="m29">
<mml:mi>H</mml:mi>
</mml:math>
</inline-formula> have been calculated, we used the top 20 genes and the top 20 text features with the highest importance for each topic to interpret the biomedical meaning.</p>
</sec>
<sec id="s2-7">
<title>Gene Window</title>
<p>We take the gene&#x2019;s context as its gene window. Each gene window contains three sentences: the sentence that contains the gene, the previous sentence, and the next sentence. We want to eliminate the redundant part of each abstract. Using the gene window algorithm, we could iterate through the full abstracts containing specific genes in the text and extract the most critical section for further analysis. We pick three sentences based on the concept that a sentence that is closer to the gene is more relevant to it. Since the closest sentences are the previous one and the next one, we picked&#x20;three.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Study Design and Workflow</title>
<p>This study develops a gene panel analysis framework that can discover the characteristics of a gene panel based on biomedical literature mining. The method overview is shown in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>. First, we extracted the PubMed abstracts, which mentioned genes related to humans. The method is shown in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>. In this step, approximately 430,000 PubMed abstracts regarding genes were extracted from all of the current PubMed corpus (approximately 30 million articles). Second, we performed biomedical named entity recognition (NER) on the extracted PubMed abstracts using PubTator (<xref ref-type="bibr" rid="B39">Wang et&#x20;al., 2016</xref>) and MeSH (Medical Subject Headings). Third, we used the biomedical terms to construct the gene term-feature matrix, which has a concept similar to that of the document-term matrix. Fourth, we performed term feature selection according to individual gene panels to make the term features generated by the previous step stronger and correspond to the target gene&#x20;panel.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Study design and workflow. The flowchart shows the overall analysis framework of this study. We first extracted 430,000 abstracts that mentioned genes related to humans in the PubMed corpus. Second, biomedical named entity recognition (NER) was performed to obtain biomedical terms, such as gene name, disease name, and drug name, using PubTator and MeSH. Third, we used the biomedical term tagged by the previous step to construct the gene term-feature matrix whose concept was similar to the document-term matrix. Fourth, we performed term feature selection according to a particular gene panel. We took the MSK-IMPACT panel as an example and made the term features generated by the previous step correspond more to the target gene panel using the hypergeometric distribution. Finally, several analyses, including identifying the top gene term features, creating the mutational landscape of cancers, and topic modeling based on nonnegative matrix factorization, were conducted to determine and interpret the biomedical characteristics of the target gene panel.</p>
</caption>
<graphic xlink:href="fgene-12-771435-g001.tif"/>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>An example of how the term &#x201c;lung cancer&#x201d; is tagged using the MeSH hierarchical structure. The term &#x201c;lung cancer&#x201d; is tagged as follows. First, we iterated through the MeSH terms indexed for PMID: 27823967 and found that &#x201c;Lung Neoplasms&#x201d; was one of the MeSH terms, whose synonyms include &#x201c;Lung Cancer.&#x201d; Second, if the term &#x201c;Lung Cancer&#x201d; also appeared in the article, the MeSH tagging algorithm would tag this word and take its MeSH ID for further analysis.</p>
</caption>
<graphic xlink:href="fgene-12-771435-g002.tif"/>
</fig>
<p>Here, we explored the idea of the hypergeometric distribution. For each term feature, by comparing the distribution of occurrences in the target gene set and the whole gene set, the term features that correlated more with the target gene panel would be enriched. This approach is flexible in regard to different target gene sets, such as the Oncomine Cancer Panel or cardiovascular gene panels. Finally, we selected 4,630 term features from 20,015 term features. The filtered gene term-feature matrix, whose size is 4,679 (genes) &#xd7; 4,630 (term features), will be used in the following analysis. Thus, we can discover the top 20 gene term features, the mutational landscape of the cancer genome, and topic modeling of cancer information. In this way, we can find the potential characteristics of the gene&#x20;panel.</p>
</sec>
<sec id="s3-2">
<title>Biomedical Term Extraction by Hypergeometric Test</title>
<p>In the field of biomedical literature mining, tagging biomedical terms is an important issue. In an abstract from the biomedical literature, only biomedical words, such as drug names, disease names, or gene names, are of interest. PubTator is capable of tagging gene, disease, chemical, species, and mutation mentions in PubMed abstracts. <xref ref-type="fig" rid="F3">Figure&#x20;3A</xref> shows the term feature extraction result of an <italic>EGFR</italic>-related abstract compared to the term features extracted by raw text TF-IDF scoring without biomedical term tagging. The raw-text term features were filled with redundant words, such as &#x201c;with&#x201d;, &#x201c;for&#x201d;, and &#x201c;after&#x201d;.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Biomedical term extraction <bold>(A)</bold> The term feature of an EGFR-related abstract. The former was filled with many redundant words, such as with, for, and after. The latter contains lots of biologically meaningful terms, such as gefitinib (chemical), non-small cell lung cancer (disease), L858R (mutation), the woman (species), and recurrence (MeSH). This phenomenon shows that the tagging approach with MeSH and PubTator terms is essential to gene term-feature extraction. <bold>(B)</bold> The proportion distribution bar chart of the MSK-IMPACT panel in each term feature group before and after the hypergeometric distribution test. It shows that after term feature selection, the proportion of the term feature groups of interest increases, such as cancer, drug, genetic phenomena, and phenotype.</p>
</caption>
<graphic xlink:href="fgene-12-771435-g003.tif"/>
</fig>
<p>On the other hand, the term feature extraction approach with MeSH terms and PubTator resulted in term features that contained lots of biologically meaningful terms, such as gefitinib (chemical), non-small cell lung cancer (disease), L858R (mutation), the woman (species), and recurrence (MeSH). This phenomenon shows that the tagging approach is essential for gene term feature extraction.</p>
<p>To discover the characteristics of a gene panel, we used the hypergeometric distribution test. According to MeSH terms and PubTator categories, all the term features can be divided into five groups: cancer, drug, genetic phenomena, mutation, and phenotype (<xref ref-type="sec" rid="s11">Supplementary Table S1</xref>). Take the MSK-IMPACT panel as a target gene panel, for example. The distribution of the MSK-IMPACT panel shows that the percentage increases in some term feature groups after using the hypergeometric distribution test (<xref ref-type="fig" rid="F3">Figure&#x20;3B</xref>). We filtered out the unimportant genes and found the critical term features according to the gene panel using a hypergeometric distribution test. The proportion of term feature groups in our interest increases, such as cancer, drug, genetic phenomena, and phenotype. The percentage after using the hypergeometric distribution test showed a noticeable improvement from 8.01 to 25.03% in the cancer group. The proportion increased from 9.02 to 11.38% in the drug group and grew from 7.4 to 16.85% in the genetic phenomenon group. There was a slight increase from 32.85 to 35.79% in the phenotype group. After the term feature selection, the proportion decreased from 42.71 to 10.95% in the mutation group. The MSK-IMPACT panel stands for integrated mutation profiling of actionable cancer targets, so the percentage in these groups increases after the hypergeometric distribution&#x20;test.</p>
</sec>
<sec id="s3-3">
<title>Literature-Derived Gene Term Features</title>
<p>The biomedical term features extracted from the literature were directly or indirectly related to each gene. Here, we took some cancer-related genes as examples for further demonstration. <xref ref-type="fig" rid="F4">Figures 4A,B</xref> show the top twenty biomedical term features with the highest TF-IDF scores for <italic>EGFR</italic> (the score range from 8.02 to 13.49) and <italic>BRAF</italic> (the score range from 5.35 to 18.66). For <italic>EGFR</italic>, which has been recognized for its importance in lung cancer (<xref ref-type="bibr" rid="B23">Paez et&#x20;al., 2004</xref>; <xref ref-type="bibr" rid="B34">Shepherd et&#x20;al., 2005</xref>), most of the term features directly represent lung cancer or its subtypes, such as &#x201c;Adenocarcinoma of the lung,&#x201d; &#x201c;Carcinoma, small cell,&#x201d; and &#x201c;Carcinoma, Non-Small Cell Lung.&#x201d; &#x201c;T790M&#x201d; is a drug resistance mutation frequently observed in patients with lung cancer (<xref ref-type="bibr" rid="B46">Zhou et&#x20;al., 2009</xref>). &#x201c;Erlotinib&#x201d; is an effective tyrosine kinase inhibitor (TKI) targeting <italic>EGFR</italic> for non-small cell lung carcinoma (NSCLC). &#x201c;Lapatinib&#x201d; is a dual <italic>EGFR/ERBB2</italic> TKI for metastatic breast cancer (<xref ref-type="bibr" rid="B4">Burris, 2004</xref>). Some term features were indirectly relevant to <italic>EGFR</italic>, such as &#x201c;Platinum&#x201d; and &#x201c;cisplatin,&#x201d; which are both standard chemotherapy in NSCLC (<xref ref-type="bibr" rid="B1">Arriagada et&#x20;al., 2004</xref>). <italic>EGFR</italic> TKIs are commonly compared with conventional platinum-based therapies. Another example is <italic>BRAF</italic>, whose mutations are widely detected in melanoma, thyroid cancer, and colorectal cancer (<xref ref-type="bibr" rid="B6">Chapman et&#x20;al., 2011</xref>). 
&#x201c;V600E&#x201d; is a crucial mutation that causes the constitutive activation of the cellular signaling pathway (<xref ref-type="bibr" rid="B6">Chapman et&#x20;al., 2011</xref>). &#x201c;Vemurafenib&#x201d; and &#x201c;dabrafenib&#x201d; are competitive inhibitors designed for <italic>BRAF</italic> with the V600E mutation (<xref ref-type="bibr" rid="B13">Hauschild et&#x20;al., 2012</xref>). The other examples, such as <italic>BRCA1</italic>, <italic>BRCA2</italic>, <italic>MLH1</italic>, and <italic>ERBB2</italic>, are shown in <xref ref-type="sec" rid="s11">Supplementary Figure S1</xref>. Nearly all of the biomedical term features relevant to these genes were consistent with current knowledge.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Top term features in <italic>EGFR</italic> and <italic>BRAF</italic> genes <bold>(A)</bold> The bar chart shows the TF-IDF scores of term features related to <italic>EGFR</italic>. Most of the identified term features for <italic>EGFR</italic> were associated with syndromes (e.g., lung adenocarcinoma and non-small cell lung carcinoma), mutations (e.g., T790M), and therapies (e.g., erlotinib and lapatinib) for lung cancer. <bold>(B)</bold> The bar chart shows the TF-IDF scores of term features related to <italic>BRAF</italic>. Biomedical term features, including cancer types (e.g., melanoma and thyroid cancer), mutations (e.g., V600E), and inhibitors (e.g., vemurafenib and dabrafenib) for <italic>BRAF</italic>, were consistent with known findings.</p>
</caption>
<graphic xlink:href="fgene-12-771435-g004.tif"/>
</fig>
</sec>
<sec id="s3-4">
<title>Mutational Landscape of the Actionable Cancer Genome From Biomedical Literature Mining Validated by NGS Database</title>
<p>We constructed the gene-cancer association matrix from the filtered gene term-feature matrix to understand the associations between cancer types and gene mutations. The recurrent common cancer-associated genes are shown in <xref ref-type="fig" rid="F5">Figure&#x20;5A</xref>. The most common cancer-associated genes were <italic>TP53</italic>, <italic>EGFR</italic>, <italic>CTNNB1</italic>, <italic>NOTCH1</italic>, and <italic>PTEN</italic>, as shown in <xref ref-type="fig" rid="F5">Figure&#x20;5B</xref>. Using two genes, <italic>EGFR</italic> and <italic>BRAF</italic>, as examples, we found that <italic>EGFR</italic> L858R and T790M and <italic>BRAF</italic> V600E were important mutation term features in text mining and were frequently mutated in MSK samples (<xref ref-type="fig" rid="F5">Figure&#x20;5C</xref>). The cosine similarity of gene frequency between text mining and a statistical result from clinical sequencing data (<xref ref-type="bibr" rid="B9">Demeester et&#x20;al., 2016</xref>) is 80.8% (<xref ref-type="fig" rid="F5">Figure&#x20;5B</xref>). To understand the time series of the association between gene mutations and cancer types in the last decade, we constructed the gene-cancer TF-IDF matrices for the years from 2011 to 2015 and the years from 2016 to 2019. As shown in <xref ref-type="sec" rid="s11">Supplementary Figure S2A and S2B</xref>, we found that cancer immunotherapy was a major issue in the past 5&#xa0;years. The rank of CD274 increased, and CTLA4 first appeared (<xref ref-type="bibr" rid="B32">Seidel et&#x20;al., 2018</xref>). In addition, the TF-IDF value of <italic>BRAF</italic> mutation in colorectal cancers increased because of the better outcomes of the <italic>BRAF</italic>-mutant CRC tumors with microsatellite instability (MSI) in immunotherapy (<xref ref-type="bibr" rid="B30">Rosenbaum et&#x20;al., 2016</xref>). 
The results indicate that we can design a series of cancer gene panels by updating the literature mining time&#x20;frame.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>The spectrum and frequency of actionable genetic mutation by literature mining <bold>(A)</bold> Heatmap of cancer genomics by the TF-IDF matrix. The X-axis represents the 31 common cancer types, and the y-axis represents the recurrent somatic genes. The darker color indicates a higher association between genes and cancer. <bold>(B)</bold> The bar plot shows the gene frequency within all of the cancer types. The data is validated by the MSK-IMPACT Clinical Sequencing Cohort, which is targeted sequencing of 10,000 clinical cases using the MSK-IMPACT assay. The cosine similarity of gene frequency between text mining and a statistical result from clinical sequencing data is 80.8%. <bold>(C)</bold> Lollipop plot of <italic>EGFR</italic> and <italic>BRAF</italic> in the MSK-IMPACT pan-cancer cohort. The critical gene mutation term features found by text mining are shown and labeled in red. Other gene mutations are labeled in&#x20;green.</p>
</caption>
<graphic xlink:href="fgene-12-771435-g005.tif"/>
</fig>
</sec>
<sec id="s3-5">
<title>Gene Panel Prediction by Machine Learning Models</title>
<p>Seven machine learning prediction models, including nearest neighbors, linear support vector machine (SVM), Gaussian process, decision tree, random forest, neural net, and Naive Bayes (<xref ref-type="bibr" rid="B40">Wei et&#x20;al., 2015</xref>), were used to verify the specific gene panel (<xref ref-type="fig" rid="F6">Figure&#x20;6A</xref>). The MSK-IMPACT, Oncomine Comprehensive Assay (<xref ref-type="bibr" rid="B29">Rhodes et&#x20;al., 2007</xref>), and cardiovascular gene panels (<xref ref-type="bibr" rid="B24">Paige et&#x20;al., 2018</xref>) represent different gene characteristics. There are 410 essential cancer genes in the MSK-IMPACT panel. The Oncomine Comprehensive Assay includes 161&#x20;cancer-related genes. We used the congenital heart defect focus panel of 115 genes associated with congenital heart defects (CHDs) as the cardiovascular gene panels.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Performance of the machine learning models with the gene panel <bold>(A)</bold> Evaluation of the overall accuracy, precision (positive predictive value, PPV), recall (sensitivity), and F1-score of every prediction model. Each gene could be labeled a target or non-target, indicating whether the gene is in the given target panel. The following seven prediction models were used: nearest neighbors, linear support vector machine (SVM), Gaussian process, decision tree, random forest, neural net, and Naive Bayes. The target gene panels were MSK-IMPACT, Oncomine Comprehensive Assay, and cardiovascular gene panels. <bold>(B)</bold> Receiver operating characteristic (ROC) curves of the models with the MSK-IMPACT 410-cancer gene panel. The neural net model had the highest area under the ROC curve (AUC), which was 0.992.</p>
</caption>
<graphic xlink:href="fgene-12-771435-g006.tif"/>
</fig>
<p>Each gene can be labeled as a target or non-target, which indicates whether the gene is in the given target panel. We performed five-fold cross-validation on our dataset to evaluate the models&#x2019; efficiency and evaluate the overall accuracy of each prediction model. We measured the target and non-target genes in each prediction model separately with precision (positive predictive value, PPV), recall (sensitivity), and F1-score. The accuracies for nearest neighbors, linear SVM, Gaussian process, decision tree, random forest, neural net, and naive Bayes in the MSK-IMPACT panel were 0.786, 0.913, 0.868, 0.799, 0.663, 0.959, and 0.831, respectively; the accuracies for all models in the OCP gene panel were 0.814, 0.989, 0.938, 0.907, 0.773, 1 and 0.958; and the accuracies for all the models in the cardiovascular gene panel were 0.777, 0.814, 0.925, 0.87, 0.648, 1, and 0.87. The receiver operating characteristic (ROC) curve analysis confirmed that the neural net model had a better prediction performance; the area under the ROC curve (AUC) was 0.992 (<xref ref-type="fig" rid="F6">Figure&#x20;6B</xref>). The AUCs of nearest neighbors, linear SVM, Gaussian process, decision tree, random forest, and naive Bayes were 0.909, 0.972, 0.953, 0.869, 0.692, and 0.842, respectively. The results of the biomedical term feature set prediction models are good, and the performance can reach up to 0.9. This means that the term feature sets can contain most of the information in the gene&#x20;panel.</p>
</sec>
<sec id="s3-6">
<title>Design of Cancer-Related Gene Panels Based on Topic Modeling</title>
<p>To understand the MSK-IMPACT panel characteristics, we generated thirty topics that potentially represented different biomedical meanings. The following are some examples of issues relevant to genes in the MSK-IMPACT panel. <xref ref-type="fig" rid="F7">Figure&#x20;7</xref> shows the text features, genes, and related pathways derived from the Reactome of topics 2, 7, and 14, including hematologic malignancies. In topic two, leukemia subtypes and targeted inhibitors (e.g., imatinib, dasatinib, and decitabine) were mined. Heart arrest, a common side effect of inhibitors for leukemia, has also been reported (<xref ref-type="bibr" rid="B14">Hochhaus et&#x20;al., 2009</xref>). The related MSK-IMPACT panel in topic two was involved in the signaling of interleukin-4 and interleukin-13 (<italic>p</italic>&#x20;&#x3d; 5.27e-5), which was associated with the apoptosis of leukemia cells (<xref ref-type="bibr" rid="B5">Chaouchi et&#x20;al., 1996</xref>; <xref ref-type="bibr" rid="B27">Pe&#xf1;a-Mart&#xed;nez et&#x20;al., 2018</xref>) (<xref ref-type="fig" rid="F7">Figure&#x20;7A</xref>). These results indicated that topic two was associated with leukemia, a hematological malignancy. In topic seven, key text features such as kidney neoplasms, carcinoma, renal cell, and Wilms tumor implied the relationship between topic seven and kidney cancer. Inhibitors for kidney cancer, such as sorafenib and everolimus, were also identified (<xref ref-type="bibr" rid="B20">Mart&#xed;n-Aguilar et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B28">Ren et&#x20;al., 2021</xref>). The hypoxia pathway enriched by <italic>VHL</italic>, <italic>VEGFA</italic>, and <italic>PBRM1</italic> (<italic>p</italic>&#x20;&#x3d; 5.41e-11) played a crucial role in the governance of cancer stem cells of renal cancer (<xref ref-type="bibr" rid="B22">Myszczyszyn et&#x20;al., 2015</xref>) (<xref ref-type="fig" rid="F7">Figure&#x20;7B</xref>). 
In topic 14, colorectal neoplasms, hereditary nonpolyposis, adenomatous polyposis coli, oxaliplatin, and cetuximab were associated with colon cancer. Related genes (e.g., <italic>MLH1</italic>, <italic>MSH2</italic>, and <italic>MSH6</italic>) in topic 14 were involved in mismatch repair (<italic>p</italic>&#x20;&#x3d; 5.72e-8), which has clinical importance in Lynch syndrome (<xref ref-type="bibr" rid="B37">Truninger et&#x20;al., 2005</xref>) (<xref ref-type="fig" rid="F7">Figure&#x20;7C</xref>). Other examples of different cancers, including brain cancer, gynecologic cancer, and breast cancer, are shown in <xref ref-type="sec" rid="s11">Supplementary Figure S3</xref>. These results indicated that most of the genes in the MSK-IMPACT panel were collected for either therapeutic usage or biological relevance to various cancer types. In the future, we could design a small subset of multiple-gene groups by cancer&#x20;topic.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Examples of cancer topics containing relevant text features, genes, and pathways <bold>(A)</bold> Figure showing the text features, genes, and pathways of topic 2. Cancer types (e.g., leukemia) and inhibitors (e.g., imatinib) were reported in this topic. Reactome pathway analysis revealed that the related genes of the MSK-IMPACT panel in topic 2 (e.g., FLT3) were involved in interleukin-4 and interleukin-13 signaling (<italic>p</italic>&#x20;&#x3d; 5.27e-5). <bold>(B)</bold> Figure showing the text features, genes, and pathways of topic 7. Text features including cancer types (e.g., kidney neoplasms) and inhibitors (e.g., sorafenib) implied the relationship between topic seven and kidney cancer. The hypoxia pathway enriched by related genes (e.g., VHL) of the MSK-IMPACT panel in topic 7 (<italic>p</italic>&#x20;&#x3d; 5.41e-11) played a crucial role in the governance of cancer stem cells of renal cancer. <bold>(C)</bold> Figure showing the text features, genes, and pathways of topic 14. Many text features containing cancer types (e.g., colorectal neoplasms) and inhibitors (e.g., oxaliplatin) indicated the association between topic 14 and colon cancer. Related genes of the MSK-IMPACT panel in topic 14 (e.g., MLH1) were involved in the mismatch repair pathway (<italic>p</italic>&#x20;&#x3d; 5.72e-8).</p>
</caption>
<graphic xlink:href="fgene-12-771435-g007.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<title>Discussion</title>
<p>It is helpful to gain insight into the field that bridges the knowledge gap between valuable biomedical information and free text by text mining (Sachin <xref ref-type="bibr" rid="B17">Kumar Deshmukh, 2020</xref>). With biomedical text mining advances and its applications in cancer research, we can design cancer gene panels by the semantic interpretation of comprehensive cancer narratives. Here, we used a biomedical literature mining model to discover the characteristics of a gene panel. Importantly, we demonstrated and validated the performance of the machine learning approach in text mining of cancer information. Our results highlight the following important points. 1) We developed a gene panel analysis framework based on a biomedical text mining pipeline. 2) Our pipeline can enrich the term features of cancer gene panels. 3) We demonstrated and validated the patterns of the cancer mutational landscape by NGS database. 4) The non-negative matrix factorization (NMF) method and topic modeling are useful for generating cancer information. Biomedical literature mining is valuable for discovering the inherent characteristics of gene panels. These results could be applied to the classification of cancer-related information and strategies for novel cancer gene panel designs.</p>
<p>The hypergeometric distribution test is one of the practical machine learning tools in TM. It can be used to select and extract term features from various genomic characterizations (<xref ref-type="bibr" rid="B25">Pal, 2017</xref>). We identified the critical term features according to the gene panel using <italic>p</italic>-values based on a hypergeometric test. Our term feature selection methods can distinguish between different gene panels. This implies a high-performance prediction model for different datasets, including the MSK-IMPACT panel, Oncomine Cancer Panel, and cardiovascular gene panels. Although many gene recommendation algorithms have been developed, little is known about gene panel design.</p>
<p>Our biomedical term tagging algorithm provides a comprehensive cancer gene panel and related information. With our tagging algorithm, most of the essential biomedical terms in the text have been tagged. The construction of a gene term-feature matrix in different categories provides useful profiling for the characteristics of the genes. In this study, we constructed a biologically meaningful platform to analyze gene panels in terms of the diseases, chemicals, mutations, and MeSH terms related to genes. We can implement more biomedical term feature matrices, such as a drug-feature matrix and disease-feature matrix. These different types of forms can provide strategies to analyze biology. With NMF topic modeling, we can capture cancer gene-drug information compatible with our knowledge. It will be useful to design a small subset of cancer gene panels by interpreting the topic&#x20;model.</p>
<p>For the discovery of cancer gene panels, <xref ref-type="fig" rid="F5">Figure&#x20;5A</xref> and <xref ref-type="fig" rid="F7">Figure&#x20;7C</xref> illustrate an example of a cancer gene panel design for colorectal cancer. According to the heatmap, the most frequent genes in colorectal cancer are <italic>KRAS</italic>, <italic>EGFR</italic>, <italic>BRAF</italic>, <italic>PTEN</italic>, <italic>TP53</italic>, <italic>MLH1</italic>, <italic>PIK3CA</italic>, and <italic>CTNNB1</italic>. Hereditary nonpolyposis colon cancer (HNPCC) is caused by inherited mismatch repair genetic mutations, including <italic>MLH1, MSH2,</italic> and <italic>MSH6.</italic> The lifetime ovarian cancer risk is increased in HNPCC. We can find ovarian cancer and a gene panel including <italic>MLH1</italic>, <italic>MSH2</italic>, <italic>MSH6</italic>, <italic>BRAF</italic>, <italic>KRAS</italic>, <italic>SMAD4</italic>, <italic>NRAS</italic>, and <italic>CTNNB1</italic> by topic model. In our study, we can design the two different cancer panels by phenotype. These results indicated the platform could provide an opportunity to construct a cancer gene panel recommendation by different cancer subtypes. There are some text mining limitations in our study. The entity-term based features are based only on co-occurrence in three sentences. However, related entities may have distinct relationships and do not necessarily co-occur. The features were obtained from only one resource, PubMed abstracts. Many curated databases have many useful biological features of genes or diseases or drugs; for example, Gene Ontology (GO) (<xref ref-type="bibr" rid="B2">Ashburner et&#x20;al., 2000</xref>; <xref ref-type="bibr" rid="B36">The Gene Ontology Consortium, 2017</xref>) contains GO terms that describe genes by the functions of genes or cellular components. It may provide a benefit to the cancer researcher. 
Unfortunately, the TF-IDF table is going to weight toward common diseases and omit those that are critical in identifying rare diseases. The gene panels are not useful for the identification of unknown or rare gene mutations that are important for treatment. Simultaneously, the manuscripts and supplementary materials may also provide more critical results, but the lack of standardization in accessing this information is a significant problem. The text mining method often focuses on a few sentences due to the challenges of creating a complicated relationship between several critical keywords.</p>
<p>As we know, the random forest algorithm performs better than the decision tree in most pattern classification cases. However, we found that the random forest approach presented a worse ability for cancer gene panel prediction in the experiments. Several reasons may cause this situation in the model training and evaluation, such as whether or not we specify the maximum number of features to be included at each node split. One of the reasons is that the random forest builds subtrees by randomly choosing features from the large number of features in our study. In contrast, the other methods calculate the weights for each feature by determining the importance of all features. Thus, the performance might be increased when we increase the number of trees in the random forest. As the number of subtrees increases, the model will see more features and build more diverse trees. Therefore, the model will become more robust and achieve excellent performance. Nevertheless, in this paper, we are focusing on a pipeline that can contextualize genes. We used the default parameters in most of the methods in our study. Although we are not emphasizing method and parameter optimization, it is also an important issue that we will study in our future&#x20;work.</p>
<p>Several text mining systems have been developed for mutation-disease association (<xref ref-type="bibr" rid="B12">Erdogmus and Sezerman, 2007</xref>; <xref ref-type="bibr" rid="B44">Yeniterzi and Sezerman, 2009</xref>; <xref ref-type="bibr" rid="B35">Singhal et&#x20;al., 2016</xref>). An automated pipeline using the full-length biomedical literature was recently established and validated by evidence-based gene panels (<xref ref-type="bibr" rid="B31">Saberian et&#x20;al., 2020</xref>). All these methods focus on mutation-disease associations. In contrast, we contextualized the genes for clinical precision medicine. We provide information about druggable targets, mutations in hereditary cancer syndrome, and disease subtypes.</p>
<p>Although many text mining-based gene panel algorithms were developed, there is still little known to validate the gene panel characteristics. This study provides a biomedical literature mining pipeline in gene panel discovery and interpretation. The platform validated by NGS database could provide an opportunity to construct a gene recommendation and annotation system for precision medicine.</p>
</sec>
<sec sec-type="conclusions" id="s5">
<title>Conclusions</title>
<p>In conclusion, this study highlights the importance of biomedical literature mining in gene panel discovery and interpretation. The platform could provide an opportunity to construct a gene recommendation and annotation system for precision medicine.</p>
</sec>
</body>
<back>
<sec id="s6">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s11">Supplementary Material</xref>; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="s7">
<title>Author Contributions</title>
<p>Conception and study design: H-OC, P-CL, C-RL, C-SW, and J-HC; Development of methodology: H-OC, P-CL, and J-HC; Acquisition of data: H-OC, C-RL, and C-SW; Statistical and computational analysis: P-CL, H-OC, C-RL, and J-HC; Writing, review, and revision of the manuscript: H-OC, P-CL, C-RL, and J-HC; Study supervision: J-HC. All authors have read and approved the manuscript. All authors agree to publication.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>This work was supported in part by the Ministry of Science and Technology (MOST), Taiwan under Research Grant of MOST 110-2634-F-006-014 and MOST 110-2634-F-006-020, Ministry of Health and Welfare (MOHW110-TDU-B-211-144018). All authors have read and approved the manuscript.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ack>
<p>The authors gratefully acknowledge the significant contribution of Kimforest LTD. Taiwan, KD Yang, CC Pan, and CJ Lee for the bioinformatics support.</p>
</ack>
<sec id="s11">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2021.771435/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2021.771435/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Image3.TIFF" id="SM1" mimetype="application/TIFF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image1.TIFF" id="SM2" mimetype="application/TIFF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.XLSX" id="SM3" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image2.TIFF" id="SM4" mimetype="application/TIFF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arriagada</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bergman</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dunant</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Le Chevalier</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Pignon</surname>
<given-names>J.&#x20;P.</given-names>
</name>
<name>
<surname>Vansteenkiste</surname>
<given-names>J.</given-names>
</name>
<collab>International Adjuvant Lung Cancer Trial Collaborative Group</collab>
</person-group> (<year>2004</year>). <article-title>Cisplatin-Based Adjuvant Chemotherapy in Patients with Completely Resected Non-small-cell Lung Cancer</article-title>. <source>N. Engl. J.&#x20;Med.</source> <volume>350</volume> (<issue>4</issue>), <fpage>351</fpage>&#x2013;<lpage>360</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa031644</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ashburner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ball</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Blake</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Botstein</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Butler</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cherry</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<etal/>
</person-group> (<year>2000</year>). <article-title>Gene Ontology: Tool for the Unification of Biology. The Gene Ontology Consortium</article-title>. <source>Nat. Genet.</source> <volume>25</volume> (<issue>1</issue>), <fpage>25</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1038/75556</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Azam</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Musa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dehmer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yli-Harja</surname>
<given-names>O. P.</given-names>
</name>
<name>
<surname>Emmert-Streib</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Global Genetics Research in Prostate Cancer: A Text Mining and Computational Network Theory Approach</article-title>. <source>Front. Genet.</source> <volume>10</volume>, <fpage>70</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2019.00070</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Burris</surname>
<given-names>H. A.</given-names>
<suffix>3rd</suffix>
</name>
</person-group> (<year>2004</year>). <article-title>Dual Kinase Inhibition in the Treatment of Breast Cancer: Initial Experience with the EGFR/ErbB-2 Inhibitor Lapatinib</article-title>. <source>Oncologist</source> <volume>9</volume> (<issue>Suppl. 3</issue>), <fpage>10</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1634/theoncologist.9-suppl_3-10</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chaouchi</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wallon</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Goujard</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tertian</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rudent</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Caput</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>1996</year>). <article-title>Interleukin-13 Inhibits Interleukin-2-Induced Proliferation and Protects Chronic Lymphocytic Leukemia B&#x20;Cells from <italic>In Vitro</italic> Apoptosis</article-title>. <source>Blood</source> <volume>87</volume> (<issue>3</issue>), <fpage>1022</fpage>&#x2013;<lpage>1029</lpage>. </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chapman</surname>
<given-names>P. B.</given-names>
</name>
<name>
<surname>Hauschild</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Robert</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Haanen</surname>
<given-names>J.&#x20;B.</given-names>
</name>
<name>
<surname>Ascierto</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Larkin</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group>
<collab>BRIM-3 Study Group</collab> (<year>2011</year>). <article-title>Improved Survival with Vemurafenib in Melanoma with BRAF V600E Mutation</article-title>. <source>N. Engl. J.&#x20;Med.</source> <volume>364</volume> (<issue>26</issue>), <fpage>2507</fpage>&#x2013;<lpage>2516</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa1103782</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>D. T.</given-names>
</name>
<name>
<surname>Mitchell</surname>
<given-names>T. N.</given-names>
</name>
<name>
<surname>Zehir</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>R. H.</given-names>
</name>
<name>
<surname>Benayed</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Syed</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Memorial Sloan Kettering-Integrated Mutation Profiling of Actionable Cancer Targets (MSK-IMPACT): A Hybridization Capture-Based Next-Generation Sequencing Clinical Assay for Solid Tumor Molecular Oncology</article-title>. <source>J.&#x20;Mol. Diagn.</source> <volume>17</volume> (<issue>3</issue>), <fpage>251</fpage>&#x2013;<lpage>264</lpage>. <pub-id pub-id-type="doi">10.1016/j.jmoldx.2014.12.006</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Reddy</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>UTOPIAN: User-Driven Topic Modeling Based on Interactive Nonnegative Matrix Factorization</article-title>. <source>IEEE Trans. Vis. Comput. Graph.</source> <volume>19</volume> (<issue>12</issue>), <fpage>1992</fpage>&#x2013;<lpage>2001</lpage>. <pub-id pub-id-type="doi">10.1109/TVCG.2013.212</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Demeester</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Corado</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Distributed Representations of Words and Phrases and Their Compositionality</article-title>. <source>EMNLP 2016&#x20;&#x2013; Conf. Empir. Methods Nat. Lang. Process. Proc.</source>, <fpage>1389</fpage>&#x2013;<lpage>1399</lpage>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1606.08359">arXiv: 1606.08359</ext-link>. </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Devarajan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Ebrahimi</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A Unified Statistical Approach to Non-negative Matrix Factorization and Probabilistic Latent Semantic Indexing</article-title>. <source>Mach. Learn.</source> <volume>99</volume> (<issue>1</issue>), <fpage>137</fpage>&#x2013;<lpage>163</lpage>. <pub-id pub-id-type="doi">10.1007/s10994-014-5470-z</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhi</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Gene2vec: Distributed Representation of Genes Based on Co-expression</article-title>. <source>BMC Genomics</source> <volume>20</volume> (<issue>Suppl. 1</issue>), <fpage>82</fpage>. <pub-id pub-id-type="doi">10.1186/s12864-018-5370-x</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Erdogmus</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sezerman</surname>
<given-names>O. U.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Application of Automatic Mutation-Gene Pair Extraction to Diseases</article-title>. <source>J.&#x20;Bioinform. Comput. Biol.</source> <volume>5</volume> (<issue>6</issue>), <fpage>1261</fpage>&#x2013;<lpage>1275</lpage>. <pub-id pub-id-type="doi">10.1142/s021972000700317x</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hauschild</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Grob</surname>
<given-names>J.&#x20;J.</given-names>
</name>
<name>
<surname>Demidov</surname>
<given-names>L. V.</given-names>
</name>
<name>
<surname>Jouary</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gutzmer</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Millward</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>Dabrafenib in BRAF-Mutated Metastatic Melanoma: a Multicentre, Open-Label, Phase 3 Randomised Controlled Trial</article-title>. <source>Lancet</source> <volume>380</volume> (<issue>9839</issue>), <fpage>358</fpage>&#x2013;<lpage>365</lpage>. <pub-id pub-id-type="doi">10.1016/S0140-6736(12)60868-X</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hochhaus</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>O&#x27;Brien</surname>
<given-names>S. G.</given-names>
</name>
<name>
<surname>Guilhot</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Druker</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Branford</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Foroni</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group>
<collab>IRIS Investigators</collab> (<year>2009</year>). <article-title>Six-year Follow-Up of Patients Receiving Imatinib for the First-Line Treatment of Chronic Myeloid Leukemia</article-title>. <source>Leukemia</source> <volume>23</volume> (<issue>6</issue>), <fpage>1054</fpage>&#x2013;<lpage>1061</lpage>. <pub-id pub-id-type="doi">10.1038/leu.2009.38</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hyman</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Solit</surname>
<given-names>D. B.</given-names>
</name>
<name>
<surname>Arcila</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>D. T.</given-names>
</name>
<name>
<surname>Sabbatini</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Baselga</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Precision Medicine at Memorial Sloan Kettering Cancer Center: Clinical Next-Generation Sequencing Enabling Next-Generation Targeted Therapy Trials</article-title>. <source>Drug Discov. Today</source> <volume>20</volume> (<issue>12</issue>), <fpage>1422</fpage>&#x2013;<lpage>1428</lpage>. <pub-id pub-id-type="doi">10.1016/j.drudis.2015.08.005</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ikonomakis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kotsiantis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tampakas</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Text Classification Using Machine Learning Techniques</article-title>. <source>WSEAS Trans. Comput.</source> <volume>4</volume>. </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar Deshmukh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Machine Learning for Precision Medicine in Cancer: Transforming Drug Discovery and Treatment</article-title>. <source>J.&#x20;Cancer Biol.</source> <volume>1</volume>, <fpage>20</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.46439/cancerbiology.1.005</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leaman</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Islamaj Dogan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>DNorm: Disease Name Normalization with Pairwise Learning to Rank</article-title>. <source>Bioinformatics</source> <volume>29</volume> (<issue>22</issue>), <fpage>2909</fpage>&#x2013;<lpage>2917</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btt474</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luthra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>K. P.</given-names>
</name>
<name>
<surname>Routbort</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Broaddus</surname>
<given-names>R. R.</given-names>
</name>
<name>
<surname>Yau</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Simien</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>A Targeted High-Throughput Next-Generation Sequencing Panel for Clinical Screening of Mutations, Gene Amplifications, and Fusions in Solid Tumors</article-title>. <source>J.&#x20;Mol. Diagn.</source> <volume>19</volume> (<issue>2</issue>), <fpage>255</fpage>&#x2013;<lpage>264</lpage>. <pub-id pub-id-type="doi">10.1016/j.jmoldx.2016.09.011</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mart&#xed;n-Aguilar</surname>
<given-names>A. E.</given-names>
</name>
<name>
<surname>N&#xfa;&#xf1;ez-L&#xf3;pez</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ramirez-Sandoval</surname>
<given-names>J.&#x20;C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Sorafenib as a Second-Line Treatment in Metastatic Renal Cell Carcinoma in Mexico: a Prospective Cohort Study</article-title>. <source>BMC Cancer</source> <volume>21</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1186/s12885-020-07720-5</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McCabe</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Gauthier</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Chan</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Thompson</surname>
<given-names>T. J.</given-names>
</name>
<name>
<surname>De Sousa</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Puttick</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Development and Validation of a Targeted Gene Sequencing Panel for Application to Disparate Cancers</article-title>. <source>Sci. Rep.</source> <volume>9</volume> (<issue>1</issue>), <fpage>17052</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-019-52000-3</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Myszczyszyn</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Czarnecka</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Matak</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Szymanski</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lian</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Kornakiewicz</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>The Role of Hypoxia and Cancer Stem Cells in Renal Cell Carcinoma Pathogenesis</article-title>. <source>Stem Cell Rev. Rep.</source> <volume>11</volume> (<issue>6</issue>), <fpage>919</fpage>&#x2013;<lpage>943</lpage>. <pub-id pub-id-type="doi">10.1007/s12015-015-9611-y</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paez</surname>
<given-names>J.&#x20;G.</given-names>
</name>
<name>
<surname>J&#xe4;nne</surname>
<given-names>P. A.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.&#x20;C.</given-names>
</name>
<name>
<surname>Tracy</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Greulich</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gabriel</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2004</year>). <article-title>EGFR Mutations in Lung Cancer: Correlation with Clinical Response to Gefitinib Therapy</article-title>. <source>Science</source> <volume>304</volume> (<issue>5676</issue>), <fpage>1497</fpage>&#x2013;<lpage>1500</lpage>. <pub-id pub-id-type="doi">10.1126/science.1099314</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Paige</surname>
<given-names>S. L.</given-names>
</name>
<name>
<surname>Saha</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Priest</surname>
<given-names>J.&#x20;R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Beyond Gene Panels: Whole Exome Sequencing for Diagnosis of Congenital Heart Disease</article-title>. <source>Circ. Genom. Precis. Med.</source> <volume>11</volume> (<issue>3</issue>), <fpage>e002097</fpage>. <pub-id pub-id-type="doi">10.1161/CIRCGEN.118.002097</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Pal</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Feature Selection and Extraction from Heterogeneous Genomic Characterizations</article-title>, in <source>Predictive Modeling of Drug Sensitivity</source>, <fpage>45</fpage>&#x2013;<lpage>81</lpage>. <pub-id pub-id-type="doi">10.1016/b978-0-12-805274-7.00003-8</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine Learning in Python</article-title>. <source>J.&#x20;Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1201.0490">arXiv: 1201.0490</ext-link>. </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pe&#xf1;a-Mart&#xed;nez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Eriksson</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ramakrishnan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chapellier</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>H&#xf6;gberg</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Orsmark-Pietras</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Interleukin 4 Induces Apoptosis of Acute Myeloid Leukemia Cells in a Stat6-dependent Manner</article-title>. <source>Leukemia</source> <volume>32</volume> (<issue>3</issue>), <fpage>588</fpage>&#x2013;<lpage>596</lpage>. <pub-id pub-id-type="doi">10.1038/leu.2017.261</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Clinical Analysis of Everolimus in the Treatment of Metastatic Renal Cell Carcinoma</article-title>. <source>Ann. Palliat. Med.</source> <volume>10</volume>. <pub-id pub-id-type="doi">10.21037/apm-20-2465</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rhodes</surname>
<given-names>D. R.</given-names>
</name>
<name>
<surname>Kalyana-Sundaram</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mahavisno</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Varambally</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Briggs</surname>
<given-names>B. B.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>Oncomine 3.0: Genes, Pathways, and Networks in a Collection of 18,000 Cancer Gene Expression Profiles</article-title>. <source>Neoplasia</source> <volume>9</volume> (<issue>2</issue>), <fpage>166</fpage>&#x2013;<lpage>180</lpage>. <pub-id pub-id-type="doi">10.1593/neo.07112</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rosenbaum</surname>
<given-names>M. W.</given-names>
</name>
<name>
<surname>Bledsoe</surname>
<given-names>J.&#x20;R.</given-names>
</name>
<name>
<surname>Morales-Oyarvide</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Huynh</surname>
<given-names>T. G.</given-names>
</name>
<name>
<surname>Mino-Kenudson</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>PD-L1 Expression in Colorectal Cancer Is Associated with Microsatellite Instability, BRAF Mutation, Medullary Morphology and Cytotoxic Tumor-Infiltrating Lymphocytes</article-title>. <source>Mod. Pathol.</source> <volume>29</volume> (<issue>9</issue>), <fpage>1104</fpage>&#x2013;<lpage>1112</lpage>. <pub-id pub-id-type="doi">10.1038/modpathol.2016.95</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saberian</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Shafi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Peyvandipour</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Draghici</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>MAGPEL: an autoMated Pipeline for Inferring vAriant-Driven Gene PanEls from the Full-Length Biomedical Literature</article-title>. <source>Sci. Rep.</source> <volume>10</volume> (<issue>1</issue>), <fpage>12365</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-68649-0</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Seidel</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Otsuka</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kabashima</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Anti-PD-1 and Anti-CTLA-4 Therapies in Cancer: Mechanisms of Action, Efficacy, and Limitations</article-title>. <source>Front. Oncol.</source> <volume>8</volume>, <fpage>86</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2018.00086</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shabani Azim</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Houri</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ghalavand</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Nikmanesh</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Next Generation Sequencing in Clinical Oncology: Applications, Challenges and Promises: A Review Article</article-title>. <source>Iran. J.&#x20;Public Health</source> <volume>47</volume>, <fpage>1453</fpage>&#x2013;<lpage>1457</lpage>. <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6277731/">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6277731/</ext-link>. </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shepherd</surname>
<given-names>F. A.</given-names>
</name>
<name>
<surname>Rodrigues Pereira</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ciuleanu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>E. H.</given-names>
</name>
<name>
<surname>Hirsh</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thongprasert</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>Erlotinib in Previously Treated Non-small-cell Lung Cancer</article-title>. <source>N. Engl. J.&#x20;Med.</source> <volume>353</volume> (<issue>2</issue>), <fpage>123</fpage>&#x2013;<lpage>132</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa050753</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singhal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Simmons</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Text Mining for Precision Medicine: Automating Disease-Mutation Relationship Extraction from Biomedical Literature</article-title>. <source>J.&#x20;Am. Med. Inform. Assoc.</source> <volume>23</volume> (<issue>4</issue>), <fpage>766</fpage>&#x2013;<lpage>772</lpage>. <pub-id pub-id-type="doi">10.1093/jamia/ocw041</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<collab>The Gene Ontology Consortium</collab> (<year>2017</year>). <article-title>Expansion of the Gene Ontology Knowledgebase and Resources</article-title>. <source>Nucleic Acids Res.</source> <volume>45</volume> (<issue>D1</issue>), <fpage>D331</fpage>&#x2013;<lpage>D338</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkw1108</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Truninger</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Menigatti</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Luz</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Haider</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gebbers</surname>
<given-names>J.&#x20;O.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>Immunohistochemical Analysis Reveals High Frequency of PMS2 Defects in Colorectal Cancer</article-title>. <source>Gastroenterology</source> <volume>128</volume> (<issue>5</issue>), <fpage>1160</fpage>&#x2013;<lpage>1171</lpage>. <pub-id pub-id-type="doi">10.1053/j.gastro.2005.01.056</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>C. C. N.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>J.&#x20;G.</given-names>
</name>
<name>
<surname>Hayakawa</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kitazawa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>J.&#x20;J.&#x20;P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Identification of Most Influential Co-occurring Gene Suites for Gastrointestinal Cancer Using Biomedical Literature Mining and Graph-Based Influence Maximization</article-title>. <source>BMC Med. Inform. Decis. Mak.</source> <volume>20</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1186/s12911-020-01227-6</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mehrabi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>A Part-Of-Speech Term Weighting Scheme for Biomedical Information Retrieval</article-title>. <source>J.&#x20;Biomed. Inform.</source> <volume>63</volume>, <fpage>379</fpage>&#x2013;<lpage>389</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbi.2016.08.026</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Kao</surname>
<given-names>H. Y.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>GNormPlus: An Integrative Approach for Tagging Genes, Gene Families, and Protein Domains</article-title>. <source>Biomed. Res. Int.</source> <volume>2015</volume>, <fpage>918710</fpage>. <pub-id pub-id-type="doi">10.1155/2015/918710</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Kao</surname>
<given-names>H. Y.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>PubTator: a Web-Based Text Mining Tool for Assisting Biocuration</article-title>. <source>Nucleic Acids Res.</source> <volume>41</volume>, <fpage>W518</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gkt441</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Westlake</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Larson</surname>
<given-names>H. J.</given-names>
</name>
</person-group> (<year>1970</year>). <article-title>Introduction to Probability Theory and Statistical Inference</article-title>. <source>Stat</source> <volume>19</volume>, <fpage>352</fpage>. </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yeganova</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wilbur</surname>
<given-names>W. J.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Retro: Concept-Based Clustering of Biomedical Topical Sets</article-title>. <source>Bioinformatics</source> <volume>30</volume> (<issue>22</issue>), <fpage>3240</fpage>&#x2013;<lpage>3248</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btu514</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yeniterzi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sezerman</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>EnzyMiner: Automatic Identification of Protein Level Mutations and Their Impact on Target Enzymes from PubMed Abstracts</article-title>. <source>BMC Bioinformatics</source> <volume>10</volume> (<issue>Suppl. 8</issue>), <fpage>S2</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-10-S8-S2</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zehir</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Benayed</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>R. H.</given-names>
</name>
<name>
<surname>Syed</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Middha</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H. R.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Mutational Landscape of Metastatic Cancer Revealed from Prospective Clinical Sequencing of 10,000 Patients</article-title>. <source>Nat. Med.</source> <volume>23</volume> (<issue>6</issue>), <fpage>703</fpage>&#x2013;<lpage>713</lpage>. <pub-id pub-id-type="doi">10.1038/nm.4333</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ercan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yun</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Capelletti</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2009</year>). <article-title>Novel Mutant-Selective EGFR Kinase Inhibitors against EGFR T790M</article-title>. <source>Nature</source> <volume>462</volume> (<issue>7276</issue>), <fpage>1070</fpage>&#x2013;<lpage>1074</lpage>. <pub-id pub-id-type="doi">10.1038/nature08622</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Patumcharoenpol</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Meechai</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Biomedical Text Mining and its Applications in Cancer Research</article-title>. <source>J.&#x20;Biomed. Inform.</source> <volume>46</volume> (<issue>2</issue>), <fpage>200</fpage>&#x2013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbi.2012.10.007</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>