<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fgene.2021.671866</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>DNA Methylation, Deamination, and Translesion Synthesis Combine to Generate Footprint Mutations in Cancer Driver Genes in B-Cell Derived Lymphomas and Other Cancers</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Rogozin</surname> <given-names>Igor B.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/35211/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Roche-Lima</surname> <given-names>Abiel</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/495013/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tyryshkin</surname> <given-names>Kathrin</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Carrasquillo-Carri&#x00F3;n</surname> <given-names>Kelvin</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/495016/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lada</surname> <given-names>Artem G.</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/476435/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Poliakov</surname> <given-names>Lennard Y.</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Schwartz</surname> <given-names>Elena</given-names></name>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Saura</surname> <given-names>Andreu</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yurchenko</surname> <given-names>Vyacheslav</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="aff" rid="aff8"><sup>8</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/65019/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Cooper</surname> <given-names>David N.</given-names></name>
<xref ref-type="aff" rid="aff9"><sup>9</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Panchenko</surname> <given-names>Anna R.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/109595/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Pavlov</surname> <given-names>Youri I.</given-names></name>
<xref ref-type="aff" rid="aff10"><sup>10</sup></xref>
<xref ref-type="aff" rid="aff11"><sup>11</sup></xref>
<xref ref-type="aff" rid="aff12"><sup>12</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/345185/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>National Center for Biotechnology Information, National Library of Medicine, National Institutes of Health</institution>, <addr-line>Bethesda, MD</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Center for Collaborative Research in Health Disparities &#x2013; RCMI Program, University of Puerto Rico</institution>, <addr-line>San Juan</addr-line>, <country>Puerto Rico</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Pathology and Molecular Medicine, School of Medicine, Queen&#x2019;s University</institution>, <addr-line>Kingston, ON</addr-line>, <country>Canada</country></aff>
<aff id="aff4"><sup>4</sup><institution>Integrated Informatics Services Core &#x2013; RCMI, University of Puerto Rico</institution>, <addr-line>San Juan</addr-line>, <country>Puerto Rico</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Microbiology and Molecular Genetics, University of California, Davis</institution>, <addr-line>Davis, CA</addr-line>, <country>United States</country></aff>
<aff id="aff6"><sup>6</sup><institution>Life Science Research Centre, Faculty of Science, University of Ostrava</institution>, <addr-line>Ostrava</addr-line>, <country>Czechia</country></aff>
<aff id="aff7"><sup>7</sup><institution>Coordinating Center for Clinical Trials, National Cancer Institute, National Institutes of Health</institution>, <addr-line>Bethesda, MD</addr-line>, <country>United States</country></aff>
<aff id="aff8"><sup>8</sup><institution>Martsinovsky Institute of Medical Parasitology, Tropical and Vector Borne Diseases, Sechenov First Moscow State Medical University</institution>, <addr-line>Moscow</addr-line>, <country>Russia</country></aff>
<aff id="aff9"><sup>9</sup><institution>Institute of Medical Genetics, Cardiff University</institution>, <addr-line>Cardiff</addr-line>, <country>United Kingdom</country></aff>
<aff id="aff10"><sup>10</sup><institution>Eppley Institute for Research in Cancer and Allied Diseases</institution>, <addr-line>Omaha, NE</addr-line>, <country>United States</country></aff>
<aff id="aff11"><sup>11</sup><institution>Department of Microbiology and Pathology, Biochemistry and Molecular Biology, Genetics, Cell Biology and Anatomy, University of Nebraska Medical Center</institution>, <addr-line>Omaha, NE</addr-line>, <country>United States</country></aff>
<aff id="aff12"><sup>12</sup><institution>Department of Genetics and Biotechnology, Saint-Petersburg State University</institution>, <addr-line>Saint-Petersburg</addr-line>, <country>Russia</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Yuriy L. Orlov, The Digital Health Institute, I.M. Sechenov First Moscow State Medical University, Russia</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Richard Chahwan, University of Zurich, Switzerland; Robert W. Maul, National Institute on Aging, National Institutes of Health (NIH), United States; Alexei Fedorov, University of Toledo, United States</p></fn>
<corresp id="c001">&#x002A;Correspondence: Youri I. Pavlov, <email>ypavlov@unmc.edu</email></corresp>
<corresp id="c002">Vyacheslav Yurchenko, <email>vyacheslav.yurchenko@osu.cz</email></corresp>
<fn fn-type="other" id="fn004"><p>This article was submitted to Computational Genomics, a section of the journal Frontiers in Genetics</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>19</day>
<month>05</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>671866</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>02</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>04</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2021 Rogozin, Roche-Lima, Tyryshkin, Carrasquillo-Carri&#x00F3;n, Lada, Poliakov, Schwartz, Saura, Yurchenko, Cooper, Panchenko and Pavlov.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Rogozin, Roche-Lima, Tyryshkin, Carrasquillo-Carri&#x00F3;n, Lada, Poliakov, Schwartz, Saura, Yurchenko, Cooper, Panchenko and Pavlov</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Cancer genomes harbor numerous genomic alterations and many cancers accumulate thousands of nucleotide sequence variations. A prominent fraction of these mutations arises as a consequence of the off-target activity of DNA/RNA editing cytosine deaminases followed by the replication/repair of edited sites by DNA polymerases (pol), as deduced from the analysis of the DNA sequence context of mutations in different tumor tissues. We have used the weight matrix (sequence profile) approach to analyze mutagenesis due to Activation Induced Deaminase (AID) and two error-prone DNA polymerases. Control experiments using shuffled weight matrices and somatic mutations in immunoglobulin genes confirmed the power of this method. Analysis of somatic mutations in various cancers suggested that AID and DNA polymerases &#x03B7; and &#x03B8; contribute to mutagenesis in contexts that almost universally correlate with the context of mutations in A:T and G:C sites during the affinity maturation of immunoglobulin genes. Previously, we demonstrated that AID contributes to mutagenesis in (de)methylated genomic DNA in various cancers. Our current analysis of methylation data from malignant lymphomas suggests that driver genes are subject to different (de)methylation processes than non-driver genes and, in addition to AID, the activity of pols &#x03B7; and &#x03B8; contributes to the establishment of methylation-dependent mutation profiles. This may reflect the functional importance of interplay between mutagenesis in cancer and (de)methylation processes in different groups of genes. The resulting changes in CpG methylation levels and chromatin modifications are likely to cause changes in the expression levels of driver genes that may affect cancer initiation and/or progression.</p>
</abstract>
<kwd-group>
<kwd>tumor cells</kwd>
<kwd>frequency matrices</kwd>
<kwd>database</kwd>
<kwd>computational biology</kwd>
<kwd>somatic hypermutation</kwd>
<kwd>immunoglobulin genes</kwd>
<kwd>gene expression</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Institute on Minority Health and Health Disparities<named-content content-type="fundref-id">10.13039/100006545</named-content></contract-sponsor>
<contract-sponsor id="cn002">Nebraska Department of Health and Human Services<named-content content-type="fundref-id">10.13039/100010537</named-content></contract-sponsor>
<contract-sponsor id="cn003">Russian Science Foundation<named-content content-type="fundref-id">10.13039/501100006769</named-content></contract-sponsor>
<contract-sponsor id="cn004">Fred and Pamela Buffett Cancer Center<named-content content-type="fundref-id">10.13039/100016764</named-content></contract-sponsor>
<counts>
<fig-count count="5"/>
<table-count count="4"/>
<equation-count count="4"/>
<ref-count count="62"/>
<page-count count="14"/>
<word-count count="0"/>
</counts>
</article-meta>
</front>
<body>
<sec id="S1">
<title>Introduction</title>
<p>Epigenetic reprogramming in cancer genomes creates a distinct DNA methylation landscape encompassing clustered sites of hypermethylation at regulatory regions and protein-coding genes separated by long intergenic tracks of hypomethylated regions. Such changes in DNA methylation landscape are displayed by most cancer types, and hence have the potential to serve as a universal cancer biomarker (<xref ref-type="bibr" rid="B54">Sina et al., 2018</xref>; <xref ref-type="bibr" rid="B34">Oliver et al., 2021</xref>). Previous research has focused on the biological consequences of DNA methylation changes in genomes, whereas its impact on the structure and flexibility of DNA, and its vulnerability to modifications/repair/replication in cancer, have remained largely unexplored.</p>
<p>Other prominent features of cancer initiation and progression are genomic alterations. Cancer genomes harbor numerous genomic alterations, including hundreds/thousands of nucleotide sequence variations (<xref ref-type="bibr" rid="B57">Stratton et al., 2009</xref>; <xref ref-type="bibr" rid="B41">Roberts and Gordenin, 2014</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>). A prominent fraction of these mutations arises as a consequence of the off-target activity of enzymes participating in somatic hypermutation (SHM) in immunoglobulin (Ig) genes: DNA/RNA editing cytosine deaminases of the Activation Induced Deaminase (AID)/APOBEC family and the replication/repair of edited sites by DNA polymerases (pols), as deduced by the analysis of the DNA sequence context of mutations in different cancer tissues (<xref ref-type="bibr" rid="B2">Alexandrov et al., 2013</xref>; <xref ref-type="bibr" rid="B41">Roberts and Gordenin, 2014</xref>; <xref ref-type="bibr" rid="B58">Swanton et al., 2015</xref>; <xref ref-type="bibr" rid="B21">Granadillo Rodriguez et al., 2020</xref>). Analyses of various types of cancer by means of this technique have yielded a set of 30&#x2013;50 distinct mutation signatures implying many mechanisms of hypermutation in cancer cells (<xref ref-type="bibr" rid="B3">Alexandrov and Stratton, 2014</xref>; <xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>; <xref ref-type="bibr" rid="B24">Islam and Alexandrov, 2021</xref>).</p>
<p>There is a well-established association between DNA methylation and genomic alteration. Early studies revealed that methylated cytosines explain mutation hotspots in bacteria (<xref ref-type="bibr" rid="B13">Coulondre et al., 1978</xref>). In eukaryotic genomes, CpG sites are known to be vulnerable to mutation in both cancer and normal cells (<xref ref-type="bibr" rid="B12">Cooper and Youssoufian, 1988</xref>; <xref ref-type="bibr" rid="B4">Als&#x00F8;e et al., 2017</xref>; <xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>; <xref ref-type="bibr" rid="B8">Brinkman et al., 2019</xref>). We recently detected a substantial excess of mutations in CpG sites that overlap with AID mutable motifs (WR<underline>C</underline>/<underline>G</underline>YW, W = A or T, R = A or G, Y = T or C, the mutable position is underlined) forming &#x201C;hybrid&#x201D; mutable motifs (WR<underline>C</underline>G/C<underline>G</underline>YW) whereas the opposite trend was observed in SHM (<xref ref-type="bibr" rid="B43">Rogozin and Diaz, 2004</xref>; <xref ref-type="bibr" rid="B46">Rogozin et al., 2016</xref>). This finding implies that in many cancers the SHM machinery acts aberrantly at genomic sites containing methylated cytosine. The discovery of abundant mutations in WRCG/CGYW motifs in many types of human cancer suggests that AID-mediated, CpG methylation-dependent mutagenesis is a common feature of tumorigenesis connecting methylation and hypermutation (<xref ref-type="bibr" rid="B46">Rogozin et al., 2016</xref>).</p>
<p>A prominent feature of carcinogenesis is the presence of cancer driver and passenger mutations. A driver mutation directly or indirectly confers a selective advantage upon cancer cells, whilst a passenger mutation does not (<xref ref-type="bibr" rid="B57">Stratton et al., 2009</xref>). In this context, it should be appreciated that there is a difference between a driver gene and a driver gene mutation: a driver gene may accumulate recurrent driver mutations but may also harbor passenger mutations. Some genes contain only recurrent passenger mutations with frequencies comparable to driver genes (hotspots related to the intrinsic properties of the processes of mutagenesis), which complicates the identification of cancer driver mutations (<xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>). In this study, we operationally define a non-driver gene as a gene that contains numerous mutations that do not cause cancer and are classified as passenger mutations according to the MutaGene (<xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B9">Brown et al., 2019</xref>) and CHASMplus (<xref ref-type="bibr" rid="B59">Tokheim and Karchin, 2019</xref>) computational tools.</p>
<p>We studied the association of mutable motifs produced by AID and two error-prone DNA pols ultimately associated with cancer, and the methylation status of sets of driver and non-driver genes. Our working hypothesis was that driver and non-driver genes would have contrasting methylation and mutation profiles, which could be studied using mutable motifs (<xref ref-type="bibr" rid="B46">Rogozin et al., 2016</xref>). The conventional method for the analysis of mutable DNA motifs is the consensus approach (<xref ref-type="bibr" rid="B3">Alexandrov and Stratton, 2014</xref>), for example, 5&#x2032;WR<underline>C</underline> for the AID enzyme (<xref ref-type="bibr" rid="B37">Pham et al., 2011</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>) or 5&#x2032;W<underline>A</underline> for DNA pol &#x03B7; (<xref ref-type="bibr" rid="B47">Rogozin et al., 2001</xref>, <xref ref-type="bibr" rid="B45">2018b</xref>). Here, we applied the weight matrix (sequence profile) approach that is frequently used in the analysis of biological processes (<xref ref-type="bibr" rid="B49">Rogozin et al., 2019</xref>) to the analysis of methylation profiles and mutagenesis generated by AID and error-prone DNA pols &#x03B7; and &#x03B8; in CpG dinucleotides. Control experiments, using shuffled sites and SHM in immunoglobulin genes, suggested that the weight matrix method adds power to the study of mutagenesis. Analysis of mutations in various cancers indicated that AID and DNA pol &#x03B7; mutable motifs almost universally correlate with SHM in G:C sites. Analysis of mutations and motifs in A:T sites yielded a similar correlation for pol &#x03B8;. Analysis of methylation data in malignant lymphomas (the MALY-DE dataset) suggested that the methylation status of driver genes differs from that of non-driver genes and this may be one reason for the differences in distribution of mutations in the two groups of genes.</p>
</sec>
<sec id="S2" sec-type="materials|methods">
<title>Materials and Methods</title>
<sec id="S2.SS1">
<title>Mutable Motif Construction Using Weight Matrices</title>
<p>Several approaches have been developed for the analysis of a set of mutated genomic sequences (<xref ref-type="bibr" rid="B56">Staden, 1984</xref>; <xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>, <xref ref-type="bibr" rid="B49">2019</xref>). A mononucleotide weight matrix is a simple and straightforward way to present the structure of a functional signal and to calculate weights for the signal sequence (<xref ref-type="bibr" rid="B18">Gelfand, 1995</xref>). Each matrix W(b,j) (nucleotide b = A, T, G, or C in a position j) includes information on a normalized frequency of A, T, G, and C bases in each of the six positions surrounding detected sites of mutation (3 bases downstream and 3 bases upstream; <xref ref-type="fig" rid="F1">Figure 1</xref>; corresponding raw numbers are shown in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 1</xref>). We calculated the weight matrices for the two studied DNA polymerases and used a collection of mutations generated by classic gap-filling DNA synthesis <italic>in vitro</italic> by human pols &#x03B7; and &#x03B8; (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>; <xref ref-type="bibr" rid="B47">Rogozin et al., 2001</xref>; <xref ref-type="bibr" rid="B5">Arana et al., 2008</xref>) (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Nucleotide frequency matrices for mutations at A:T sites [<bold>(A)</bold> DNA pol &#x03B7;; <bold>(B)</bold> pol &#x03B8;] and G:C sites [<bold>(C)</bold> pol &#x03B8;; <bold>(D)</bold> DNA pol &#x03B7;]. Known mutable motifs (consensus sequences) (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>; <xref ref-type="bibr" rid="B47">Rogozin et al., 2001</xref>) are shown below each matrix in bold, whereas mutable positions are underlined. Putative (previously unobserved) parts of mutable motifs and potentially informative positions are italicized, W = A or T; Y = T or C; B = T, C, or G; D = A, T, or G. Source of data: <xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>.</p></caption>
<graphic xlink:href="fgene-12-671866-g001.tif"/>
</fig>
<p>The following formula for W(b,j) was used for data analysis: W(b,j) = log<sub>2</sub> [f(b,j) / e(b)], where f(b,j) is the observed frequency of the nucleotide b in position j and e(b) is the expected frequency of the nucleotide b calculated as the mean nucleotide frequencies of positions &#x2013;5, &#x2013;4, +4, +5 for the sites of mutation in the target sequence; the resulting W(b,j) matrices are shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<p>The matching score S<sub>(b1,&#x2026;,bL)</sub> of a sequence b1,&#x2026;,bL is:</p>
<disp-formula id="S2.Ex1">
<mml:math id="M1">
<mml:mrow>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mtext>b</mml:mtext>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mtext>bL</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mtext>j</mml:mtext>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>L</mml:mtext>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mtext>W</mml:mtext>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>j</mml:mtext>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The matching score between sequence b1,&#x2026;,bL and a weight matrix can be further expressed as a percentage:</p>
<disp-formula id="S2.Ex2">
<mml:math id="M2">
<mml:mrow>
<mml:mo>%</mml:mo>
<mml:mpadded width="+2.8pt">
<mml:mi>matching</mml:mi>
</mml:mpadded>
<mml:mi>score</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo rspace="5.3pt">&#x00D7;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mtext>b</mml:mtext>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mtext>bL</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mi>min</mml:mi>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mi>max</mml:mi>
</mml:msub>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mi>min</mml:mi>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex3">
<mml:math id="M3">
<mml:mrow>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>min</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mtext>j</mml:mtext>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>L</mml:mtext>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:munder>
<mml:mtext>MIN</mml:mtext>
<mml:mrow>
<mml:mtext>b</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>W</mml:mtext>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex4">
<mml:math id="M4">
<mml:mrow>
<mml:msub>
<mml:mtext>S</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>max</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mtext>j</mml:mtext>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>L</mml:mtext>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:munder>
<mml:mtext>MAX</mml:mtext>
<mml:mrow>
<mml:mtext>b</mml:mtext>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>W</mml:mtext>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mtext>b</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Hereafter, we use the term &#x201C;weight&#x201D; instead of &#x201C;% matching score.&#x201D; We used positions &#x2013;3 : +3 to estimate the weights of sites.</p>
</sec>
<sec id="S2.SS2">
<title>ICGC/TCGA Mutation Datasets</title>
<p>Somatic mutation data from the ICGC and TCGA cancer genome projects were extracted from the Sanger COSMIC Whole Genome Project v75.<sup><xref ref-type="fn" rid="footnote1">1</xref></sup> The ICGC/TCGA datasets are almost exclusively passenger mutations and, as such, they are unlikely to be subject to selection in the context of promoting cellular proliferation. Indeed, they are much more likely to reflect unselected mutational spectra (<xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>). The tissues and cancer types were defined according to the primary tumor site and the cancer project in question. This dataset is included in the MutaGene package, where it is described in detail (<xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B9">Brown et al., 2019</xref>). We also used collections of mutations obtained by means of <italic>in vitro</italic> experiments for human pol &#x03B7; (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>) and pol &#x03B8; (<xref ref-type="bibr" rid="B5">Arana et al., 2008</xref>; <xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>) to build weight matrices.</p>
</sec>
<sec id="S2.SS3">
<title>Methylation and Expression Data</title>
<p>For the analysis of the association between somatic mutations, mRNA expression, mutable motifs and methylation, datasets for 26 patients with malignant lymphoma<sup><xref ref-type="fn" rid="footnote2">2</xref></sup> were used. In the analyzed datasets, the methylation data for all patients were pooled together. Each position was characterized by the methylated/unmethylated read count and the methylation ratio (the number of methylated reads divided by the total number of reads overlapping this position and multiplied by 100). Only positions with more than nine associated reads were included in the analysis. The major methodological problem inherent in the analysis of methylation across CpG&#x2019;s is the absence of control sets. Therefore, we compared methylation values below and above threshold values (25 and 75%). The mean weight of mutable motifs (<xref ref-type="fig" rid="F1">Figure 1</xref>) in the positions of methylated CpG&#x2019;s (the group 1 with the size S1, <xref ref-type="fig" rid="F2">Figure 2</xref>) was compared to the mean weight of the same motifs in a contrasting dataset (the group 2 with the size S2, <xref ref-type="fig" rid="F2">Figure 2</xref>) using the <italic>t</italic>-test (2-tailed test) and Monte Carlo test (MC, 1-tailed test) similar to the consensus method as previously described (<xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>). Expression of mRNA was measured using the FPKM values (<xref ref-type="bibr" rid="B23">Howe et al., 2011</xref>). The mean and variance for each gene were calculated across 26 studied samples.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>Schematic representation of the procedure used for construction of <xref ref-type="table" rid="T3">Table 3</xref>. Each circle represents a methylated CpG site, with its size reflecting the methylation level. Red &#x201C;X&#x201D; denotes CpG sites that overlap with known mutable motifs. The left and right panels correspond to thresholds 25% and 75%. The left panel: The set &#x201C;1&#x201D; (the methylation levels are smaller than 25%) is compared to set &#x201C;2&#x201D; (the methylation levels are larger than 25%). The right panel: The set &#x201C;1&#x201D; (the methylation levels are larger than 75%) is compared to set &#x201C;2&#x201D; (the methylation levels are smaller than 75%).</p>
</caption>
<graphic xlink:href="fgene-12-671866-g002.tif"/>
</fig>
</sec>
<sec id="S2.SS4">
<title>Analysis of Mutations</title>
<p>DNA sequences surrounding the mutated nucleotides represent the mutation context. We compared the frequency of known mutable motifs for somatic mutations with the frequency of these motifs in the vicinity of the mutated nucleotide. Specifically, for each base substitution, the 121 bp sequence centered at the mutation was extracted (the DNA neighborhood). We used only the nucleotides immediately flanking the mutations because DNA repair/replication enzymes are thought to scan a very limited region of DNA (<xref ref-type="bibr" rid="B42">Roberts et al., 2013</xref>; <xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>). This approach does not exclude any specific area of the genome, but rather uses the areas within each sample where mutagenesis has occurred (considering the variability in the mutation rate across the human genome) (<xref ref-type="bibr" rid="B42">Roberts et al., 2013</xref>; <xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>). A schematic representation of this procedure for CpG dinucleotides is shown in <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 4</xref>). Here, the mean weight of mutable motifs (represented by weight matrices; <xref ref-type="fig" rid="F1">Figure 1</xref>) in the positions of each somatic mutation (in C/G or A/T positions) was compared to the mean weight of mutable motifs in C/G or A/T positions without mutations in the DNA neighborhood (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 4</xref>) using the <italic>t</italic>-test (2-tailed test) and Monte Carlo test (MC, 1-tailed test) similar to the consensus method, as previously described (<xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>). The MC test is based on the random sampling from the group 2. In total, 10,000 groups with size S1 have been generated. 
The fraction of generated groups with mean weights larger than or equal to the mean value of sample 1 is the P value.</p>
<p>In addition to analyses of the derived mutable motifs in cancer genomes, we performed a control experiment: we randomly shuffled a dataset of sequences surrounding the mutations in the studied target sequences (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>) keeping position 6 (the position of mutations) intact. Each sequence was shuffled separately; thus, the overall base composition and the base compositions of each sequence were the same. Weight matrices were derived from these shuffled sequences, and the sampling procedure was repeated 1,000 times.</p>
</sec>
<sec id="S2.SS5">
<title>Detection of Driver and Non-driver Genes</title>
<p>In this study, we used two independent methods to predict the driver status of cancer mutations: the MutaGene (<xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B9">Brown et al., 2019</xref>) and Chasmplus (<xref ref-type="bibr" rid="B59">Tokheim and Karchin, 2019</xref>). These methods showed top performance on a recent benchmarking set (<xref ref-type="bibr" rid="B9">Brown et al., 2019</xref>). MutaGene is a probabilistic approach which adjusts the number of mutation recurrences in patients by means of a cancer-type specific background mutation model. The MutaGene driver mutation prediction method has not been explicitly trained on any particular set of mutations. The background models estimate the probability of obtaining a codon substitution from the underlying processes of mutagenesis. We used two MutaGene background models: one was derived from pan-cancer mutational data (&#x201C;Pancancer&#x201D; model in MutaGene) whereas the other was constructed directly from the MALY-DE mutational data since this cancer-specific model was not present in the MutaGene database of background models. As a result, two ranking lists of driver mutations were produced for three types of mutation: missense, nonsense and silent. Chasmplus is a machine learning method that was trained using somatic mutations from TCGA. Since no cancer-specific model was available for MALY-DE, we used pan-cancer predictions while running Chasmplus. Then we merged the predictions produced by the three different models/methods and reported only those mutations as drivers which were predicted to be &#x201C;drivers&#x201D; or &#x201C;potential drivers&#x201D; by MutaGene and had a Chasmplus score cutoff larger than 0.5. <xref ref-type="supplementary-material" rid="SM2">Supplementary File 1</xref> shows recurrent driver and passenger mutations.</p>
<p>Predicted driver mutations satisfy at least two of the above-mentioned criteria of driver mutations (<xref ref-type="supplementary-material" rid="SM3">Supplementary File 2</xref>). Predicted passenger mutations must satisfy all criteria of passenger mutations. Since Chasmplus does not generate predictions for nonsense and silent mutations, only predictions for missense mutations were reported. In addition, some mutations/genes were not reported by Chasmplus because it excluded them from the list of potential cancer driver genes. In this study, we defined driver genes in the following way: a driver gene must have at least one recurrent driver mutation but may also possess recurrent passenger mutations (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>). Some genes contain only recurrent passenger mutations with frequencies comparable to driver genes. In this study, we defined a non-driver gene operationally as a gene that only contains recurrent mutations that are not associated with the process of tumorigenesis and hence are classified as passenger mutations (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 2</xref>).</p>
</sec>
</sec>
<sec id="S3">
<title>Results</title>
<sec id="S3.SS1">
<title>Weight Matrices Are Powerful Descriptors of Mutable Motifs</title>
<p>Weight matrices constitute a novel technique when applied to the description of preferential mutable motifs. It was shown to be a robust and precise technique to describe AID/APOBEC mutable motifs in cancer cells (<xref ref-type="bibr" rid="B49">Rogozin et al., 2019</xref>). The weight matrices include information on the frequency of A, T, G, and C bases in each of the ten positions surrounding the sites of mutation (5 bases downstream and 5 bases upstream). AID, DNA pol &#x03B7; and pol &#x03B8; are involved in SHM in immunoglobulin genes (<xref ref-type="bibr" rid="B40">Revy et al., 2000</xref>; <xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>; <xref ref-type="bibr" rid="B35">Pavlov et al., 2002</xref>; <xref ref-type="bibr" rid="B62">Zan et al., 2005</xref>; <xref ref-type="bibr" rid="B33">Neuberger and Rada, 2007</xref>; <xref ref-type="bibr" rid="B5">Arana et al., 2008</xref>; <xref ref-type="bibr" rid="B6">Bhattacharya et al., 2008</xref>), although this role for both polymerases has been questioned (<xref ref-type="bibr" rid="B15">D&#x00F6;rner and Lipsky, 2001</xref>; <xref ref-type="bibr" rid="B28">Martomo et al., 2008</xref>).</p>
<p>In this study, we started from the construction of weight matrices for both DNA pols. It should be noted that we previously derived weight matrices using collections of mutations induced by AID/APOBEC deaminases in yeast genomes (<xref ref-type="bibr" rid="B49">Rogozin et al., 2019</xref>). For human DNA pols &#x03B7; and &#x03B8;, such collections are not available. Therefore, we used a collection of mutations generated by human pols &#x03B7; and &#x03B8; during classic gap-filling DNA synthesis <italic>in vitro</italic> (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>; <xref ref-type="bibr" rid="B47">Rogozin et al., 2001</xref>; <xref ref-type="bibr" rid="B35">Pavlov et al., 2002</xref>) (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>). Constructed matrices of nucleotide frequencies are shown in <xref ref-type="fig" rid="F1">Figure 1A&#x2013;D</xref> (corresponding raw numbers are shown in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 1</xref>). Pols &#x03B7; and &#x03B8; exhibit known DNA context features for mutations in A:T sites. W (A or T) or A in the position &#x2013;1 (<xref ref-type="fig" rid="F1">Figures 1A,B</xref>) was the most prominent feature of A:T mutations produced by pol &#x03B7; and pol &#x03B8;, respectively. We cannot exclude the possibility that some other previously undetected positions may contribute to the mutable motifs, for example, a higher frequency of Y (T or C) in position &#x2013;2 or a lower frequency of G may be additional features of the pol &#x03B7; mutable motif (<xref ref-type="fig" rid="F1">Figure 1A</xref>).</p>
<p>By contrast, pols &#x03B7; and &#x03B8; exhibit dissimilar DNA context features for mutations at G:C sites. A characteristic feature of pol &#x03B8; is an elevated frequency of C at position &#x2013;1 and a lower frequency of C at position &#x2013;2 (<xref ref-type="fig" rid="F1">Figure 1C</xref>). Thus, pol &#x03B8; tends to produce more errors in the DC<underline>G</underline> nucleotide context (D = A or T or G). Pol &#x03B7; appears to have a different DNA mutational context with an excess of C in position +1 (<xref ref-type="fig" rid="F1">Figure 1D</xref>). In general, it is hard to confidently delineate mutable motifs of either DNA polymerase using the consensus approach owing to the lack of objective inclusion criteria for position-specific context features to mutable motifs (<xref ref-type="fig" rid="F1">Figure 1</xref>). Thus, the weight matrix approach, which utilizes information contained in all studied positions, is likely to be a more straightforward way to describe the polymerase &#x03B7; and &#x03B8; mutable motifs than the consensus approach.</p>
<p>We also compared the nucleotide composition of sequences surrounding positions of mutations (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 1</xref>) for pols &#x03B7; and &#x03B8; using the &#x03C7;<sup>2</sup> test. We found that these pols were significantly different with respect to the DNA sequence context of mutation sites expressed in the form of nucleotide frequency matrices (A:T sites: &#x03C7;<sup>2</sup> = 155.0, df = 40, <italic>P</italic> = 1.9 &#x00D7; 10<sup>&#x2013;15</sup>; G:C sites: &#x03C7;<sup>2</sup> = 82.2, df = 40, <italic>P</italic> = 0.00007). Thus, DNA pol &#x03B7; and pol &#x03B8; differ significantly in terms of the features of the DNA sequence context of mutations. This result is consistent with the different context properties of pols &#x03B7; and &#x03B8; (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
</sec>
<sec id="S3.SS2">
<title>Footprints of pol &#x03B7; and pol &#x03B8; Correlate With the Somatic Mutational Spectrum in Many Cancer Types</title>
<p>Previously, we demonstrated using the consensus approach that mutagenesis by AID is likely modulated by the (de)methylation and/or translesion synthesis (TLS) of CpG dinucleotides in follicular lymphomas and many other cancers (<xref ref-type="bibr" rid="B46">Rogozin et al., 2016</xref>). Based on analyses of mutations in CpG dinucleotides in skin cancer cells and normal cells, it was also suggested that pol &#x03B7; mutagenesis might correlate with the methylation of CpG dinucleotides in cancer cells (<xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>). The weight matrix approach and the MALY-DE datasets (CpG methylation spectra and somatic mutations, see Materials and Methods) allow us to perform further analyses of the role of AID and error-prone polymerases in mutagenesis, and to see how it is affected by (de)methylation.</p>
<p>We examined the correlation between the nucleotide sequence context of somatic mutations in cancers and pol &#x03B7; and pol &#x03B8; mutable motifs found after <italic>in vitro</italic> DNA synthesis. A correlation was inferred when the results of two statistical tests (Monte Carlo test and <italic>t</italic>-test) were significant at <italic>P</italic> &#x003C; 0.05. AID has already been studied using the consensus motif WR<underline>C</underline>/<underline>G</underline>YW and weight matrices and has been shown to be one of the most ubiquitous contributors to mutations in various cancer types according to its characteristic mutable motif (the AID weight matrix) (<xref ref-type="bibr" rid="B49">Rogozin et al., 2019</xref>). Analysis of pol-generated mutations in G:C sites revealed that both mutation motifs are almost universally correlated with the nucleotide context of somatic mutations in G:C sites (<xref ref-type="fig" rid="F3">Figure 3</xref>). Similar analysis of A:T site mutations also revealed correlations for pol &#x03B7;. A significant correlation with pol &#x03B8; was documented only for a few cancer cases. This difference may reflect a more specialized role for pol &#x03B8; in DNA transactions on methylated CpG&#x2019;s (<xref ref-type="bibr" rid="B61">Wood and Doubli&#x00E9;, 2016</xref>; <xref ref-type="bibr" rid="B7">Brambati et al., 2020</xref>). It is also possible that pol &#x03B8; is expressed in only a few cancers. Pol &#x03B7; probably plays a more widespread, although not particularly pronounced, role in causing mutations in cancer according to its characteristic weight matrix in various cancer types; this is consistent with our previous study where we used the consensus sequence W<underline>A</underline> (<xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Correlation of pol &#x03B7; (eta) and &#x03B8; (theta) mutable motifs and the sequence context of somatic mutations. For the actual data, see <xref ref-type="supplementary-material" rid="SM1">Supplementary Tables 3</xref>, <xref ref-type="supplementary-material" rid="SM1">4</xref>. The intensities of the gray color correspond to the <italic>t</italic>-test values (<xref ref-type="supplementary-material" rid="SM1">Supplementary Tables 3</xref>, <xref ref-type="supplementary-material" rid="SM1">4</xref>). The unweighted pair group method, with arithmetic mean (UPGMA) clustering of ratio values for the pol &#x03B7; and &#x03B8; footprints and tissues, is shown to the left and top. The upper left panel shows the distribution of the studied <italic>t</italic>-test values and correspondence of the <italic>t</italic>-test values and color intensity (the darker colors correspond to the higher correlation values). A similar plot of ratio values (the ratio being the mean weight of mutated sites divided by the mean weight of non-mutated sites) is shown in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 5</xref>.</p></caption>
<graphic xlink:href="fgene-12-671866-g003.tif"/>
</fig>
</sec>
<sec id="S3.SS3">
<title>Control Experiments</title>
<p>The <italic>in vitro</italic> collections of mutations that were used to reconstruct weight matrices for pol &#x03B7; and pol &#x03B8; (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>; <xref ref-type="bibr" rid="B5">Arana et al., 2008</xref>) are relatively small (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>). Thus, control experiments were important to analyze the quality of the derived weight matrices. We previously demonstrated that analyses of the association between the matrices of shuffled sites of mutation and the nucleotide context of somatic mutation in various cancer cell types is a reliable approach to estimate the impact of the accuracy of association prediction (<xref ref-type="bibr" rid="B49">Rogozin et al., 2019</xref>). Analysis of 16 types of cancer (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 5</xref>) suggested that the AID weight matrix is less prone to errors of prediction compared to pol &#x03B7;/pol &#x03B8; (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 5</xref>). Only a few types of cancer have a low level of prediction errors. Fortunately, for our study of MALY-DE sets, &#x201C;Blood&#x201D; tissue, GCB lymphomas (from the COSMIC database) and MALY-DE malignant lymphomas have extremely low rates of false prediction (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 5</xref>). Therefore, we opted to use the derived matrices for further analysis of the MALY-DE datasets.</p>
<p>Analysis of somatic mutations in immunoglobulin genes can be used to estimate the prediction accuracy because the context of mutations in human immunoglobulin genes is known to correlate strongly with AID and pol &#x03B7; mutable motifs (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>). Thus, these mutations can be used as a control dataset as performed previously (<xref ref-type="bibr" rid="B49">Rogozin et al., 2019</xref>). A significant association between the AID mutable motif and mutations was found in all three studied somatic mutation datasets (<xref ref-type="bibr" rid="B31">Milstein et al., 1998</xref>; <xref ref-type="bibr" rid="B30">Mayorov et al., 2005</xref>; <xref ref-type="table" rid="T1">Table 1</xref>), confirming that the AID weight matrix is a reliable descriptor of AID-induced mutagenesis. The pol &#x03B7; weight matrices revealed a significant association for all studied cases except xeroderma pigmentosum variant (XPV) patients where pol &#x03B7; is inactive (<xref ref-type="table" rid="T1">Table 1</xref>; <xref ref-type="bibr" rid="B30">Mayorov et al., 2005</xref>). Pol &#x03B8; matrices yielded significant results for some studied cases (<xref ref-type="table" rid="T1">Table 1</xref>). This is consistent with the hypothesis that pol &#x03B8; is also involved in SHM (<xref ref-type="bibr" rid="B5">Arana et al., 2008</xref>). The results of both control experiments suggested that the weight matrix approach is adequate to study the mutational spectra of DNA polymerases.</p>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Correlation between the sequence context of somatic mutations and mutable motifs in fragments of human immunoglobulin genes.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Locus</td>
<td valign="top" align="center">Test</td>
<td valign="top" align="center">Number of Mutations</td>
<td valign="top" align="center">AID/G:C</td>
<td valign="top" align="center">Pol &#x03B7;/G:C</td>
<td valign="top" align="center">Pol &#x03B8;/G:C</td>
<td valign="top" align="center">Number of Mutations</td>
<td valign="top" align="center">Pol &#x03B7;/A:T</td>
<td valign="top" align="center">Pol &#x03B8;/A:T</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">V<sub><italic>H</italic></sub>26</td>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">583</td>
<td valign="top" align="center">1.208</td>
<td valign="top" align="center">1.027</td>
<td valign="top" align="center">1.091</td>
<td valign="top" align="center">351</td>
<td valign="top" align="center">1.082</td>
<td valign="top" align="center">0.979</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td/>
<td valign="top" align="center"><bold>13.1&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center"><bold>5.9&#x002A;</bold></td>
<td/>
<td valign="top" align="center"><bold>5.3&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">MC test</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.004</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.699</td>
</tr>
<tr>
<td valign="top" align="left">J<sub><italic>H</italic></sub>4 intron, control individuals</td>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">177</td>
<td valign="top" align="center">1.341</td>
<td valign="top" align="center">1.05</td>
<td valign="top" align="center">1.029</td>
<td valign="top" align="center">95</td>
<td valign="top" align="center">1.041</td>
<td valign="top" align="center">1.032</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td/>
<td valign="top" align="center"><bold>12.3&#x002A;</bold></td>
<td valign="top" align="center"><bold>2.8&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
<td/>
<td valign="top" align="center"><bold>2.4&#x002A;</bold></td>
<td valign="top" align="center"><bold>2.2&#x002A;</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">MC test</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.002</td>
<td valign="top" align="center">0.106</td>
<td/>
<td valign="top" align="center">0.004</td>
<td valign="top" align="center">0.011</td>
</tr>
<tr>
<td valign="top" align="left">J<sub><italic>H</italic></sub>4 intron, XP-V patients</td>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">227</td>
<td valign="top" align="center">1.278</td>
<td valign="top" align="center">1.009</td>
<td valign="top" align="center">1.011</td>
<td valign="top" align="center">25</td>
<td valign="top" align="center">0.957</td>
<td valign="top" align="center">0.98</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td/>
<td valign="top" align="center"><bold>9.9&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center">NSE</td>
<td/>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center">NSE</td>
</tr>
<tr>
<td/>
<td valign="top" align="center">MC test</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.329</td>
<td valign="top" align="center">0.061</td>
<td/>
<td valign="top" align="center">0.776</td>
<td valign="top" align="center">0.67</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<attrib><italic>&#x201C;Ratio&#x201D; is the mean weight of mutated sites divided by the mean weight of non-mutated sites.</italic></attrib>
<attrib><italic>NSE (no significant excess) indicates the absence of a significant excess of mutations in mutable motifs, suggesting there to be no association between mutagenesis and mutable motifs. The significance of any excess was measured using the Student <italic>t</italic> and Monte Carlo (MC) tests. The asterisk (&#x002A;) denotes that the corresponding <italic>P</italic> &#x003C; 0.01; this is a conservative estimate of the critical overall value of the <italic>t</italic>-test having allowed for multiple testing by means of the Bonferroni correction (5 comparisons).</italic></attrib>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="S3.SS4">
<title>Analysis of Driver and Non-driver Genes</title>
<p>Analysis of driver/passenger mutations is known to be a powerful approach in cancer genomics and can even be diagnostic of various cancers (<xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B9">Brown et al., 2019</xref>; <xref ref-type="bibr" rid="B59">Tokheim and Karchin, 2019</xref>; <xref ref-type="bibr" rid="B14">Dietlein et al., 2020</xref>). We derived lists of recurrent driver and non-driver mutations using three computational approaches (see section &#x201C;Materials and Methods&#x201D;). We define driver genes as those genes that accumulate recurrent driver mutations, but which may also possess recurrent passenger mutations (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>). Some genes contain only recurrent passenger mutations with frequencies comparable to driver genes; in this study, we defined a non-driver gene operationally as a gene that only contains recurrent passenger mutations (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 2</xref>).</p>
<p>Final lists of operationally defined driver and non-driver genes are shown in <xref ref-type="supplementary-material" rid="SM1">Supplementary Tables 1</xref>, <xref ref-type="supplementary-material" rid="SM1">2</xref> (we used the ENSEMBL IDs, as recommended by the DAVID Bioinformatics Resources web site, <ext-link ext-link-type="uri" xlink:href="https://david.ncifcrf.gov/">https://david.ncifcrf.gov/</ext-link>). The total numbers of driver and non-driver genes are 134 and 210, respectively. We performed pathway/keyword enrichment analyses (<xref ref-type="bibr" rid="B27">Luque-Baena et al., 2014</xref>; <xref ref-type="bibr" rid="B60">Wang et al., 2014</xref>; <xref ref-type="bibr" rid="B55">Soldatos et al., 2015</xref>) using the DAVID web site (<xref ref-type="bibr" rid="B25">Jiao et al., 2012</xref>). Results are shown in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 6</xref>. Keywords &#x201C;methylation,&#x201D; &#x201C;nuclear chromatin,&#x201D; and numerous pathways/terms associated with various types of cancer are consistent with properties of GCB lymphomas (<xref ref-type="bibr" rid="B22">Green et al., 2015</xref>; <xref ref-type="bibr" rid="B46">Rogozin et al., 2016</xref>). The KEGG pathway &#x201C;pathways in cancer&#x201D; (<italic>P</italic> = 0.025) is another important descriptor of the driver gene list (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 6</xref>). In general, the driver gene set appears to be highly informative and contains many features expected for cancer-related genes (<xref ref-type="bibr" rid="B22">Green et al., 2015</xref>) (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 6</xref>). By contrast, analysis of non-driver genes yielded only a few significant results with no obvious functional associations with cancer (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 6</xref>).</p>
<p>There is a significant association of the AID mutable motif with somatic mutations in all genes, as well as in driver and non-driver genes (<xref ref-type="table" rid="T2">Table 2</xref>) suggesting that AID plays an important role in mutagenesis in cancer genomes; there are several pathways that can explain this process (<xref ref-type="fig" rid="F4">Figure 4</xref>). Analysis of association between pols &#x03B7; and &#x03B8; mutable motifs and somatic mutations detected a difference between driver and non-driver genes: mutable motifs in G:C pairs of pols &#x03B7; and &#x03B8; correlate with somatic mutations in non-driver genes only. There was no correlation with pol &#x03B7; mutations at A:T pairs, whereas the pattern of somatic mutation correlated with pol &#x03B8; at A:T sites both in driver and non-driver genes (<xref ref-type="table" rid="T2">Table 2</xref>). These observations indicate that the contribution of different pathways of generation of mutations in cancers (<xref ref-type="fig" rid="F4">Figure 4</xref>) is distinct for AID, pol &#x03B7; and pol &#x03B8;.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Correlation between mutable motifs and the sequence context of somatic mutations in driver and non-driver genes.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Group of genes</td>
<td valign="top" align="center">Test</td>
<td valign="top" align="center">Number of G:C mutations</td>
<td valign="top" align="center">AID/G:C</td>
<td valign="top" align="center">Pol &#x03B7;/G:C</td>
<td valign="top" align="center">Pol &#x03B8;/G:C</td>
<td valign="top" align="center">Number of A:T mutations</td>
<td valign="top" align="center">Pol &#x03B7;/A:T</td>
<td valign="top" align="center">Pol &#x03B8;/A:T</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">All genes</td>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">137,775</td>
<td valign="top" align="center">1.021</td>
<td valign="top" align="center">1.005</td>
<td valign="top" align="center">1.091</td>
<td valign="top" align="center">145,768</td>
<td valign="top" align="center">0.992</td>
<td valign="top" align="center">1.011</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td/>
<td valign="top" align="center"><bold>23.4&#x002A;</bold></td>
<td valign="top" align="center"><bold>7.2&#x002A;</bold></td>
<td valign="top" align="center"><bold>23.0&#x002A;</bold></td>
<td/>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center"><bold>15.8&#x002A;</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">MC test</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">Drivers</td>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">4,246</td>
<td valign="top" align="center">1.107</td>
<td valign="top" align="center">1.001</td>
<td valign="top" align="center">1.007</td>
<td valign="top" align="center">3,918</td>
<td valign="top" align="center">0.98</td>
<td valign="top" align="center">1.032</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td/>
<td valign="top" align="center"><bold>20.0&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center">NSE</td>
<td/>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center"><bold>7.8&#x002A;</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">MC test</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.346</td>
<td valign="top" align="center">0.037</td>
<td/>
<td valign="top" align="center">1</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">Non-drivers</td>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">3,553</td>
<td valign="top" align="center">1.079</td>
<td valign="top" align="center">1.059</td>
<td valign="top" align="center">1.057</td>
<td valign="top" align="center">2,793</td>
<td valign="top" align="center">0.995</td>
<td valign="top" align="center">1.045</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td/>
<td valign="top" align="center"><bold>14.2&#x002A;</bold></td>
<td valign="top" align="center"><bold>13.8&#x002A;</bold></td>
<td valign="top" align="center"><bold>11.7&#x002A;</bold></td>
<td/>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center"><bold>8.9&#x002A;</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="center">MC test</td>
<td/>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td/>
<td valign="top" align="center">0.874</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<attrib><italic>&#x201C;Ratio&#x201D; is the mean weight of mutated sites divided by the mean weight of non-mutated sites.</italic></attrib>
<attrib><italic>NSE (no significant excess) indicates the absence of a significant excess of mutations in mutable motifs, suggesting there to be no association between mutagenesis and mutable motifs. The significance of any excess was measured using the Student <italic>t</italic> and Monte Carlo (MC) tests. The asterisk (&#x002A;) denotes that the corresponding <italic>P</italic> &#x003C; 0.01; this is a conservative estimate of the critical overall value of the <italic>t</italic>-test having allowed for multiple testing by means of the Bonferroni correction (5 comparisons).</italic></attrib>
</table-wrap-foot>
</table-wrap>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>Putative mechanism of an interplay between AID and TLS polymerases.</p></caption>
<graphic xlink:href="fgene-12-671866-g004.tif"/>
</fig>
<p>Another important feature of driver genes is a higher frequency of mutations at G:C nucleotides (4,246 and 3,918 in G:C and A:T, respectively) compared to all other genes (137,775 &#x2013; 4,246 = 133,529 and 145,786 &#x2013; 3,918 = 141,868 in G:C and A:T, respectively, <xref ref-type="table" rid="T2">Table 2</xref>) (<italic>P</italic> &#x003C; 0.0001 according to the two-tailed Fisher&#x2019;s exact test).<sup><xref ref-type="fn" rid="footnote3">3</xref></sup> A similar trend was observed for non-driver genes (<xref ref-type="table" rid="T2">Table 2</xref>, <italic>P</italic> &#x003C; 0.0001). This may be explained by a leading role for AID/APOBEC enzyme(s) that preferentially participate in mutagenesis pathways in G:C nucleotides; AID is one such enzyme (<xref ref-type="fig" rid="F4">Figure 4</xref>).</p>
</sec>
<sec id="S3.SS5">
<title>Patient-Specific Analysis of Somatic Mutations and Methylation</title>
<p>We analyzed the significance of association between AID/pol mutable motifs and the sequence context of somatic mutations for each sample (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 7</xref>). The results suggested that all studied samples have a significant association between AID/pols mutable motifs and mutation (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 7</xref>). The <italic>t</italic>-test values were similar to those in the merged dataset (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 7</xref> and <xref ref-type="table" rid="T2">Table 2</xref>). For example, <italic>t</italic>-test values for AID vary from 4.2 to 35.8 (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 7</xref>); the value for the merged dataset was estimated as 23.4 (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<p>We also analyzed the level of methylation in CpG sites for driver and non-driver genes for each sample separately. We derived profiles of methylation (methylation levels, positions, and chromosomes) across driver and non-driver genes separately. After that, pairwise correlation coefficients (Pearson&#x2019;s linear correlation coefficients CC) were estimated across all studied samples. All correlation coefficients were larger than 0.9 (the significance level &#x003C; 0.001). Plots of pairwise CC values are shown in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 6</xref>, <xref ref-type="supplementary-material" rid="SM1">7</xref>; these plots appear homogeneous (no blocks of &#x201C;high&#x201D; and &#x201C;low&#x201D; CC values that are adjacent in data matrices) (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 6</xref>, <xref ref-type="supplementary-material" rid="SM1">7</xref>).</p>
<p>These results suggest that studied patient-specific associations of mutable motifs with somatic mutations as well as patterns of methylation are homogeneous for driver and non-driver genes. Thus, we pooled patient-specific samples into merged datasets of somatic mutations and methylation profiles. This procedure is especially important for the analysis of small datasets that will be described below.</p>
</sec>
<sec id="S3.SS6">
<title>Analysis of DNA Methylation Patterns of Driver and Non-driver Genes Using Weight Matrices</title>
<p>The average methylation level of driver and non-driver genes was found to be approximately the same: &#x223C;78% for both sets of genes (all CpG dinucleotides in driver and non-driver genes were computationally analyzed using the MALY-DE dataset). Analysis of methylation in mutable motifs was performed using the threshold methylation values 25 and 75%. These two values were chosen arbitrarily: values of 75% (close to the average methylation level) and higher correspond to heavily methylated CpG sites. Values of 25% and smaller correspond to CpG sites that are close to the unmethylated state. Thus, values 25 and 75% reflect a dramatically different methylation status for CpG sites in the studied sets of genes (<xref ref-type="fig" rid="F2">Figure 2</xref>).</p>
<p>Let us illustrate the logic of combined analysis of methylation in mutable motifs using an example from <xref ref-type="table" rid="T3">Table 3A</xref>. For the set of driver genes and the threshold methylation value = 25%, average weights of AID mutable motifs for subsets of CpG sites with methylation values smaller than and greater than the threshold = 25% were 57.8 and 56.4, respectively. The ratio of these values is 1.025 (57.8/56.4 = 1.025) and is shown in <xref ref-type="table" rid="T3">Table 3A</xref>. This difference is statistically significant, albeit not dramatically so (<xref ref-type="table" rid="T3">Table 3</xref>). Average weights of AID mutable motifs for non-driver genes below and above the threshold = 25% are 57.7 and 56.2, accordingly. The ratio is 1.027, and this difference is also statistically significant (<xref ref-type="table" rid="T3">Table 3</xref>). These results suggest that a high frequency of AID-mutable motifs is associated with lower methylation levels in driver and non-driver genes. For pol &#x03B7; and &#x03B8;, no significant differences were detected for both driver and non-driver genes (<xref ref-type="table" rid="T3">Table 3A</xref>), suggesting that the global level of methylation of CpG sites of driver and non-driver genes for the threshold methylation level = 25% may not interfere with mutagenesis by pols &#x03B7; and &#x03B8;.</p>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Analysis of methylation in CpG sites that overlap with pols &#x03B7; and &#x03B8; mutable motifs.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Group of genes</td>
<td valign="top" align="center">Number of CpG sites <underline>below</underline> and <underline>above</underline> the threshold</td>
<td valign="top" align="center">Tests</td>
<td valign="top" align="center">AID</td>
<td valign="top" align="center">Pol &#x03B7;</td>
<td valign="top" align="center">Pol &#x03B8;</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" colspan="6"><bold>A. Levels of methylation in CpG sites that overlap with mutable motifs, with the threshold value = 25%</bold></td>
</tr>
<tr>
<td valign="top" align="left">Driver</td>
<td valign="top" align="left"/>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">1.025</td>
<td valign="top" align="center">0.997</td>
<td valign="top" align="center">0.994</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">2,867</td>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td valign="top" align="center"><bold>3.2&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center">NSE</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">149,480</td>
<td valign="top" align="center">MC test</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.772</td>
<td valign="top" align="center">0.95</td>
</tr>
<tr>
<td valign="top" align="left">Non-driver</td>
<td valign="top" align="left"/>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">1.027</td>
<td valign="top" align="center">0.993</td>
<td valign="top" align="center">0.985</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">5,558</td>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td valign="top" align="center"><bold>5.4&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center">NSE</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">239,220</td>
<td valign="top" align="center">MC test</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.989</td>
<td valign="top" align="center">0.989</td>
</tr>
<tr>
<td valign="top" align="left" colspan="6"><bold>B. Levels of methylation in CpG sites that overlap with mutable motifs, with the threshold value = 75%</bold></td>
</tr>
<tr>
<td valign="top" align="left">Driver</td>
<td valign="top" align="left"/>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">1.004</td>
<td valign="top" align="center">1.009</td>
<td valign="top" align="center">1.021</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">96,917</td>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center"><bold>7.9&#x002A;</bold></td>
<td valign="top" align="center"><bold>20.4&#x002A;</bold></td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">51,290</td>
<td valign="top" align="center">MC test</td>
<td valign="top" align="center">0.433</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
<tr>
<td valign="top" align="left">Non-driver</td>
<td valign="top" align="left"/>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">1.007</td>
<td valign="top" align="center">1.009</td>
<td valign="top" align="center">1.023</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">155,205</td>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td valign="top" align="center"><bold>4.5&#x002A;</bold></td>
<td valign="top" align="center"><bold>9.8&#x002A;</bold></td>
<td valign="top" align="center"><bold>28.6&#x002A;</bold></td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">89,573</td>
<td valign="top" align="center">MC test</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<attrib><italic>&#x201C;Ratio&#x201D; is the mean weight of mutable motifs in CpG sites with methylation values <underline>below</underline> (or <underline>above</underline>) the threshold divided by the mean weight of mutable motifs in CpG sites with methylation values <underline>above</underline> (or <underline>below</underline>) the threshold (25 or 75%, respectively) (a schematic representation of this analysis is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>). NSE (no significant excess) indicates the absence of any significant excess suggesting there to be no association between methylation and mutable motifs. The significance of an excess was measured using the Student <italic>t</italic> and Monte Carlo (MC) tests. The asterisk (&#x002A;) denotes that the corresponding <italic>P</italic> &#x003C; 0.01; this is a conservative estimate of the critical overall value of the <italic>t</italic>-test having allowed for multiple testing by means of the Bonferroni correction (3 comparisons).</italic></attrib>
</table-wrap-foot>
</table-wrap>
<p>For the threshold methylation value = 75%, we observed to some extent the opposite trend. For example, the average weights of AID-mutable motifs in driver genes for CpG sites with methylation values greater and smaller than 75% were 56.9 and 56.7, respectively. The ratio of these values is 1.004 (56.9/56.7 = 1.004) (<xref ref-type="table" rid="T3">Table 3B</xref>). This difference is not statistically significant (<xref ref-type="table" rid="T3">Table 3B</xref>). The ratio is also relatively low for the non-driver gene set although it is significant (<xref ref-type="table" rid="T3">Table 3B</xref>). Mutable motifs for both studied DNA polymerases appear to be associated with the methylation level for this threshold (heavily methylated CpG sites). These results suggest that the global level of methylation in driver genes for the heavily methylated positions may be affected by pol &#x03B7; and pol &#x03B8; transactions on methylated CpG&#x2019;s but not AID transactions. The methylation levels of non-driver genes may be affected by all studied enzymes (<xref ref-type="table" rid="T3">Table 3B</xref>).</p>
</sec>
<sec id="S3.SS7">
<title>Analysis of Somatic Mutations in CpG Sites in Driver and Non-driver Genes</title>
<p>We analyzed the level of methylation in CpG sites that coincide with positions of somatic mutation. This dataset is much smaller compared to all methylated CpG&#x2019;s (the previous section). It should be noted that the studied sets are small. However, they are still amenable to statistical analysis using the threshold = 75% (<xref ref-type="table" rid="T4">Table 4</xref>, heavily methylated CpG sites). Unfortunately, the number of mutations for the threshold = 25% (CpG sites that are close to the unmethylated state) was too small for statistical analysis: the number of mutated sites with methylation levels below 25% is 0 and 3 for driver and non-driver genes, respectively. Thus, we did not use the threshold value 25% but instead used the threshold value 75% only.</p>
<table-wrap position="float" id="T4">
<label>TABLE 4</label>
<caption><p>Levels of methylation in positions of somatic mutation in CpG sites, the threshold value = 75%.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Group of genes</td>
<td valign="top" align="center">Number of mutations in CpGs sites <underline>above</underline> and <underline>below</underline> the threshold</td>
<td valign="top" align="center">Tests</td>
<td valign="top" align="center">AID</td>
<td valign="top" align="center">Pol &#x03B7;</td>
<td valign="top" align="center">Pol &#x03B8;</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Driver</td>
<td valign="top" align="left"/>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">1.111</td>
<td valign="top" align="center">1.136</td>
<td valign="top" align="center">1.046</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">249</td>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td valign="top" align="center"><bold>2.9&#x002A;</bold></td>
<td valign="top" align="center"><bold>7.8&#x002A;</bold></td>
<td valign="top" align="center">NSE</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">52</td>
<td valign="top" align="center">MC test</td>
<td valign="top" align="center">0.004</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">0.009</td>
</tr>
<tr>
<td valign="top" align="left">Non-driver</td>
<td valign="top" align="left"/>
<td valign="top" align="center">Ratio</td>
<td valign="top" align="center">1.015</td>
<td valign="top" align="center">1.125</td>
<td valign="top" align="center">1.061</td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">390</td>
<td valign="top" align="center"><italic>t</italic>-test</td>
<td valign="top" align="center">NSE</td>
<td valign="top" align="center"><bold>7.3&#x002A;</bold></td>
<td valign="top" align="center"><bold>3.7&#x002A;</bold></td>
</tr>
<tr>
<td valign="top" align="left"/>
<td valign="top" align="center">264</td>
<td valign="top" align="center">MC test</td>
<td valign="top" align="center">0.222</td>
<td valign="top" align="center">&#x003C;0.001</td>
<td valign="top" align="center">&#x003C;0.001</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<attrib><italic>&#x201C;Ratio&#x201D; is the mean weight of mutated CpG sites above the methylation threshold divided by the mean weight of mutated sites below the threshold (a schematic representation of this analysis is shown in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 8</xref>). NSE (no significant excess) indicates the absence of any significant differences between these sets suggesting there to be no association between mutagenesis and motifs in the CpG sites. The significance of any excess was measured using the Student <italic>t</italic> and Monte Carlo (MC) tests. The asterisk (&#x002A;) denotes that the corresponding <italic>P</italic> &#x003C; 0.01; this is a conservative estimate of the critical overall value of the <italic>t</italic>-test having allowed for multiple testing by means of the Bonferroni correction (3 comparisons).</italic></attrib>
</table-wrap-foot>
</table-wrap>
<p>The first result obtained is that the fraction of mutated CpG sites with methylation values below the threshold 75% is dramatically different for driver genes (52/(52+249) = 0.17, <xref ref-type="table" rid="T4">Table 4</xref>, the second column) and non-driver genes (0.40, <xref ref-type="table" rid="T4">Table 4</xref>, the second column). This difference is statistically significant (<italic>P</italic> &#x003C; 0.0001 according to the two-tailed Fisher&#x2019;s exact test). Thus, CpG sites with somatic mutations in driver genes tend to have higher methylation values compared to non-driver genes.</p>
<p>The second interesting result is the significant correlation of AID, pol &#x03B7; and pol &#x03B8; with mutation positions having a lower methylation level (below 75%) (<xref ref-type="table" rid="T4">Table 4</xref>). The correlation of the AID motif presence and mutation is more pronounced for driver genes, indicating that AID-induced mutagenesis is likely to be associated with heavily methylated CpG dinucleotides. Pol &#x03B7; has a role in CpG mutagenesis for both sets of genes whereas pol &#x03B8; is likely to be largely involved in the mutagenesis of non-driver genes (<xref ref-type="table" rid="T4">Table 4</xref>). Thus, it is likely that methylation levels influence mutagenesis pathways in CpG sites through the action of all the studied enzymes, although the individual impact of studied enzymes may be different for driver and non-driver genes (for example, AID, <xref ref-type="table" rid="T4">Table 4</xref>). It is likely to depend on various factors including gene expression. This will be discussed in the next section.</p>
</sec>
<sec id="S3.SS8">
<title>Analysis of Expression of Driver and Non-driver Genes</title>
<p>We analyzed the expression levels (FPKM values) for both sets of genes (<xref ref-type="supplementary-material" rid="SM1">Supplementary Tables 1</xref>, <xref ref-type="supplementary-material" rid="SM1">2</xref>). Analysis of mean and variance (<xref ref-type="fig" rid="F5">Figure 5</xref> and <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 8</xref>) suggested that mean values were not substantially different. However, the variance of expression values observed in the set of driver genes was larger as compared to the set of non-driver genes (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 8</xref>). The difference between mean values (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 8</xref>) was not statistically significant (<italic>t</italic>-test <italic>P</italic> value = 0.086), whereas the difference between variance values (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 8</xref>) was statistically significant (<italic>F</italic>-test <italic>P</italic> value = 0.007).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>Violin plot of mRNA expression (FPKM values) for sets of driver and non-driver genes. Log<sub>2</sub> transformation was used.</p></caption>
<graphic xlink:href="fgene-12-671866-g005.tif"/>
</fig>
</sec>
</sec>
<sec id="S4">
<title>Discussion</title>
<p>Some results of this study seem to be counterintuitive. For example, the AID mutable motif would appear to correlate with the context of somatic mutations in heavily methylated CpG&#x2019;s for driver genes only (<xref ref-type="table" rid="T4">Table 4</xref>). It is hard to determine the factors that are responsible for this difference. For example, variability of gene expression is significantly higher for driver genes (<xref ref-type="fig" rid="F5">Figure 5</xref>). This may be associated with the differential regulation of expression of driver genes in different patients or methylation levels. Copy number variation of driver genes (<xref ref-type="bibr" rid="B26">Loohuis et al., 2014</xref>; <xref ref-type="bibr" rid="B11">Cheng et al., 2016</xref>) may cause problems for precise estimates of CpG methylation levels.</p>
<p>AID and DNA polymerases &#x03B7;/&#x03B8; are known to participate in somatic hypermutation of immunoglobulin genes (<xref ref-type="bibr" rid="B29">Matsuda et al., 2001</xref>; <xref ref-type="bibr" rid="B10">Casali et al., 2006</xref>; <xref ref-type="bibr" rid="B33">Neuberger and Rada, 2007</xref>; <xref ref-type="bibr" rid="B6">Bhattacharya et al., 2008</xref>). In addition, it has been suggested that AID and pol &#x03B7; are likely to contribute to a lowering of methylation levels of CpG dinucleotides in cancer cells (<xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>). Thus, we focused this study on AID and pols &#x03B7;/&#x03B8; employing the weight matrix technique and mutation/methylation profiles. Our results suggest that AID and pols &#x03B7;/&#x03B8; combine to generate footprint mutations in B-cell derived lymphomas and other cancers. It was reported that methylation substantially reduces the rate of APOBEC-induced mutations in CpG dinucleotides (<xref ref-type="bibr" rid="B52">Seplyarskiy et al., 2016</xref>). For this reason, we did not include other members of the AID/APOBEC superfamily in the current study.</p>
<p>The advantage of the weight matrix approach is that it is a unified computational technique that allows an accurate and objective comparison of the mutational contribution of various mutator enzymes under the same experimental conditions and for the same datasets. We confirm that while the mutational footprints of DNA polymerases &#x03B7; and &#x03B8; are prominent in some cancers, mutable motifs characteristic of the humoral immune response somatic hypermutation machine, AID, are likely to be the most widespread feature of somatic mutational spectra attributed to any enzyme in cancer genomes (<xref ref-type="bibr" rid="B45">Rogozin et al., 2018b</xref>, <xref ref-type="bibr" rid="B49">2019</xref>). It is important to note that the suggested technique does not depend on expert opinion as to the exact consensus sequences, and therefore objectively represents mutable motifs.</p>
<p>We derived matrices for A:T and G:C residues. However, the ratio of A:T to G:C mutations is variable (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure 1</xref>). For example, it is known that Pol &#x03B7; mutates G residues at a lower frequency than A residues. However, two matrices (G:C and A:T residues, <xref ref-type="fig" rid="F1">Figure 1</xref>) for the two motifs were used independently (<xref ref-type="fig" rid="F3">Figure 3</xref>). We would like to develop a probabilistic model that integrates two matrices in one model. However, this approach has never been attempted before in this context and would require further investigation.</p>
<p>It is not possible to delineate the exact mechanism of the interplay between AID and DNA polymerases. It may be replication of the deaminated strand, separate pathways of U vs. T removal by glycosylases generating abasic sites followed by TLS by pol &#x03B7; or pol &#x03B8;, and/or specialized mismatch repair with gap filling by pol &#x03B7; or pol &#x03B8; (<xref ref-type="fig" rid="F4">Figure 4</xref>) (<xref ref-type="bibr" rid="B38">Pilzecker and Jacobs, 2019</xref>). Unfortunately, precise mechanisms have not been clearly defined even for mutagenesis of immunoglobulin genes, with attempts to define those mechanisms having been ongoing for over 20 years.</p>
<p>A high rate of prediction errors for many types of cancer (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 5</xref>) is likely to be due to the small mutational spectra available for DNA polymerase &#x03B7; and &#x03B8; (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figures 2</xref>, <xref ref-type="supplementary-material" rid="SM1">3</xref>). Larger sets of mutations are likely to improve the quality of prediction. We can nevertheless infer that some types of cancer, including GCB lymphomas, do not have a noticeable rate of false positives (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 5</xref>). We applied all weight matrices to study mutable motifs and methylation in the MALY-DE datasets and demonstrated that mutable motifs correlate with CpG dinucleotides and their methylation status. Another methodological problem is the small number of MALY-DE samples (26 samples) which may cause problems for the prediction of driver and passenger mutations. These problems are one of several possible explanations as to why differences between driver and non-driver genes are subtle (albeit significant) (<xref ref-type="table" rid="T2">Tables 2</xref>&#x2013;<xref ref-type="table" rid="T4">4</xref>). However, it is likely that these differences are responsible for the major difference observed between the expression of driver and non-driver genes (<xref ref-type="fig" rid="F5">Figure 5</xref>). The much larger variance observed for driver genes may be the result of greater (de)methylation of driver gene sequences causing substantial variability of mRNA expression across patients (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<p>Sophisticated classification approaches (prediction of mutational signatures) have been developed to extract the most prominent signatures from a complex mix of mutational spectra resulting from the action of a variety of mutagens, both exogenous and endogenous, operating during tumor evolution (<xref ref-type="bibr" rid="B36">Petljak and Alexandrov, 2016</xref>; <xref ref-type="bibr" rid="B39">Rahbari et al., 2016</xref>; <xref ref-type="bibr" rid="B20">Goncearenco et al., 2017</xref>; <xref ref-type="bibr" rid="B48">Rogozin et al., 2018c</xref>; <xref ref-type="bibr" rid="B1">Alexandrov et al., 2020</xref>). Both driver and passenger mutations have been used in the analysis without any attempt to analyze them separately. In this study, we analyzed driver and non-driver genes separately and detected significant differences in the relationship between mutable motifs and mutations with the methylation/demethylation status of driver and non-driver genes (<xref ref-type="table" rid="T3">Tables 3</xref> and <xref ref-type="table" rid="T4">4</xref>). It is not that easy to interpret these differences because the role of methylated CpG dinucleotides in exons is not yet fully understood (<xref ref-type="bibr" rid="B32">Neri et al., 2017</xref>). It has been suggested that changes in intragenic DNA methylation are important in several human diseases including syndromic and sporadic forms of various neurological disorders that involve methylation defects, including Rett syndrome, Prader&#x2013;Willi and Angelman syndromes, and others, suggesting that the differential (de)methylation of genes may underpin one aspect of various neurological disorders (<xref ref-type="bibr" rid="B16">Dunaway et al., 2016</xref>; <xref ref-type="bibr" rid="B44">Rogozin et al., 2018a</xref>; <xref ref-type="bibr" rid="B51">Scandaglia and Barco, 2019</xref>). 
Such differential methylation may be caused by differences in (de)methylation processes in somatic/germ cells (<xref ref-type="bibr" rid="B53">Shanak and Helms, 2020</xref>). Moreover, several studies of likely deleterious mutations have observed that genes controlling methylation status, chromatin accessibility or remodeling (and hence gene expression) are enriched for genes with recurrent mutations (<xref ref-type="bibr" rid="B19">Geschwind and State, 2015</xref>; <xref ref-type="bibr" rid="B50">Sanders et al., 2015</xref>; <xref ref-type="bibr" rid="B17">Geisheker et al., 2017</xref>).</p>
<p>The difference in AID and polymerase properties (<xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>) for driver and non-driver genes is consistent with the participation of different mechanisms of mutagenesis and (de)methylation processes (<xref ref-type="fig" rid="F4">Figure 4</xref>) on non-methylated and methylated DNA. The observed differences between driver and non-driver genes associated with somatic mutations in driver genes (<xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>) are likely to cause changes in gene expression (<xref ref-type="fig" rid="F5">Figure 5</xref>) that then trigger cancer initiation and/or progression. This is not surprising if we consider that chromatin modification pathways (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table 6</xref>) as well as the observed changes in CpG methylation levels (<xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>) are likely to cause changes in the expression levels of driver genes that could affect both cancer initiation and/or progression.</p>
</sec>
<sec id="S5">
<title>Data Availability Statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://dcc.icgc.org/projects/MALY-DE">https://dcc.icgc.org/projects/MALY-DE</ext-link>; <ext-link ext-link-type="uri" xlink:href="https://cancer.sanger.ac.uk">https://cancer.sanger.ac.uk</ext-link>.</p>
</sec>
<sec id="S6">
<title>Author Contributions</title>
<p>IBR, AR-L, KT, KC-C, AL, LP, and ES: formal analysis. All authors: investigation.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</body>
<back>
<fn-group>
<fn fn-type="financial-disclosure">
<p><bold>Funding.</bold> This work was supported by the Intramural Research Program of the National Library of Medicine at the National Institutes of Health (IBR), RCMI grant U54 MD007600 (National Institute on Minority Health and Health Disparities) from the National Institutes of Health (AR-L), NE DHHS LB506, grant 2017-48 (YIP) and Qiagen, Inc. through a License Agreement with Cardiff University (DNC). YIP was also partially supported by the Russian Science Foundation grant 20-15-00081, and the Fred &#x0026; Pamela Buffett Cancer Center Support Grant from the National Cancer Institute under award number P30 CA072720. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health. ARP and KT were supported by the Department of Pathology and Molecular Medicine, Queen&#x2019;s University, Canada. ARP is the recipient of a Senior Canada Research Chair in Computational Biology and Biophysics and a Senior Investigator Award from the Ontario Institute of Cancer Research, Canada.</p>
</fn>
</fn-group>
<ack>
<p>ARP and KT thank Alexander Goncearenco and Jiaying You for help with data acquisition.</p>
</ack>
<sec id="S9" sec-type="supplementary material">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2021.671866/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2021.671866/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.doc" id="SM1" mimetype="application/msword" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_2.xls" id="SM2" mimetype="application/vnd.ms-excel" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_3.xls" id="SM3" mimetype="application/vnd.ms-excel" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name> <name><surname>Kim</surname> <given-names>J.</given-names></name> <name><surname>Haradhvala</surname> <given-names>N. J.</given-names></name> <name><surname>Huang</surname> <given-names>M. N.</given-names></name> <name><surname>Tian Ng</surname> <given-names>A. W.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>The repertoire of mutational signatures in human cancer.</article-title> <source><italic>Nature</italic></source> <volume>578</volume> <fpage>94</fpage>&#x2013;<lpage>101</lpage>.</citation></ref>
<ref id="B2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name> <name><surname>Nik-Zainal</surname> <given-names>S.</given-names></name> <name><surname>Wedge</surname> <given-names>D. C.</given-names></name> <name><surname>Aparicio</surname> <given-names>S. A.</given-names></name> <name><surname>Behjati</surname> <given-names>S.</given-names></name> <name><surname>Biankin</surname> <given-names>A. V.</given-names></name><etal/></person-group> (<year>2013</year>). <article-title>Signatures of mutational processes in human cancer.</article-title> <source><italic>Nature</italic></source> <volume>500</volume> <fpage>415</fpage>&#x2013;<lpage>421</lpage>.</citation></ref>
<ref id="B3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name> <name><surname>Stratton</surname> <given-names>M. R.</given-names></name></person-group> (<year>2014</year>). <article-title>Mutational signatures: the patterns of somatic mutations hidden in cancer genomes.</article-title> <source><italic>Curr. Opin. Genet. Dev.</italic></source> <volume>24</volume> <fpage>52</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1016/j.gde.2013.11.014</pub-id> <pub-id pub-id-type="pmid">24657537</pub-id></citation></ref>
<ref id="B4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Als&#x00F8;e</surname> <given-names>L.</given-names></name> <name><surname>Sarno</surname> <given-names>A.</given-names></name> <name><surname>Carracedo</surname> <given-names>S.</given-names></name> <name><surname>Domanska</surname> <given-names>D.</given-names></name> <name><surname>Dingler</surname> <given-names>F.</given-names></name> <name><surname>Lirussi</surname> <given-names>L.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Uracil accumulation and mutagenesis dominated by cytosine deamination in CpG dinucleotides in mice lacking UNG and SMUG1.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>7</volume>:<issue>7199</issue>.</citation></ref>
<ref id="B5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Arana</surname> <given-names>M. E.</given-names></name> <name><surname>Seki</surname> <given-names>M.</given-names></name> <name><surname>Wood</surname> <given-names>R. D.</given-names></name> <name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Kunkel</surname> <given-names>T. A.</given-names></name></person-group> (<year>2008</year>). <article-title>Low-fidelity DNA synthesis by human DNA polymerase theta.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>36</volume> <fpage>3847</fpage>&#x2013;<lpage>3856</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkn310</pub-id> <pub-id pub-id-type="pmid">18503084</pub-id></citation></ref>
<ref id="B6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bhattacharya</surname> <given-names>P.</given-names></name> <name><surname>Grigera</surname> <given-names>F.</given-names></name> <name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>McCarty</surname> <given-names>T.</given-names></name> <name><surname>Morse</surname> <given-names>H. C.</given-names> <suffix>III</suffix></name> <name><surname>Kenter</surname> <given-names>A. L.</given-names></name></person-group> (<year>2008</year>). <article-title>Identification of murine B cell lines that undergo somatic hypermutation focused to A:T and G:C residues.</article-title> <source><italic>Eur. J. Immunol.</italic></source> <volume>38</volume> <fpage>227</fpage>&#x2013;<lpage>239</lpage>. <pub-id pub-id-type="doi">10.1002/eji.200737664</pub-id> <pub-id pub-id-type="pmid">18081040</pub-id></citation></ref>
<ref id="B7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brambati</surname> <given-names>A.</given-names></name> <name><surname>Barry</surname> <given-names>R. M.</given-names></name> <name><surname>Sfeir</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>DNA polymerase theta (Pol&#x03B8;) - an error-prone polymerase necessary for genome stability.</article-title> <source><italic>Curr. Opin. Genet. Dev.</italic></source> <volume>60</volume> <fpage>119</fpage>&#x2013;<lpage>126</lpage>. <pub-id pub-id-type="doi">10.1016/j.gde.2020.02.017</pub-id> <pub-id pub-id-type="pmid">32302896</pub-id></citation></ref>
<ref id="B8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brinkman</surname> <given-names>A. B.</given-names></name> <name><surname>Nik-Zainal</surname> <given-names>S.</given-names></name> <name><surname>Simmer</surname> <given-names>F.</given-names></name> <name><surname>Rodriguez-Gonzalez</surname> <given-names>F. G.</given-names></name> <name><surname>Smid</surname> <given-names>M.</given-names></name> <name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name><etal/></person-group> (<year>2019</year>). <article-title>Partially methylated domains are hypervariable in breast cancer and fuel widespread CpG island hypermethylation.</article-title> <source><italic>Nat. Commun.</italic></source> <volume>10</volume>:<issue>1749</issue>.</citation></ref>
<ref id="B9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brown</surname> <given-names>A. L.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Goncearenco</surname> <given-names>A.</given-names></name> <name><surname>Panchenko</surname> <given-names>A. R.</given-names></name></person-group> (<year>2019</year>). <article-title>Finding driver mutations in cancer: elucidating the role of background mutational processes.</article-title> <source><italic>PLoS Comput. Biol.</italic></source> <volume>15</volume>:<issue>e1006981</issue>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1006981</pub-id> <pub-id pub-id-type="pmid">31034466</pub-id></citation></ref>
<ref id="B10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Casali</surname> <given-names>P.</given-names></name> <name><surname>Pal</surname> <given-names>Z.</given-names></name> <name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Zan</surname> <given-names>H.</given-names></name></person-group> (<year>2006</year>). <article-title>DNA repair in antibody somatic hypermutation.</article-title> <source><italic>Trends Immunol.</italic></source> <volume>27</volume> <fpage>313</fpage>&#x2013;<lpage>321</lpage>. <pub-id pub-id-type="doi">10.1016/j.it.2006.05.001</pub-id> <pub-id pub-id-type="pmid">16737852</pub-id></citation></ref>
<ref id="B11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cheng</surname> <given-names>F.</given-names></name> <name><surname>Zhao</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name></person-group> (<year>2016</year>). <article-title>Advances in computational approaches for prioritizing driver mutations and significantly mutated genes in cancer genomes.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>17</volume> <fpage>642</fpage>&#x2013;<lpage>656</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbv068</pub-id> <pub-id pub-id-type="pmid">26307061</pub-id></citation></ref>
<ref id="B12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cooper</surname> <given-names>D. N.</given-names></name> <name><surname>Youssoufian</surname> <given-names>H.</given-names></name></person-group> (<year>1988</year>). <article-title>The CpG dinucleotide and human genetic disease.</article-title> <source><italic>Hum. Genet.</italic></source> <volume>78</volume> <fpage>151</fpage>&#x2013;<lpage>155</lpage>. <pub-id pub-id-type="doi">10.1007/bf00278187</pub-id> <pub-id pub-id-type="pmid">3338800</pub-id></citation></ref>
<ref id="B13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Coulondre</surname> <given-names>C.</given-names></name> <name><surname>Miller</surname> <given-names>J. H.</given-names></name> <name><surname>Farabaugh</surname> <given-names>P. J.</given-names></name> <name><surname>Gilbert</surname> <given-names>W.</given-names></name></person-group> (<year>1978</year>). <article-title>Molecular basis of base substitution hotspots in <italic>Escherichia coli</italic>.</article-title> <source><italic>Nature</italic></source> <volume>274</volume> <fpage>775</fpage>&#x2013;<lpage>780</lpage>. <pub-id pub-id-type="doi">10.1038/274775a0</pub-id> <pub-id pub-id-type="pmid">355893</pub-id></citation></ref>
<ref id="B14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dietlein</surname> <given-names>F.</given-names></name> <name><surname>Weghorn</surname> <given-names>D.</given-names></name> <name><surname>Taylor-Weiner</surname> <given-names>A.</given-names></name> <name><surname>Richters</surname> <given-names>A.</given-names></name> <name><surname>Reardon</surname> <given-names>B.</given-names></name> <name><surname>Liu</surname> <given-names>D.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>Identification of cancer driver genes based on nucleotide context.</article-title> <source><italic>Nat. Genet.</italic></source> <volume>52</volume> <fpage>208</fpage>&#x2013;<lpage>218</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-019-0572-y</pub-id> <pub-id pub-id-type="pmid">32015527</pub-id></citation></ref>
<ref id="B15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>D&#x00F6;rner</surname> <given-names>T.</given-names></name> <name><surname>Lipsky</surname> <given-names>P. E.</given-names></name></person-group> (<year>2001</year>). <article-title>Smaller role for pol &#x03B7;?</article-title> <source><italic>Nat. Immunol.</italic></source> <volume>2</volume> <fpage>982</fpage>&#x2013;<lpage>984</lpage>. <pub-id pub-id-type="doi">10.1038/ni1101-982</pub-id> <pub-id pub-id-type="pmid">11685213</pub-id></citation></ref>
<ref id="B16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dunaway</surname> <given-names>K. W.</given-names></name> <name><surname>Islam</surname> <given-names>M. S.</given-names></name> <name><surname>Coulson</surname> <given-names>R. L.</given-names></name> <name><surname>Lopez</surname> <given-names>S. J.</given-names></name> <name><surname>Vogel Ciernia</surname> <given-names>A.</given-names></name> <name><surname>Chu</surname> <given-names>R. G.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Cumulative impact of polychlorinated biphenyl and large chromosomal duplications on DNA methylation, chromatin, and expression of autism candidate genes.</article-title> <source><italic>Cell Rep.</italic></source> <volume>17</volume> <fpage>3035</fpage>&#x2013;<lpage>3048</lpage>. <pub-id pub-id-type="doi">10.1016/j.celrep.2016.11.058</pub-id> <pub-id pub-id-type="pmid">27974215</pub-id></citation></ref>
<ref id="B17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Geisheker</surname> <given-names>M. R.</given-names></name> <name><surname>Heymann</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <name><surname>Coe</surname> <given-names>B. P.</given-names></name> <name><surname>Turner</surname> <given-names>T. N.</given-names></name> <name><surname>Stessman</surname> <given-names>H. A. F.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Hotspots of missense mutation identify neurodevelopmental disorder genes and functional domains.</article-title> <source><italic>Nat. Neurosci.</italic></source> <volume>20</volume> <fpage>1043</fpage>&#x2013;<lpage>1051</lpage>. <pub-id pub-id-type="doi">10.1038/nn.4589</pub-id> <pub-id pub-id-type="pmid">28628100</pub-id></citation></ref>
<ref id="B18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gelfand</surname> <given-names>M. S.</given-names></name></person-group> (<year>1995</year>). <article-title>Prediction of function in DNA sequence analysis.</article-title> <source><italic>J. Comput. Biol.</italic></source> <volume>2</volume> <fpage>87</fpage>&#x2013;<lpage>115</lpage>. <pub-id pub-id-type="doi">10.1089/cmb.1995.2.87</pub-id> <pub-id pub-id-type="pmid">7497122</pub-id></citation></ref>
<ref id="B19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Geschwind</surname> <given-names>D. H.</given-names></name> <name><surname>State</surname> <given-names>M. W.</given-names></name></person-group> (<year>2015</year>). <article-title>Gene hunting in autism spectrum disorder: on the path to precision medicine.</article-title> <source><italic>Lancet Neurol.</italic></source> <volume>14</volume> <fpage>1109</fpage>&#x2013;<lpage>1120</lpage>. <pub-id pub-id-type="doi">10.1016/s1474-4422(15)00044-7</pub-id></citation></ref>
<ref id="B20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Goncearenco</surname> <given-names>A.</given-names></name> <name><surname>Rager</surname> <given-names>S. L.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Sang</surname> <given-names>Q. X.</given-names></name> <name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Panchenko</surname> <given-names>A. R.</given-names></name></person-group> (<year>2017</year>). <article-title>Exploring background mutational processes to decipher cancer genetic heterogeneity.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>45</volume> <fpage>W514</fpage>&#x2013;<lpage>W522</lpage>.</citation></ref>
<ref id="B21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Granadillo Rodriguez</surname> <given-names>M.</given-names></name> <name><surname>Flath</surname> <given-names>B.</given-names></name> <name><surname>Chelico</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>The interesting relationship between APOBEC3 deoxycytidine deaminases and cancer: a long road ahead.</article-title> <source><italic>Open Biol.</italic></source> <volume>10</volume>:<issue>200188</issue>. <pub-id pub-id-type="doi">10.1098/rsob.200188</pub-id> <pub-id pub-id-type="pmid">33292100</pub-id></citation></ref>
<ref id="B22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Green</surname> <given-names>M. R.</given-names></name> <name><surname>Kihira</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>C. L.</given-names></name> <name><surname>Nair</surname> <given-names>R. V.</given-names></name> <name><surname>Salari</surname> <given-names>R.</given-names></name> <name><surname>Gentles</surname> <given-names>A. J.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>Mutations in early follicular lymphoma progenitors are associated with suppressed antigen presentation.</article-title> <source><italic>Proc. Natl. Acad. Sci. U.S.A.</italic></source> <volume>112</volume> <fpage>E1116</fpage>&#x2013;<lpage>E1125</lpage>.</citation></ref>
<ref id="B23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Howe</surname> <given-names>E. A.</given-names></name> <name><surname>Sinha</surname> <given-names>R.</given-names></name> <name><surname>Schlauch</surname> <given-names>D.</given-names></name> <name><surname>Quackenbush</surname> <given-names>J.</given-names></name></person-group> (<year>2011</year>). <article-title>RNA-Seq analysis in MeV.</article-title> <source><italic>Bioinformatics</italic></source> <volume>27</volume> <fpage>3209</fpage>&#x2013;<lpage>3210</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr490</pub-id> <pub-id pub-id-type="pmid">21976420</pub-id></citation></ref>
<ref id="B24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Islam</surname> <given-names>S. M. A.</given-names></name> <name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name></person-group> (<year>2021</year>). <article-title>Bioinformatic methods to identify mutational signatures in cancer.</article-title> <source><italic>Methods Mol. Biol.</italic></source> <volume>2185</volume> <fpage>447</fpage>&#x2013;<lpage>473</lpage>. <pub-id pub-id-type="doi">10.1007/978-1-0716-0810-4_28</pub-id></citation></ref>
<ref id="B25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jiao</surname> <given-names>X.</given-names></name> <name><surname>Sherman</surname> <given-names>B. T.</given-names></name> <name><surname>Huang</surname> <given-names>D. W.</given-names></name> <name><surname>Stephens</surname> <given-names>R.</given-names></name> <name><surname>Baseler</surname> <given-names>M. W.</given-names></name> <name><surname>Lane</surname> <given-names>H. C.</given-names></name><etal/></person-group> (<year>2012</year>). <article-title>DAVID-WS: a stateful web service to facilitate gene/protein list analysis.</article-title> <source><italic>Bioinformatics</italic></source> <volume>28</volume> <fpage>1805</fpage>&#x2013;<lpage>1806</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts251</pub-id> <pub-id pub-id-type="pmid">22543366</pub-id></citation></ref>
<ref id="B26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Loohuis</surname> <given-names>L. O.</given-names></name> <name><surname>Witzel</surname> <given-names>A.</given-names></name> <name><surname>Mishra</surname> <given-names>B.</given-names></name></person-group> (<year>2014</year>). <article-title>Improving detection of driver genes: power-law null model of copy number variation in cancer.</article-title> <source><italic>IEEE/ACM Trans. Comput. Biol. Bioinform.</italic></source> <volume>11</volume> <fpage>1260</fpage>&#x2013;<lpage>1263</lpage>. <pub-id pub-id-type="doi">10.1109/tcbb.2014.2351805</pub-id> <pub-id pub-id-type="pmid">26357061</pub-id></citation></ref>
<ref id="B27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Luque-Baena</surname> <given-names>R. M.</given-names></name> <name><surname>Urda</surname> <given-names>D.</given-names></name> <name><surname>Gonzalo Claros</surname> <given-names>M.</given-names></name> <name><surname>Franco</surname> <given-names>L.</given-names></name> <name><surname>Jerez</surname> <given-names>J. M.</given-names></name></person-group> (<year>2014</year>). <article-title>Robust gene signatures from microarray data using genetic algorithms enriched with biological pathway keywords.</article-title> <source><italic>J. Biomed. Inform.</italic></source> <volume>49</volume> <fpage>32</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbi.2014.01.006</pub-id> <pub-id pub-id-type="pmid">24480647</pub-id></citation></ref>
<ref id="B28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Martomo</surname> <given-names>S. A.</given-names></name> <name><surname>Saribasak</surname> <given-names>H.</given-names></name> <name><surname>Yokoi</surname> <given-names>M.</given-names></name> <name><surname>Hanaoka</surname> <given-names>F.</given-names></name> <name><surname>Gearhart</surname> <given-names>P. J.</given-names></name></person-group> (<year>2008</year>). <article-title>Reevaluation of the role of DNA polymerase &#x03B8; in somatic hypermutation of immunoglobulin genes.</article-title> <source><italic>DNA Repair</italic></source> <volume>7</volume> <fpage>1603</fpage>&#x2013;<lpage>1608</lpage>. <pub-id pub-id-type="doi">10.1016/j.dnarep.2008.04.002</pub-id> <pub-id pub-id-type="pmid">18485835</pub-id></citation></ref>
<ref id="B29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Matsuda</surname> <given-names>T.</given-names></name> <name><surname>Bebenek</surname> <given-names>K.</given-names></name> <name><surname>Masutani</surname> <given-names>C.</given-names></name> <name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Hanaoka</surname> <given-names>F.</given-names></name> <name><surname>Kunkel</surname> <given-names>T. A.</given-names></name></person-group> (<year>2001</year>). <article-title>Error rate and specificity of human and murine DNA polymerase eta.</article-title> <source><italic>J. Mol. Biol.</italic></source> <volume>312</volume> <fpage>335</fpage>&#x2013;<lpage>346</lpage>.</citation></ref>
<ref id="B30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mayorov</surname> <given-names>V. I.</given-names></name> <name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Adkison</surname> <given-names>L. R.</given-names></name> <name><surname>Gearhart</surname> <given-names>P. J.</given-names></name></person-group> (<year>2005</year>). <article-title>DNA polymerase eta contributes to strand bias of mutations of A versus T in immunoglobulin genes.</article-title> <source><italic>J. Immunol.</italic></source> <volume>174</volume> <fpage>7781</fpage>&#x2013;<lpage>7786</lpage>. <pub-id pub-id-type="doi">10.4049/jimmunol.174.12.7781</pub-id> <pub-id pub-id-type="pmid">15944281</pub-id></citation></ref>
<ref id="B31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Milstein</surname> <given-names>C.</given-names></name> <name><surname>Neuberger</surname> <given-names>M. S.</given-names></name> <name><surname>Staden</surname> <given-names>R.</given-names></name></person-group> (<year>1998</year>). <article-title>Both DNA strands of antibody genes are hypermutation targets.</article-title> <source><italic>Proc. Natl. Acad. Sci. U.S.A.</italic></source> <volume>95</volume> <fpage>8791</fpage>&#x2013;<lpage>8794</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.95.15.8791</pub-id> <pub-id pub-id-type="pmid">9671757</pub-id></citation></ref>
<ref id="B32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Neri</surname> <given-names>F.</given-names></name> <name><surname>Rapelli</surname> <given-names>S.</given-names></name> <name><surname>Krepelova</surname> <given-names>A.</given-names></name> <name><surname>Incarnato</surname> <given-names>D.</given-names></name> <name><surname>Parlato</surname> <given-names>C.</given-names></name> <name><surname>Basile</surname> <given-names>G.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Intragenic DNA methylation prevents spurious transcription initiation.</article-title> <source><italic>Nature</italic></source> <volume>543</volume> <fpage>72</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1038/nature21373</pub-id> <pub-id pub-id-type="pmid">28225755</pub-id></citation></ref>
<ref id="B33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Neuberger</surname> <given-names>M. S.</given-names></name> <name><surname>Rada</surname> <given-names>C.</given-names></name></person-group> (<year>2007</year>). <article-title>Somatic hypermutation: activation-induced deaminase for C/G followed by polymerase &#x03B7; for A/T.</article-title> <source><italic>J. Exp. Med.</italic></source> <volume>204</volume> <fpage>7</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1084/jem.20062409</pub-id> <pub-id pub-id-type="pmid">17190841</pub-id></citation></ref>
<ref id="B34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oliver</surname> <given-names>J.</given-names></name> <name><surname>Garcia-Aranda</surname> <given-names>M.</given-names></name> <name><surname>Chaves</surname> <given-names>P.</given-names></name> <name><surname>Alba</surname> <given-names>E.</given-names></name> <name><surname>Cobo-Dols</surname> <given-names>M.</given-names></name> <name><surname>Onieva</surname> <given-names>J. L.</given-names></name><etal/></person-group> (<year>2021</year>). <article-title>Emerging noninvasive methylation biomarkers of cancer prognosis and drug response prediction.</article-title> <source><italic>Semin. Cancer Biol.</italic></source> <pub-id pub-id-type="doi">10.1016/j.semcancer.2021.03.012</pub-id> <pub-id pub-id-type="pmid">33757849</pub-id></citation></ref>
<ref id="B35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pavlov</surname> <given-names>Y. I.</given-names></name> <name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Galkin</surname> <given-names>A. P.</given-names></name> <name><surname>Aksenova</surname> <given-names>A. Y.</given-names></name> <name><surname>Hanaoka</surname> <given-names>F.</given-names></name> <name><surname>Rada</surname> <given-names>C.</given-names></name><etal/></person-group> (<year>2002</year>). <article-title>Correlation of somatic hypermutation specificity and A-T base pair substitution errors by DNA polymerase &#x03B7; during copying of a mouse immunoglobulin &#x03BA; light chain transgene.</article-title> <source><italic>Proc. Natl. Acad. Sci. U.S.A.</italic></source> <volume>99</volume> <fpage>9954</fpage>&#x2013;<lpage>9959</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.152126799</pub-id> <pub-id pub-id-type="pmid">12119399</pub-id></citation></ref>
<ref id="B36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Petljak</surname> <given-names>M.</given-names></name> <name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name></person-group> (<year>2016</year>). <article-title>Understanding mutagenesis through delineation of mutational signatures in human cancer.</article-title> <source><italic>Carcinogenesis</italic></source> <volume>37</volume> <fpage>531</fpage>&#x2013;<lpage>540</lpage>. <pub-id pub-id-type="doi">10.1093/carcin/bgw055</pub-id> <pub-id pub-id-type="pmid">27207657</pub-id></citation></ref>
<ref id="B37"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pham</surname> <given-names>P.</given-names></name> <name><surname>Calabrese</surname> <given-names>P.</given-names></name> <name><surname>Park</surname> <given-names>S. J.</given-names></name> <name><surname>Goodman</surname> <given-names>M. F.</given-names></name></person-group> (<year>2011</year>). <article-title>Analysis of a single-stranded DNA-scanning process in which activation-induced deoxycytidine deaminase (AID) deaminates C to U haphazardly and inefficiently to ensure mutational diversity.</article-title> <source><italic>J. Biol. Chem.</italic></source> <volume>286</volume> <fpage>24931</fpage>&#x2013;<lpage>24942</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.m111.241208</pub-id> <pub-id pub-id-type="pmid">21572036</pub-id></citation></ref>
<ref id="B38"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pilzecker</surname> <given-names>B.</given-names></name> <name><surname>Jacobs</surname> <given-names>H.</given-names></name></person-group> (<year>2019</year>). <article-title>Mutating for good: DNA damage responses during somatic hypermutation.</article-title> <source><italic>Front. Immunol.</italic></source> <volume>10</volume>:<issue>438</issue>. <pub-id pub-id-type="doi">10.3389/fimmu.2019.00438</pub-id> <pub-id pub-id-type="pmid">30915081</pub-id></citation></ref>
<ref id="B39"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rahbari</surname> <given-names>R.</given-names></name> <name><surname>Wuster</surname> <given-names>A.</given-names></name> <name><surname>Lindsay</surname> <given-names>S. J.</given-names></name> <name><surname>Hardwick</surname> <given-names>R. J.</given-names></name> <name><surname>Alexandrov</surname> <given-names>L. B.</given-names></name> <name><surname>Turki</surname> <given-names>S. A.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Timing, rates and spectra of human germline mutation.</article-title> <source><italic>Nat. Genet.</italic></source> <volume>48</volume> <fpage>126</fpage>&#x2013;<lpage>133</lpage>. <pub-id pub-id-type="doi">10.1038/ng.3469</pub-id> <pub-id pub-id-type="pmid">26656846</pub-id></citation></ref>
<ref id="B40"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Revy</surname> <given-names>P.</given-names></name> <name><surname>Muto</surname> <given-names>T.</given-names></name> <name><surname>Levy</surname> <given-names>Y.</given-names></name> <name><surname>Geissmann</surname> <given-names>F.</given-names></name> <name><surname>Plebani</surname> <given-names>A.</given-names></name> <name><surname>Sanal</surname> <given-names>O.</given-names></name><etal/></person-group> (<year>2000</year>). <article-title>Activation-induced cytidine deaminase (AID) deficiency causes the autosomal recessive form of the hyper-IgM syndrome (HIGM2).</article-title> <source><italic>Cell</italic></source> <volume>102</volume> <fpage>565</fpage>&#x2013;<lpage>575</lpage>. <pub-id pub-id-type="doi">10.1016/s0092-8674(00)00079-9</pub-id></citation></ref>
<ref id="B41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roberts</surname> <given-names>S. A.</given-names></name> <name><surname>Gordenin</surname> <given-names>D. A.</given-names></name></person-group> (<year>2014</year>). <article-title>Hypermutation in human cancer genomes: footprints and mechanisms.</article-title> <source><italic>Nat. Rev. Cancer</italic></source> <volume>14</volume> <fpage>786</fpage>&#x2013;<lpage>800</lpage>. <pub-id pub-id-type="doi">10.1038/nrc3816</pub-id> <pub-id pub-id-type="pmid">25568919</pub-id></citation></ref>
<ref id="B42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roberts</surname> <given-names>S. A.</given-names></name> <name><surname>Lawrence</surname> <given-names>M. S.</given-names></name> <name><surname>Klimczak</surname> <given-names>L. J.</given-names></name> <name><surname>Grimm</surname> <given-names>S. A.</given-names></name> <name><surname>Fargo</surname> <given-names>D.</given-names></name> <name><surname>Stojanov</surname> <given-names>P.</given-names></name><etal/></person-group> (<year>2013</year>). <article-title>An APOBEC cytidine deaminase mutagenesis pattern is widespread in human cancers.</article-title> <source><italic>Nat. Genet.</italic></source> <volume>45</volume> <fpage>970</fpage>&#x2013;<lpage>976</lpage>. <pub-id pub-id-type="doi">10.1038/ng.2702</pub-id> <pub-id pub-id-type="pmid">23852170</pub-id></citation></ref>
<ref id="B43"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Diaz</surname> <given-names>M.</given-names></name></person-group> (<year>2004</year>). <article-title>Cutting edge: DGYW/WRCH is a better predictor of mutability at G:C bases in Ig hypermutation than the widely accepted RGYW/WRCY motif and probably reflects a two-step activation-induced cytidine deaminase-triggered process.</article-title> <source><italic>J. Immunol.</italic></source> <volume>172</volume> <fpage>3382</fpage>&#x2013;<lpage>3384</lpage>. <pub-id pub-id-type="doi">10.4049/jimmunol.172.6.3382</pub-id> <pub-id pub-id-type="pmid">15004135</pub-id></citation></ref>
<ref id="B44"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Gertz</surname> <given-names>E. M.</given-names></name> <name><surname>Baranov</surname> <given-names>P. V.</given-names></name> <name><surname>Poliakov</surname> <given-names>E.</given-names></name> <name><surname>Schaffer</surname> <given-names>A. A.</given-names></name></person-group> (<year>2018a</year>). <article-title>Genome-wide changes in protein translation efficiency are associated with autism.</article-title> <source><italic>Genome Biol. Evol.</italic></source> <volume>10</volume> <fpage>1902</fpage>&#x2013;<lpage>1919</lpage>. <pub-id pub-id-type="doi">10.1093/gbe/evy146</pub-id> <pub-id pub-id-type="pmid">29986017</pub-id></citation></ref>
<ref id="B45"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Goncearenco</surname> <given-names>A.</given-names></name> <name><surname>Lada</surname> <given-names>A. G.</given-names></name> <name><surname>De</surname> <given-names>S.</given-names></name> <name><surname>Yurchenko</surname> <given-names>V.</given-names></name> <name><surname>Nudelman</surname> <given-names>G.</given-names></name><etal/></person-group> (<year>2018b</year>). <article-title>DNA polymerase &#x03B7; mutational signatures are found in a variety of different types of cancer.</article-title> <source><italic>Cell Cycle</italic></source> <volume>17</volume> <fpage>348</fpage>&#x2013;<lpage>355</lpage>. <pub-id pub-id-type="doi">10.1080/15384101.2017.1404208</pub-id> <pub-id pub-id-type="pmid">29139326</pub-id></citation></ref>
<ref id="B46"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Lada</surname> <given-names>A. G.</given-names></name> <name><surname>Goncearenco</surname> <given-names>A.</given-names></name> <name><surname>Green</surname> <given-names>M. R.</given-names></name> <name><surname>De</surname> <given-names>S.</given-names></name> <name><surname>Nudelman</surname> <given-names>G.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Activation induced deaminase mutational signature overlaps with CpG methylation sites in follicular lymphoma and other cancers.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>6</volume>:<issue>38133</issue>.</citation></ref>
<ref id="B47"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Pavlov</surname> <given-names>Y. I.</given-names></name> <name><surname>Bebenek</surname> <given-names>K.</given-names></name> <name><surname>Matsuda</surname> <given-names>T.</given-names></name> <name><surname>Kunkel</surname> <given-names>T. A.</given-names></name></person-group> (<year>2001</year>). <article-title>Somatic mutation hotspots correlate with DNA polymerase eta error spectrum.</article-title> <source><italic>Nat. Immunol.</italic></source> <volume>2</volume> <fpage>530</fpage>&#x2013;<lpage>536</lpage>. <pub-id pub-id-type="doi">10.1038/88732</pub-id> <pub-id pub-id-type="pmid">11376340</pub-id></citation></ref>
<ref id="B48"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Pavlov</surname> <given-names>Y. I.</given-names></name> <name><surname>Goncearenco</surname> <given-names>A.</given-names></name> <name><surname>De</surname> <given-names>S.</given-names></name> <name><surname>Lada</surname> <given-names>A. G.</given-names></name> <name><surname>Poliakov</surname> <given-names>E.</given-names></name><etal/></person-group> (<year>2018c</year>). <article-title>Mutational signatures and mutable motifs in cancer genomes.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>19</volume> <fpage>1085</fpage>&#x2013;<lpage>1101</lpage>.</citation></ref>
<ref id="B49"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rogozin</surname> <given-names>I. B.</given-names></name> <name><surname>Roche-Lima</surname> <given-names>A.</given-names></name> <name><surname>Lada</surname> <given-names>A. G.</given-names></name> <name><surname>Belinky</surname> <given-names>F.</given-names></name> <name><surname>Sidorenko</surname> <given-names>I. A.</given-names></name> <name><surname>Glazko</surname> <given-names>G. V.</given-names></name><etal/></person-group> (<year>2019</year>). <article-title>Nucleotide weight matrices reveal ubiquitous mutational footprints of AID/APOBEC deaminases in human cancer genomes.</article-title> <source><italic>Cancers</italic></source> <volume>11</volume>:<issue>211</issue>. <pub-id pub-id-type="doi">10.3390/cancers11020211</pub-id> <pub-id pub-id-type="pmid">30759888</pub-id></citation></ref>
<ref id="B50"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sanders</surname> <given-names>S. J.</given-names></name> <name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Willsey</surname> <given-names>A. J.</given-names></name> <name><surname>Ercan-Sencicek</surname> <given-names>A. G.</given-names></name> <name><surname>Samocha</surname> <given-names>K. E.</given-names></name> <name><surname>Cicek</surname> <given-names>A. E.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>Insights into autism spectrum disorder genomic architecture and biology from 71 risk loci.</article-title> <source><italic>Neuron</italic></source> <volume>87</volume> <fpage>1215</fpage>&#x2013;<lpage>1233</lpage>.</citation></ref>
<ref id="B51"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Scandaglia</surname> <given-names>M.</given-names></name> <name><surname>Barco</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>Contribution of spurious transcription to intellectual disability disorders.</article-title> <source><italic>J. Med. Genet.</italic></source> <volume>56</volume> <fpage>491</fpage>&#x2013;<lpage>498</lpage>. <pub-id pub-id-type="doi">10.1136/jmedgenet-2018-105668</pub-id> <pub-id pub-id-type="pmid">30745423</pub-id></citation></ref>
<ref id="B52"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Seplyarskiy</surname> <given-names>V. B.</given-names></name> <name><surname>Soldatov</surname> <given-names>R. A.</given-names></name> <name><surname>Popadin</surname> <given-names>K. Y.</given-names></name> <name><surname>Antonarakis</surname> <given-names>S. E.</given-names></name> <name><surname>Bazykin</surname> <given-names>G. A.</given-names></name> <name><surname>Nikolaev</surname> <given-names>S. I.</given-names></name></person-group> (<year>2016</year>). <article-title>APOBEC-induced mutations in human cancers are strongly enriched on the lagging DNA strand during replication.</article-title> <source><italic>Genome Res.</italic></source> <volume>26</volume> <fpage>174</fpage>&#x2013;<lpage>182</lpage>. <pub-id pub-id-type="doi">10.1101/gr.197046.115</pub-id> <pub-id pub-id-type="pmid">26755635</pub-id></citation></ref>
<ref id="B53"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shanak</surname> <given-names>S.</given-names></name> <name><surname>Helms</surname> <given-names>V.</given-names></name></person-group> (<year>2020</year>). <article-title>DNA methylation and the core pluripotency network.</article-title> <source><italic>Dev. Biol.</italic></source> <volume>464</volume> <fpage>145</fpage>&#x2013;<lpage>160</lpage>. <pub-id pub-id-type="doi">10.1016/j.ydbio.2020.06.001</pub-id> <pub-id pub-id-type="pmid">32562758</pub-id></citation></ref>
<ref id="B54"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sina</surname> <given-names>A. A.</given-names></name> <name><surname>Carrascosa</surname> <given-names>L. G.</given-names></name> <name><surname>Liang</surname> <given-names>Z.</given-names></name> <name><surname>Grewal</surname> <given-names>Y. S.</given-names></name> <name><surname>Wardiana</surname> <given-names>A.</given-names></name> <name><surname>Shiddiky</surname> <given-names>M. J. A.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>Epigenetically reprogrammed methylation landscape drives the DNA self-assembly and serves as a universal cancer biomarker.</article-title> <source><italic>Nat. Commun.</italic></source> <volume>9</volume>:<issue>4915</issue>.</citation></ref>
<ref id="B55"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Soldatos</surname> <given-names>T. G.</given-names></name> <name><surname>Perdigao</surname> <given-names>N.</given-names></name> <name><surname>Brown</surname> <given-names>N. P.</given-names></name> <name><surname>Sabir</surname> <given-names>K. S.</given-names></name> <name><surname>O&#x2019;Donoghue</surname> <given-names>S. I.</given-names></name></person-group> (<year>2015</year>). <article-title>How to learn about gene function: text-mining or ontologies?</article-title> <source><italic>Methods</italic></source> <volume>74</volume> <fpage>3</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2014.07.004</pub-id> <pub-id pub-id-type="pmid">25088781</pub-id></citation></ref>
<ref id="B56"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Staden</surname> <given-names>R.</given-names></name></person-group> (<year>1984</year>). <article-title>Computer methods to locate signals in nucleic acid sequences.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>12</volume> <fpage>505</fpage>&#x2013;<lpage>519</lpage>. <pub-id pub-id-type="doi">10.1093/nar/12.1part2.505</pub-id> <pub-id pub-id-type="pmid">6364039</pub-id></citation></ref>
<ref id="B57"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stratton</surname> <given-names>M. R.</given-names></name> <name><surname>Campbell</surname> <given-names>P. J.</given-names></name> <name><surname>Futreal</surname> <given-names>P. A.</given-names></name></person-group> (<year>2009</year>). <article-title>The cancer genome.</article-title> <source><italic>Nature</italic></source> <volume>458</volume> <fpage>719</fpage>&#x2013;<lpage>724</lpage>.</citation></ref>
<ref id="B58"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Swanton</surname> <given-names>C.</given-names></name> <name><surname>McGranahan</surname> <given-names>N.</given-names></name> <name><surname>Starrett</surname> <given-names>G. J.</given-names></name> <name><surname>Harris</surname> <given-names>R. S.</given-names></name></person-group> (<year>2015</year>). <article-title>APOBEC enzymes: mutagenic fuel for cancer evolution and heterogeneity.</article-title> <source><italic>Cancer Discov.</italic></source> <volume>5</volume> <fpage>704</fpage>&#x2013;<lpage>712</lpage>. <pub-id pub-id-type="doi">10.1158/2159-8290.cd-15-0344</pub-id> <pub-id pub-id-type="pmid">26091828</pub-id></citation></ref>
<ref id="B59"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tokheim</surname> <given-names>C.</given-names></name> <name><surname>Karchin</surname> <given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>CHASMplus reveals the scope of somatic missense mutations driving human cancers.</article-title> <source><italic>Cell Syst.</italic></source> <volume>9</volume> <fpage>9</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1016/j.cels.2019.05.005</pub-id> <pub-id pub-id-type="pmid">31202631</pub-id></citation></ref>
<ref id="B60"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>J. H.</given-names></name> <name><surname>Zhao</surname> <given-names>L. F.</given-names></name> <name><surname>Lin</surname> <given-names>P.</given-names></name> <name><surname>Su</surname> <given-names>X. R.</given-names></name> <name><surname>Chen</surname> <given-names>S. J.</given-names></name> <name><surname>Huang</surname> <given-names>L. Q.</given-names></name><etal/></person-group> (<year>2014</year>). <article-title>GenCLiP 2.0: a web server for functional clustering of genes and construction of molecular networks based on free terms.</article-title> <source><italic>Bioinformatics</italic></source> <volume>30</volume> <fpage>2534</fpage>&#x2013;<lpage>2536</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btu241</pub-id> <pub-id pub-id-type="pmid">24764463</pub-id></citation></ref>
<ref id="B61"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wood</surname> <given-names>R. D.</given-names></name> <name><surname>Doubli&#x00E9;</surname> <given-names>S.</given-names></name></person-group> (<year>2016</year>). <article-title>DNA polymerase &#x03B8; (POLQ), double-strand break repair, and cancer.</article-title> <source><italic>DNA Repair</italic></source> <volume>44</volume> <fpage>22</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1016/j.dnarep.2016.05.003</pub-id> <pub-id pub-id-type="pmid">27264557</pub-id></citation></ref>
<ref id="B62"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zan</surname> <given-names>H.</given-names></name> <name><surname>Shima</surname> <given-names>N.</given-names></name> <name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Al-Qahtani</surname> <given-names>A.</given-names></name> <name><surname>Evinger</surname> <given-names>A. J.</given-names> <suffix>III</suffix></name> <name><surname>Zhong</surname> <given-names>Y.</given-names></name><etal/></person-group> (<year>2005</year>). <article-title>The translesion DNA polymerase &#x03B8; plays a dominant role in immunoglobulin gene somatic hypermutation.</article-title> <source><italic>EMBO J.</italic></source> <volume>24</volume> <fpage>3757</fpage>&#x2013;<lpage>3769</lpage>. <pub-id pub-id-type="doi">10.1038/sj.emboj.7600833</pub-id> <pub-id pub-id-type="pmid">16222339</pub-id></citation></ref>
</ref-list><fn-group>
<fn id="footnote1">
<label>1</label>
<p><ext-link ext-link-type="uri" xlink:href="https://cancer.sanger.ac.uk">https://cancer.sanger.ac.uk</ext-link></p></fn>
<fn id="footnote2">
<label>2</label>
<p><ext-link ext-link-type="uri" xlink:href="https://dcc.icgc.org/projects/MALY-DE">https://dcc.icgc.org/projects/MALY-DE</ext-link></p></fn>
<fn id="footnote3">
<label>3</label>
<p><ext-link ext-link-type="uri" xlink:href="http://www.graphpad.com/quickcalcs/contingency1.cfm">http://www.graphpad.com/quickcalcs/contingency1.cfm</ext-link></p></fn>
</fn-group>
</back>
</article>
