<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1222112</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2023.1222112</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Chromatin structure and context-dependent sequence features control prime editing efficiency</article-title>
<alt-title alt-title-type="left-running-head">Kim et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2023.1222112">10.3389/fgene.2023.1222112</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Kim</surname>
<given-names>Somang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2348803/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yuan</surname>
<given-names>Jimmy B.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2348418/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Woods</surname>
<given-names>Wendy S.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Newton</surname>
<given-names>Destry A.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Perez-Pinera</surname>
<given-names>Pablo</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Song</surname>
<given-names>Jun S.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<xref ref-type="aff" rid="aff8">
<sup>8</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/25538/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Physics, University of Illinois at Urbana-Champaign</institution>, <addr-line>Urbana</addr-line>, <addr-line>IL</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Carl R. Woese Institute for Genomic Biology</institution>, University of Illinois at Urbana-Champaign, <addr-line>Urbana</addr-line>, <addr-line>IL</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Bioengineering, University of Illinois at Urbana-Champaign</institution>, <addr-line>Urbana</addr-line>, <addr-line>IL</addr-line>, <country>United States</country>
</aff>
<aff id="aff4">
<sup>4</sup>Department of Biomedical and Translational Sciences, <institution>Carle-Illinois College of Medicine</institution>, University of Illinois at Urbana-Champaign, <addr-line>Urbana</addr-line>, <addr-line>IL</addr-line>, <country>United States</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Cancer Center at Illinois</institution>, University of Illinois at Urbana-Champaign, <addr-line>Urbana</addr-line>, <addr-line>IL</addr-line>, <country>United States</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Department of Molecular and Integrative Physiology, University of Illinois at Urbana-Champaign</institution>, <addr-line>Urbana</addr-line>, <addr-line>IL</addr-line>, <country>United States</country>
</aff>
<aff id="aff7">
<sup>7</sup>
<institution>Center for Theoretical Physics</institution>, <institution>Department of Physics</institution>, <institution>Massachusetts Institute of Technology</institution>, <addr-line>Cambridge</addr-line>, <addr-line>MA</addr-line>, <country>United States</country>
</aff>
<aff id="aff8">
<sup>8</sup>
<institution>Department of Statistics</institution>, <institution>Harvard University</institution>, <addr-line>Cambridge</addr-line>, <addr-line>MA</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1962710/overview">Meizhu Bai</ext-link>, Yale University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1165673/overview">Simon Sretenovic</ext-link>, University of Maryland, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/353082/overview">Zahir Ali</ext-link>, King Abdullah University of Science and Technology, Saudi Arabia</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jun S. Song, <email>songj@illinois.edu</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>06</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1222112</elocation-id>
<history>
<date date-type="received">
<day>13</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>16</day>
<month>06</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Kim, Yuan, Woods, Newton, Perez-Pinera and Song.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Kim, Yuan, Woods, Newton, Perez-Pinera and Song</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Prime editing (PE) is a highly versatile CRISPR&#x2013;Cas9 genome editing technique. The current constructs, however, have variable efficiency and may require laborious experimental optimization. This study presents statistical models for learning the salient epigenomic and sequence features of target sites modulating the editing efficiency and provides guidelines for designing optimal PEs. We found that both regional constitutive heterochromatin and local nucleosome occlusion of target sites impede editing, while position-specific G/C nucleotides in the primer-binding site (PBS) and reverse transcription (RT) template regions of PE guide RNA (pegRNA) yield high editing efficiency, especially for short PBS designs. The presence of G/C nucleotides was most critical immediately 5&#x2032; to the protospacer adjacent motif (PAM) site for all designs. The effects of different last templated nucleotides were quantified and observed to depend on the length of both PBS and RT templates. Our models found AGG to be the preferred PAM and detected a guanine nucleotide four bases downstream of the PAM to facilitate editing, suggesting a hitherto-unrecognized interaction with Cas9. A neural network interpretation method based on nonextensive statistical mechanics further revealed multi-nucleotide preferences, indicating dependency among several bases across pegRNA. Our work clarifies previous conflicting observations and uncovers context-dependent features important for optimizing PE designs.</p>
</abstract>
<kwd-group>
<kwd>prime editing</kwd>
<kwd>CRISPR&#x2013;Cas9</kwd>
<kwd>heterochromatin</kwd>
<kwd>nucleosome positioning</kwd>
<kwd>DNA-RNA hybridization</kwd>
<kwd>nucleotide preference</kwd>
<kwd>machine learning</kwd>
<kwd>neural network interpretation</kwd>
</kwd-group>
<contract-num rid="cn001">R01CA163336 R01GM141296</contract-num>
<contract-sponsor id="cn001">National Institutes of Health<named-content content-type="fundref-id">10.13039/100000002</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Human and Medical Genomics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>One of the most powerful gene editing tools available today is the complex of clustered regularly interspaced short palindromic repeats (CRISPRs) with CRISPR-associated protein 9 (Cas9) (<xref ref-type="bibr" rid="B26">Jinek et al., 2012</xref>; <xref ref-type="bibr" rid="B10">Cong et al., 2013</xref>; <xref ref-type="bibr" rid="B27">Jinek et al., 2013</xref>; <xref ref-type="bibr" rid="B44">Mali et al., 2013</xref>). In nature, CRISPR&#x2013;Cas9 is found in bacteria as a natural defense mechanism of excising foreign DNA in the CRISPR DNA regions. In a rapid succession of development, researchers have repurposed and further engineered this process as a laboratory tool for editing the genome of a variety of cell types across species, including diseased cells in humans (<xref ref-type="bibr" rid="B12">Doudna and Charpentier, 2014</xref>; <xref ref-type="bibr" rid="B23">Hsu et al., 2014</xref>; <xref ref-type="bibr" rid="B57">Wright et al., 2016</xref>). Several different techniques have been developed to date to improve the CRISPR&#x2013;Cas9 system to be more programmable and suitable for <italic>in vivo</italic> editing, while reducing off-targets and unintended mutations (<xref ref-type="bibr" rid="B31">Komor et al., 2016</xref>; <xref ref-type="bibr" rid="B20">Gaudelli et al., 2017</xref>; <xref ref-type="bibr" rid="B30">Kim et al., 2017</xref>; <xref ref-type="bibr" rid="B32">Komor et al., 2017</xref>; <xref ref-type="bibr" rid="B19">Gapinske et al., 2018</xref>; <xref ref-type="bibr" rid="B56">Winter et al., 2019</xref>). In particular, prime editors (PEs) are the latest state-of-the-art tool, which are highly versatile (<xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). The biomolecular architecture of PEs consists of a partially inactivated Cas9 fused to a reverse transcriptase and a customizable prime editing guide RNA (pegRNA), which contains a scaffolding sequence bound by Cas9. 
A PE targets the desired edit locus via the combination of two main processes: complementary pairing of a &#x223c;20 nucleotide (nt) guide sequence at the 5&#x2032; end of pegRNA to the non-edited DNA strand and recognition of a short protospacer adjacent motif (PAM) on the edited strand by Cas9. The modified Cas9 includes a domain fused to a reverse transcriptase (RT) and a nickase domain that nicks only the edited DNA strand, three bases upstream of the PAM sequence. The 3&#x2032; end of pegRNA hybridizes to a region in the edited strand and acts as a primer for the ensuing reverse transcription. Starting at the nick site, the RT reverse-transcribes part of the pegRNA 3&#x2032; extension that is immediately upstream of the primer-binding site (PBS) region. This region is denoted as the RT template and contains the complementary RNA template for the desired DNA edit sequence. After nicking the edited strand and reverse-transcribing the templated DNA, two single-stranded DNA flaps are formed: the 3&#x2032; flap, created via reverse transcription containing the edit of interest, and the 5&#x2032; flap, created via the Cas9 nick, which does not contain the edit. Successful edits are completed upon the action of 5&#x2032;-flap-specific endonucleases, such as FEN1 (<xref ref-type="bibr" rid="B41">Liu et al., 2004</xref>).</p>
<p>There are two main advantages of PEs over other genome editors. First, compared to the CRISPR&#x2013;Cas9 endonuclease construct, PEs are less likely to produce unintended nearby insertions and deletions, as they avoid DNA double-strand breaks (<xref ref-type="bibr" rid="B8">Chapman et al., 2012</xref>; <xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). Second, unlike base editors that can currently introduce only C &#x3e; T or A &#x3e; G conversions, PEs can produce all 12 base changes, as well as small insertions and deletions (<xref ref-type="bibr" rid="B20">Gaudelli et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). However, a notable difficulty in using PEs stems from the complication that the flexible pegRNA design has several adjustable parameters, yielding varying degrees of editing efficiency, and from the fact that there is currently a dearth of reliable computational models capable of <italic>a priori</italic> predicting these differences in efficiencies.</p>
<p>Anzalone <italic>et al.</italic> originally introduced three variants of PE, denoted as PE1, PE2, and PE3, where PE1 used a reverse transcriptase derived from Moloney murine leukemia virus (MMLV RT), PE2 used the MMLV RT from PE1 with an additional 5 point mutations, and PE3 used the Cas9 nickase in PE2 to perform non-concurrent nicks on both strands, and thus, perform non-concurrent edits on both strands to avoid creating double-stranded breaks (<xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). To date, numerous engineering approaches have been applied to improve the editing efficiency of PEs, such as mutating the PE components or co-expressing additional components together with the PE, resulting in multiple PE variants (<xref ref-type="bibr" rid="B52">Spencer and Zhang, 2017</xref>; <xref ref-type="bibr" rid="B40">Liu P. et al., 2021</xref>; <xref ref-type="bibr" rid="B9">Chen et al., 2021</xref>; <xref ref-type="bibr" rid="B48">Park et al., 2021</xref>; <xref ref-type="bibr" rid="B33">Kweon et al., 2022</xref>). There have also been additional improvements in PE design by either altering the pegRNA outside the regions directly hybridizing with the target site (<xref ref-type="bibr" rid="B42">Liu Y. et al., 2021</xref>; <xref ref-type="bibr" rid="B46">Nelson et al., 2022</xref>; <xref ref-type="bibr" rid="B49">Petri et al., 2022</xref>) or by impeding the mismatch repair mechanism (<xref ref-type="bibr" rid="B33">Kweon et al., 2022</xref>).</p>
<p>Despite the rapid experimental progress, accurately predicting the editing efficiency of PEs at previously untested genomic loci remains a major challenge. The prediction problem is complicated by the fact that PE efficiency may depend on numerous factors, such as sequence content and chromatin accessibility of the targeted locus, lengths of the PBS and RT regions on pegRNA, and the intended editing type. Notably, Kim <italic>et al.</italic> recently embarked on the difficult task of probing PE2 efficiency in human cells by using lentiviral plasmid libraries to screen the efficiency of &#x223c;48,000 pegRNA designs on &#x223c;2,000 integrated target sequences (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>); analyzing the resulting data using a deep learning model, they predicted the measured PE2 efficiency based on an extensive set of variables, including the target sequence, GC counts, melting temperature, minimum self-folding energy, and the DeepSpCas9 score of Cas9 nuclease activity (<xref ref-type="bibr" rid="B28">Kim et al., 2019</xref>). Although the authors highlighted the most relevant features using the Tree SHAP approach, some predictive variables, such as GC content and melting temperature, were highly correlated and might have confounded the interpretation. In addition, their analysis focused on common features shared across varying lengths of PBS and RT templates, rather than features distinguishing different designs. Furthermore, the experimental method measuring the editing efficiencies mostly assayed exogenously integrated sequences rather than endogenous sequences. The specific locations of the integration sites were thus not known; as a result, data and models used by Kim <italic>et al.</italic> did not incorporate epigenetic information in predicting PE2 efficiency.</p>
<p>This study presents statistical models that systematically examine the effects of both epigenomic and sequence-dependent features on PE efficiencies by analyzing data from both existing publications and additional in-house experiments (<xref ref-type="sec" rid="s11">Supplementary Method S1</xref>). We consider only PE2 editors, which have by far the largest amount of publicly available data. We describe preferred target features for each pair of PBS length (PBSL) and RT template length (RTTL), revealing specific nucleotide effects that depend on these lengths and resolving in the process discrepancies in the literature regarding the role of certain nucleotides. Our models, utilizing only a small number of parameters compared to those used in previous approaches (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>), capture both marginal and joint effects of nucleotides across pegRNA and provide practical guidelines for choosing optimal PE2 designs.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and methods</title>
<sec id="s2-1">
<title>Cell culture and transfection</title>
<p>The cell line HEK293T was obtained from the American Type Culture Collection (ATCC) and was maintained in DMEM supplemented with 10% fetal bovine serum and 1% penicillin/streptomycin at 37&#xb0;C with 5% CO<sub>2</sub>. HEK293T cells were transfected in 24-well plates with Lipofectamine 2000 (Invitrogen) following the manufacturer&#x2019;s instructions. The amount of DNA used for lipofection was 1&#xa0;&#x3bc;g per well. Transfection efficiency was always higher than 90% as determined by fluorescent microscopy or flow cytometry, following the delivery of a control GFP expression plasmid.</p>
</sec>
<sec id="s2-2">
<title>Plasmids and cloning</title>
<p>The plasmid encoding PE2 (Plasmid &#x23;132775), as well as the plasmid expressing the pegRNA (Plasmid &#x23;132777), were obtained from Addgene. PegRNAs were cloned into the plasmid backbone with paired oligonucleotides (IDT, <xref ref-type="sec" rid="s11">Supplementary Table S1</xref>), as previously described (<xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). Briefly, the oligonucleotides used to create the guide sequences were hybridized, phosphorylated, and cloned into the sgRNA vector using BsaI-HFv2 (NEB), in a reaction that included T4-PNK (NEB) and T4 DNA Ligase (NEB).</p>
</sec>
<sec id="s2-3">
<title>Next-generation sequencing (NGS)</title>
<p>DNA amplicons for NGS were generated by PCR using KAPA HiFi HotStart (Roche), according to the manufacturer&#x2019;s instructions, using primers with overhangs compatible with Nextera XT indexing (IDT, <xref ref-type="sec" rid="s11">Supplementary Table S2</xref>). Following validation of the quality of PCR products by gel electrophoresis, the PCR products were isolated using AMPure XP PCR purification beads (Beckman Coulter). Indexed amplicons were then generated using a Nextera XT DNA Library Prep Kit (Illumina), quantitated, and pooled. Libraries were sequenced with a MiSeq Nano flow cell for 251 cycles from each end of the fragment using a MiSeq Reagent Kit v2 (500 cycles). FASTQ files were created and demultiplexed using bcl2fastq v2.17.1.14 Conversion Software (Illumina). Deep sequencing was performed by the Roy J. Carver Biotechnology Center at the University of Illinois at Urbana-Champaign.</p>
</sec>
<sec id="s2-4">
<title>Replicate information</title>
<p>The in-house experiments of prime editing for 25 target sites (<xref ref-type="sec" rid="s11">Supplementary Tables S1, S2</xref>) were conducted in three biological replicates on different days by the same person.</p>
</sec>
<sec id="s2-5">
<title>Quantification of editing efficiency</title>
<p>The editing efficiency data for PE2 at endogenously edited sites in <xref ref-type="bibr" rid="B2">Anzalone et al. (2019</xref>) were obtained from the Sequence Read Archive (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/sra/">https://www.ncbi.nlm.nih.gov/sra/</ext-link>) under the accession code PRJNA565979. In-house experimental sequencing data for 25 target sites can be found in the SRA under the accession code PRJNA949853. Raw sequences from Anzalone <italic>et al.</italic> and in-house experiments were aligned to the human genome (GRCh38) using the Bowtie package (version 2.4.1), with a maximum fragment length of 500 base pairs (bp) for paired-end sequences (<xref ref-type="bibr" rid="B35">Langmead and Salzberg, 2012</xref>). Poorly aligned sequences were filtered using the Samtools package (version 1.7) with options -h -F 4 -q 10 (<xref ref-type="bibr" rid="B37">Li et al., 2009</xref>). The editing efficiency for each biological replicate was calculated by dividing the number of reads that contain the edit of interest, but were otherwise perfectly aligned, by the number of perfectly aligned reads containing either the wild-type or edited sequence. For paired-end sequencing data, half of the number of reads found in strand pairs was excluded from both the mutant and wild-type sequences to prevent double-counting of reads. The final editing percentage for each target locus was determined by averaging over the editing percentages in all biological replicates. The processed editing efficiencies in <xref ref-type="bibr" rid="B29">Kim et al. (2021</xref>) were obtained from their Supplementary Tables 3, 4.</p>
</sec>
<sec id="s2-6">
<title>Assessment of differential enrichment of histone modifications</title>
<p>Genomic locations of H3K9me3 and H3K27me3 histone modifications were determined from the aggregate of chromatin immuno-precipitation sequencing (ChIP-seq) datasets in HEK293 cells from the Encyclopedia of DNA Elements (ENCODE) consortium (<xref ref-type="bibr" rid="B13">ENCODE Project Consortium, 2012</xref>; <xref ref-type="bibr" rid="B43">Luo et al., 2020</xref>), with the accession codes having the prefix &#x201c;ENC,&#x201d; and the Gene Expression Omnibus (GEO) (<xref ref-type="bibr" rid="B4">Barrett et al., 2013</xref>), with the accession codes having the prefix &#x201c;GSM.&#x201d; For H3K9me3, the IP sequencing data used were GSM4301086 (<xref ref-type="bibr" rid="B6">Broche et al., 2021</xref>), the combination of ENCFF002AAX and ENCFF002AAZ as replicates, and the combination of GSM3452796 and GSM3452797 also as replicates (<xref ref-type="bibr" rid="B54">Tarjan et al., 2019</xref>). The corresponding inputs were GSM4301092, ENCFF000WXY, and GSM4445881 (<xref ref-type="bibr" rid="B34">Lamb et al., 2019</xref>). Raw reads were aligned to GRCh38 using Bowtie (version 2.4.1). For H3K27me3, the IP sequencing data were GSM3907592 (<xref ref-type="bibr" rid="B34">Lamb et al., 2019</xref>), GSM4301076 (<xref ref-type="bibr" rid="B6">Broche et al., 2021</xref>), GSM4586041 (<xref ref-type="bibr" rid="B15">Fan et al., 2020</xref>), and GSM4859391 (<xref ref-type="bibr" rid="B14">Fan et al., 2021</xref>); the corresponding inputs were GSM3907592, GSM4301081, GSM4586041, and GSM4859385. To determine the consensus genomic locations of H3K9me3 enrichment relative to the input, we first constructed six row vectors from the <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-normalized density of reads, partitioned into 1-kb genomic bins, with the three IP data and three input data treated as separate samples. We performed singular value decomposition (SVD) on the resulting matrix of the six row vectors (<xref ref-type="sec" rid="s11">Supplementary Method S2</xref>). A similar analysis was performed to detect the consensus genomic locations of H3K27me3 enrichment relative to the input. SVD and further analysis of variance (ANOVA) were performed using the decomposition and classification of epigenomic tensors (DeCET) package (<xref ref-type="bibr" rid="B36">Leistico et al., 2021</xref>).</p>
</sec>
<sec id="s2-7">
<title>Calculation of RNA-DNA hybridization energy</title>
<p>RNA-DNA hybridization energies were obtained by computing the difference in length-normalized Gibbs free energy (<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:msup>
<mml:mi>G</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) at 37&#xb0;C between a paired RNA-DNA oligomer and two unpaired oligonucleotides. The <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:msup>
<mml:mi>G</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> values of paired oligonucleotides were computed by adding up all the <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:msup>
<mml:mi>G</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> values of the dinucleotide components of the oligonucleotides, in addition to a helix initiation term that accounts for forming the first base pair in the double helix. The <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:msup>
<mml:mi>G</mml:mi>
<mml:mi>o</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> values of all paired dinucleotides and initiation terms were obtained from <xref ref-type="bibr" rid="B53">Sugimoto et al. (1995</xref>).</p>
</sec>
<sec id="s2-8">
<title>Summary statistic of the nucleosome occupancy signal</title>
<p>Nucleosome occupancy in the lymphoblastoid cell line GM12878 was determined from MNase-sequencing bigwig tracks from the ENCODE portal under the accession code ENCFF000VME (<xref ref-type="bibr" rid="B43">Luo et al., 2020</xref>). Average nucleosome occupancy in a particular genomic range was calculated by determining the sum of MNase-seq signals at every genomic coordinate in the protospacer region, normalized by the length of the protospacer.</p>
</sec>
<sec id="s2-9">
<title>Off-target determination</title>
<p>PE off-targets were computed by aligning the protospacer sequence to regions in the hg38 genome that are also upstream of an NGG PAM site. A genomic locus was considered to be an off-target if its aligned sequence matched the reference protospacer sequence up to three mismatches outside the PAM site and if there were no mismatches in the GG dinucleotide of the PAM. These off-targets were determined by using the Cas-OFFinder package (<xref ref-type="bibr" rid="B3">Bae et al., 2014</xref>).</p>
</sec>
<sec id="s2-10">
<title>Elastic net model: linear regression with combined <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-script">l</mml:mi>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-script">l</mml:mi>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> penalties</title>
<p>Linear regression with combined <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> penalties was computed in R (version 4.2.1) using the function cv.glmnet in the glmnet (version 4.1-4) library (<xref ref-type="bibr" rid="B60">Zou and Hastie, 2005</xref>). Given a set of <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> sequences, sequence <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of length <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and edit percentage <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> was modeled according to the equation<disp-formula id="equ1">
<mml:math id="m14">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>L</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>G</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the indicator variable for the presence of A, C, G, or T at the <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th position in the <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th sequence, <inline-formula id="inf17">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the regression coefficients for the indicator variables, and <inline-formula id="inf18">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the constant intercept. The vector of regression coefficients is estimated as<disp-formula id="equ2">
<mml:math id="m20">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>argmin</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;" separators="|">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf19">
<mml:math id="m21">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the vector of edit percentages and <inline-formula id="inf20">
<mml:math id="m22">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are parameters to be tuned via cross-validation. After performing 10-fold cross-validation with a default value of <inline-formula id="inf22">
<mml:math id="m24">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the value of <inline-formula id="inf23">
<mml:math id="m25">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> was tuned to minimize the mean cross-validated error across all 10 folds of the observed edit percentage compared to the predicted edit percentage of the withheld validation dataset.</p>
</sec>
<sec id="s2-11">
<title>Ordinary least squares (OLS) linear regression model for stepwise difference in editing efficiency of consecutive RTTLs using last two templated nucleotides</title>
<p>It was previously observed that having G as the last templated nucleotide at RTTL &#x3d; <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> tended to decrease the editing efficiency compared to the shorter pegRNA design with RTTL &#x3d; <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). In the absence of G, we noticed that the nucleotide A had a similar effect as G. We thus hypothesized that the stepwise differences in editing efficiency between two consecutive RTTLs can be predicted by the last templated nucleotides at those RTTLs. To confirm our hypothesis, we built a linear regression model represented by the equation<disp-formula id="equ3">
<mml:math id="m28">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3f5;</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the edit percentage at RTTL &#x3d; <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the range [10&#xa0;nt, 20&#xa0;nt] for each target locus, <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the indicator variable for nucleotide <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> being the <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>th templated nucleotide, the <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>s are the regression coefficients, and <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is an error term.</p>
<p>The data provided by Kim <italic>et al.</italic> contained editing efficiencies for PBSL &#x3d; 13 at only nonconsecutive RTTLs (RTTL &#x3d; 10, 12, 15, and 20); thus, the OLS was applied to the sum of stepwise differences in three ranges of RTTL: 1) <inline-formula id="inf33">
<mml:math id="m36">
<mml:mrow>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>12</mml:mn>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, 2) <inline-formula id="inf34">
<mml:math id="m37">
<mml:mrow>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mn>12</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>15</mml:mn>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and 3) <inline-formula id="inf35">
<mml:math id="m38">
<mml:mrow>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mn>15</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. To assess the performance of the OLS model, 8-fold or 10-fold cross-validation was performed using data of Anzalone <italic>et al.</italic> or Kim <italic>et al.</italic>, respectively. The target loci were partitioned into eight (or 10) sets, and OLS was trained on the <inline-formula id="inf36">
<mml:math id="m39">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> (or <inline-formula id="inf37">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>) at the target loci in the union of seven (or nine) sets (training set), where <inline-formula id="inf38">
<mml:math id="m41">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The OLS was validated on the remaining held-out set (test set). This process was repeated eight (or 10) times permuting the test set.</p>
</sec>
<sec id="s2-12">
<title>Structure of the deep neural network (DNN) for predicting prime editor efficiency</title>
<p>We trained a DNN model to predict the edit percentages of all pegRNA designs from <xref ref-type="bibr" rid="B29">Kim et al. (2021</xref>) and thereby learn the salient sequence features influencing prime editor efficiency. The inputs to the DNN were 47 <inline-formula id="inf39">
<mml:math id="m42">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 5 matrices, where the dimension 47 was chosen to accommodate the length of the &#x201c;wide-target sequences&#x201d; in Supplementary Table 4 of Kim et al. The first four columns of the input matrix were used for one-hot encoding of the edited strand sequence at locations from &#x2212;21 to &#x2b;26 relative to the nick site. To account for the fact that a given site could be targeted by different pegRNA designs of varying lengths, the fifth column of the input matrix was used to one-hot encode for the coverage of the corresponding locations by each input pegRNA.</p>
<p>To capture position-specific effects of sequence features relative to the nick site, we used independent filters for different positions in the target sequence, instead of using convolutional filters with shared weights. The input sequences were thus divided into all possible overlapping 8-mers, resulting in 40 8-mers for a sequence of length 47&#xa0;nt. Each 8-mer was passed through 10 filters of kernel size <inline-formula id="inf40">
<mml:math id="m43">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and the resulting output was then flattened to a vector. The flattened output was processed through a fully connected layer of 10 neurons. The output of the fully connected layer was then passed through a single neuron whose output was the prediction of the edit percentage of the pegRNA design at the corresponding target site. All layers of the DNN used a rectified linear unit (ReLU) activation function. To prevent overfitting, one dropout layer, where 20% of layer outputs were set to 0, was applied after the input layer, and another dropout layer was applied after the filter layer. The DNN was constructed and trained using the Python package Keras version 2.2.0.</p>
</sec>
<sec id="s2-13">
<title>DNN training and interpretation</title>
<p>The full dataset from Kim <italic>et al.</italic> was divided into 80% training, 10% validation, and 10% test sets, grouping together pegRNA designs with the same target site but with different PBSLs or RTTLs into a common set. Since the edit percentage of approximately 60% of the pegRNA designs was less than 10%, we assigned training weights to each pegRNA design in the training set in order to balance the dataset and prevent the DNN from learning biased features from over-represented poorly edited sites. For this purpose, the pegRNA designs in the training set were partitioned according to their observed edit percentages, with a bin size of 1%. A pegRNA design, indexed by <inline-formula id="inf41">
<mml:math id="m44">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, was given a weight<disp-formula id="equ4">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="center">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>30</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mn>30</mml:mn>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>30</mml:mn>
<mml:mo>%</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>where <inline-formula id="inf42">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of pegRNAs in the partition corresponding to the range [<inline-formula id="inf43">
<mml:math id="m47">
<mml:mrow>
<mml:mi mathvariant="italic">int</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf44">
<mml:math id="m48">
<mml:mrow>
<mml:mi mathvariant="italic">int</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>). The aforementioned weights were applied to the loss function during DNN training. Training was terminated when there was no improvement in the weighted mean-squared error loss between the predicted <inline-formula id="inf45">
<mml:math id="m49">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and the observed <inline-formula id="inf46">
<mml:math id="m50">
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for the validation set after 10 consecutive epochs. The DNN model at the epoch with the lowest validation loss was considered for further analysis. The Pearson correlation was determined between the predicted and observed edit percentages in the test set. The DNN was trained using the stochastic gradient descent optimizer with Nesterov momentum of 0.9, initial learning rate of 10<sup>&#x2212;3</sup>, and decay factor of 10<sup>&#x2212;6</sup>.</p>
<p>To extract sequence features learned by the DNN, we used a modified version of the simulated annealing (SA) algorithm to determine the optimal sequences that maximize the output of the DNN (<xref ref-type="bibr" rid="B55">Tsallis and Stariolo, 1996</xref>; <xref ref-type="bibr" rid="B16">Finnegan et al., 2020</xref>); we then used the MaxEnt algorithm to identify salient positions and nucleotide preferences in the optimal sequences (<xref ref-type="bibr" rid="B17">Finnegan and Song, 2017</xref>) (<xref ref-type="sec" rid="s11">Supplementary Methods S3, S4</xref>).</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Constitutive heterochromatin impedes PE2 efficiency</title>
<p>In order for a PE to be able to edit its target site, the spacer region of pegRNA must first access and hybridize to the complementary DNA sequence at the targeted locus. We thus hypothesize that target sites residing in heterochromatin, a condensed state of highly packaged DNA, may be blocked from access by PEs and thereby have low editing efficiency. Consistent with our reasoning, it has been previously observed that chromatin structure can interfere with the CRISPR-Cas9 endonuclease activity (<xref ref-type="bibr" rid="B58">Wu et al., 2014</xref>; <xref ref-type="bibr" rid="B11">Daer et al., 2017</xref>; <xref ref-type="bibr" rid="B59">Yarrington et al., 2018</xref>; <xref ref-type="bibr" rid="B38">Liu et al., 2019</xref>). We thus investigated the effect of heterochromatin on PE efficiency using genome editing data from <xref ref-type="bibr" rid="B2">Anzalone et al. (2019</xref>), <xref ref-type="bibr" rid="B29">Kim et al. (2021</xref>), and additional in-house validation experiments. To determine heterochromatin locations in HEK293 cells, which were used in all three studies, we integrated several publicly available H3K9me3 and H3K27me3 ChIP-seq datasets as a proxy for indicating closed chromatin regions (Methods). After obtaining consensus regions of enrichment for each of these histone modifications using singular value decomposition of the joint IP and input data matrices (Methods; <xref ref-type="sec" rid="s11">Supplementary Method S2</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S1</xref>), we computed the genomic distances from each target site to the nearest H3K9me3- and H3K27me3-modified regions and used these distances as features for predicting editing efficiency. 
When considering edit percentage as a function of the distance from the protospacer to the nearest H3K9me3-modified region, we observed that the edit percentages at sites close to H3K9me3 were approximately 0%, whereas higher edit percentages were possible farther away (<xref ref-type="sec" rid="s11">Supplementary Figure S2A</xref>).</p>
<p>This observation motivated us to search for binary classification of target sites as being either weakly or strongly editable, choosing an efficiency threshold of 1% to binarize the data. We then trained a multivariate logistic regression model to classify all 32 endogenous target sites in Kim <italic>et al.</italic>, using the distances to H3K9me3- and H3K27me3-modified regions as features (Methods; <xref ref-type="sec" rid="s11">Supplementary Method S2</xref>). The probability threshold separating the binary editing categories was set to maximize the finite positive likelihood ratio of the true positive rate to the false positive rate. Performing the <italic>t</italic>-test for each regression coefficient and determining the bivariate and univariate logistic regression decision boundary showed the H3K27me3 variable to be insignificant (<inline-formula id="inf47">
<mml:math id="m51">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.52</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <xref ref-type="sec" rid="s11">Supplementary Table S3</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S2B</xref>). After removing the insignificant feature and retraining the model, the probability threshold that maximized the finite positive likelihood ratio corresponded to a distance threshold of 26&#xa0;kb for the nearest H3K9me3 peak from the protospacer (<xref ref-type="fig" rid="F1">Figure 1A</xref>). We observed a substantial increase in PE efficiency at the target sites located at least &#x223c;10&#xa0;kb away from H3K9me3 modification, suggesting that chromatin assumed an open conformation around this distance threshold. Similarly, it was reported that transcription of genes started to increase &#x223c;10&#xa0;kb away from H3K9me3 peaks (<xref ref-type="bibr" rid="B5">Barski et al., 2007</xref>). Using the 26&#xa0;kb distance threshold, the accuracy of the classifier on the test set consisting of the data from Anzalone <italic>et al.</italic> and additional in-house validation experiments was 75%, with an area under the receiver operator characteristic curve (AUROC) of 0.90 (<xref ref-type="fig" rid="F1">Figures 1B, C</xref>). Our findings thus support that heterochromatin may be sufficient to block PEs from targeting; however, heterochromatin is likely not necessary to hinder PE targeting efficiency, and other factors may also contribute to low edit rate, as discussed below.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Constitutive heterochromatin near target sites impedes prime editing. <bold>(A)</bold> Scatter plot of binary classes of editability as a function of log<sub>10</sub> distance from the target site to the nearest H3K9me3 peak (Methods; <xref ref-type="sec" rid="s11">Supplementary Method S2</xref>). The distance threshold (dashed gray line) is at &#x223c;26&#xa0;kb. The logistic regression was trained on 32 endogenous targets from Kim <italic>et al.</italic> and tested on editing data from Anzalone <italic>et al.</italic> and our in-house experiments. <bold>(B)</bold> Receiver operator characteristic curve (ROC) for the logistic regression evaluated on the test set, with an area under the ROC (AUROC) of 0.90. <bold>(C)</bold> Confusion matrix for the logistic regression predictions on the test set. The overall accuracy was 75%. <bold>(D)</bold> Scatter plot of edit percentage as a function of the MNase-seq signal averaged over the protospacer region (Pearson <inline-formula id="inf48">
<mml:math id="m52">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.49</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8.72</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) using the endogenous target sites from Kim <italic>et al.</italic> and Anzalone <italic>et al.</italic> that are at least 26&#xa0;kb away from H3K9me3 peaks. The edit percentage for each target locus with different pegRNA designs was determined by averaging the edit percentages of all pegRNA designs sharing the same protospacer. <bold>(E)</bold> Scatter plot of edit percentages averaged over all integrated target sites sharing the same protospacer as a function of log-transformed number of potential off-targets. The colors represent the density of data points in the scatter plot.</p>
</caption>
<graphic xlink:href="fgene-14-1222112-g001.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>Nucleosome occlusion and pegRNA off-targets may decrease prime editing efficiency</title>
<p>Having confirmed that high-order chromatin accessibility affects PE efficiency, we next investigated the effect of local chromatin structure on prime editing. It has been previously reported both <italic>in vitro</italic> and <italic>in vivo</italic> that nucleosomes inhibit the efficiency of CRISPR-Cas9 endonuclease, suggesting that histone proteins may block the target DNA access to Cas9 (<xref ref-type="bibr" rid="B22">Hinz et al., 2015</xref>; <xref ref-type="bibr" rid="B59">Yarrington et al., 2018</xref>). Given the shared components between the CRISPR-Cas9 and PE constructs, we thus sought to test whether nucleosome positioning at the target sites might also inhibit PE editing efficiencies. Since there were no publicly available MNase-seq datasets for HEK293 and most nucleosome positioning was shown to possess some degree of consistency across different cell lines (<xref ref-type="bibr" rid="B18">Gaffney et al., 2012</xref>) and partially exhibit intrinsic DNA sequence preferences (<xref ref-type="bibr" rid="B24">Ioshikhes et al., 1996</xref>; <xref ref-type="bibr" rid="B51">Segal et al., 2006</xref>; <xref ref-type="bibr" rid="B21">Gupta et al., 2008</xref>; <xref ref-type="bibr" rid="B25">Jin et al., 2016</xref>), we used the available nucleosome occupancy data in the lymphoblastoid cell line GM12878 (Methods). We found a significant negative correlation between PE edit percentage and nucleosome occupancy in the endogenous target sites of Kim <italic>et al.</italic> and Anzalone <italic>et al.</italic> (Pearson <inline-formula id="inf49">
<mml:math id="m53">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.46</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.36</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; <xref ref-type="fig" rid="F1">Figure 1D</xref>; Methods). While target sites in nucleosomal DNA tended to have low edit rates, target sites in nucleosome-free regions did not necessarily have high edit rates. Similar to the effect of heterochromatin, our findings thus support that nucleosomes are sufficient, but not necessary, to partially block PEs from targeting, as other factors may also contribute to low edit rate, as shown in the following sections.</p>
<p>Given that the PBS and spacer regions of pegRNA must hybridize to the target DNA sequence in order for editing to occur, with the spacer being the longer of the two sequences, we examined the off-target effect of having several genomic loci complementary to the spacer region on editing efficiency. We found only a weak but statistically significant negative correlation between the number of spacer off-targets and the edit rate averaged over all pegRNAs with the same spacer (Spearman <inline-formula id="inf50">
<mml:math id="m54">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.11</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf51">
<mml:math id="m55">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>7</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; <xref ref-type="fig" rid="F1">Figure 1E</xref>; Methods). This result indicates that the presence of off-targets does not substantially affect the editing efficiency at the intended target and is consistent with the previous finding for single-guide RNAs (sgRNA) of CRISPR-Cas9 (<xref ref-type="bibr" rid="B45">Moreno-Mateos et al., 2015</xref>).</p>
</sec>
<sec id="s3-3">
<title>Last two templated nucleotides embody stepwise differences in PE2 efficiency</title>
<p>Our aforementioned logistic regression model has revealed that target sites close to heterochromatin are unlikely to be editable. Once a target site away from heterochromatin is selected, the next important step is to optimize the pegRNA design by adjusting the lengths of PBS and RT templates. It was previously reported that altering the PBS length (PBSL) and RT template length (RTTL) could have a drastic effect on PE2 efficiency, even when targeting the same site for the same edit, and that having G as the last templated nucleotide tended to decrease the PE2 efficiency (<xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>). We further observed in Anzalone <italic>et al.</italic>&#x2019;s data that 1) at the <italic>EMX1</italic> locus containing no Gs in the RT region (Anzalone <italic>et al.</italic>&#x2019;s Figure 2B), having A as the last templated nucleotide consistently resulted in lower edit rates, suggesting that other nucleotides apart from G might also modulate the editing efficiency; and 2) at the <italic>FANCF</italic> locus (Anzalone <italic>et al.</italic>&#x2019;s Figure 2B), even though the last templated nucleotide was G at RTTLs of both 10 and 18&#xa0;nts, the edit rate for 18&#xa0;nts was much higher than that for 10&#xa0;nts, suggesting that the presence of G alone could not explain the variability of edit rates as a function of RT template sequence composition. These findings motivated us to use the sequence content in the target region as features in predicting the PE2 efficiency as PBSL and RTTL were varied. We thus developed an ordinary least squares (OLS) linear regression model to predict stepwise differences in edit percentages between two adjacent RTTLs, using the last templated nucleotides as features (Methods).</p>
<p>Anzalone <italic>et al.</italic> generated edit percentages (defined as the fractions of total reads with correct edits) of PE2 at eight different gene loci using pegRNAs with varying ranges of RTTL for each locus, with the largest overlap of lengths among the designs being between 10 and 20&#xa0;nts. We thus chose this range [10&#xa0;nt, 20&#xa0;nt] to train an OLS linear regression model for predicting the stepwise differences, <inline-formula id="inf52">
<mml:math id="m56">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> in edit percentages between <inline-formula id="inf53">
<mml:math id="m57">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf54">
<mml:math id="m58">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> RTTL (<xref ref-type="fig" rid="F2">Figure 2A</xref>; Pearson <inline-formula id="inf55">
<mml:math id="m59">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> &#x3d; 0.79; Methods). Eight-fold cross-validation of holding out and testing on each gene locus supported its robustness, with the root mean square error (RMSE) of the test set being mostly similar to that of the training set (&#x223c;2.3%) (<xref ref-type="sec" rid="s11">Supplementary Table S4</xref>). Our model learned the effect of the last templated nucleotide on <inline-formula id="inf56">
<mml:math id="m60">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to be in increasing order of G, A, C, and T for the (<inline-formula id="inf57">
<mml:math id="m61">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>)<sup>th</sup> templated nucleotide and decreasing order of G, A, C, and T for the <inline-formula id="inf58">
<mml:math id="m62">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<sup>th</sup> templated nucleotide (<xref ref-type="sec" rid="s11">Supplementary Table S5</xref>). The relative ordering of G &#x3c; A &#x3c; C &#x3c; T resembles the order of ionization energy of the nucleotides (<xref ref-type="bibr" rid="B7">Burrows and Muller, 1998</xref>), but clarifying the connection would require further experimental and theoretical investigation.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>OLS linear regression using the last two templated nucleotides robustly predicts stepwise differences in edit percentage between two consecutive RTTLs. <bold>(A)</bold> Scatter plot of the predicted <italic>versus</italic> observed differences in edit percentage when the OLS linear regression model was trained and tested on Anzalone <italic>et al.</italic>&#x2019;s data. <bold>(B)</bold> Scatter plot of the predicted <italic>versus</italic> observed differences in edit percentage when trained on the integrated sites from Kim <italic>et al.</italic>&#x2019;s data and tested on Anzalone <italic>et al.</italic>&#x2019;s data. <bold>(C)</bold> Predicted and observed edit percentages as a function of RTTL for eight different target sites from Anzalone <italic>et al.</italic> For each target, the absolute predicted edit percentage at RTTL &#x3d; 10 was set such that the average of the predicted edit percentages across RTTLs matches the corresponding average of the observed edit percentages.</p>
</caption>
<graphic xlink:href="fgene-14-1222112-g002.tif"/>
</fig>
<p>Having confirmed that our OLS linear regression model performed well on the eight target sites from Anzalone <italic>et al.</italic>, we further investigated whether a similar approach could be generalized to Kim <italic>et al.</italic>&#x2019;s data (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>), which measured editing efficiencies at thousands of target sites by high-throughput sequencing. The PBSL was set to 13&#xa0;nts, to be consistent with the range used in Anzalone <italic>et al.</italic> Since Kim <italic>et al.</italic> measured the edit percentages only for RTTLs &#x3d; 10, 12, 15, and 20, we could not directly apply our stepwise approach on these data. We thus divided the data for different RTTLs into three ranges and trained an OLS linear regression model to predict consecutive stepwise differences in edit percentages for each range separately (Methods): 1) RTTLs from 10 to 12; 2) RTTLs from 12 to 15; and 3) RTTLs from 15 to 20 (<xref ref-type="sec" rid="s11">Supplementary Figure S3</xref>). Ten-fold cross-validation verified the robustness of our model within each range (5.1% RMSE; average of Pearson correlation across folds &#x3d; 0.33 for RTTLs in the range [10,12] and [15,20], and 0.25 for RTTLs of [12,15]; <xref ref-type="sec" rid="s11">Supplementary Table S6</xref>). However, the effect sizes of the four nucleotides differed somewhat across the three ranges (<xref ref-type="sec" rid="s11">Supplementary Tables S7&#x2013;S9</xref>). When the RTTL is in the range [15,20], the effect on <inline-formula id="inf59">
<mml:math id="m63">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> was in the increasing order of G, A, T, and C for the (<inline-formula id="inf60">
<mml:math id="m64">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>)<sup>th</sup> templated nucleotide and decreasing order of G, A, C, and T for the <inline-formula id="inf61">
<mml:math id="m65">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<sup>th</sup> templated nucleotide (<xref ref-type="sec" rid="s11">Supplementary Table S9</xref>), similar to the aforementioned results for Anzalone <italic>et al.</italic>&#x2019;s data. RTTL in the [12,15] range also showed the pattern that G as the (<inline-formula id="inf62">
<mml:math id="m66">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>)<sup>th</sup> templated nucleotide was associated with the smallest <inline-formula id="inf63">
<mml:math id="m67">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and G as the <inline-formula id="inf64">
<mml:math id="m68">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<sup>th</sup> templated nucleotide was associated with the largest <inline-formula id="inf65">
<mml:math id="m69">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> compared to other nucleotides at these respective positions (<xref ref-type="sec" rid="s11">Supplementary Table S8</xref>). This pattern was reversed for the RTTL in the [10,12] range, where G as the <inline-formula id="inf66">
<mml:math id="m70">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<sup>th</sup> templated nucleotide was associated with the smallest <inline-formula id="inf67">
<mml:math id="m71">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> compared to other nucleotides at this position (<xref ref-type="sec" rid="s11">Supplementary Table S7</xref>).</p>
<p>Testing the regression model trained on Kim <italic>et al.</italic>&#x2019;s data on Anzalone <italic>et al.</italic>&#x2019;s data in the corresponding RTTL ranges (<xref ref-type="sec" rid="s11">Supplementary Tables S7&#x2013;S9</xref>), the predicted <inline-formula id="inf68">
<mml:math id="m72">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> still significantly correlated with the observed values, albeit to a lesser extent than the model directly trained on Anzalone <italic>et al.</italic>&#x2019;s data (Pearson <inline-formula id="inf69">
<mml:math id="m73">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> &#x3d; 0.37 <inline-formula id="inf70">
<mml:math id="m74">
<mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>7.13</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; <xref ref-type="fig" rid="F2">Figures 2A, B</xref>). Repeating the same analysis in individual RTTL ranges revealed that the predicted <inline-formula id="inf71">
<mml:math id="m75">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> was significantly correlated with the observed values only in the 15&#x2013;20 RTTL range (Pearson <inline-formula id="inf72">
<mml:math id="m76">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.25</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf73">
<mml:math id="m77">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.34</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in the 10&#x2013;12 RTTL range; Pearson <inline-formula id="inf74">
<mml:math id="m78">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf75">
<mml:math id="m79">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.81</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in the 12&#x2013;15 RTTL range; Pearson <inline-formula id="inf76">
<mml:math id="m80">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.62</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf77">
<mml:math id="m81">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.12</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> in the 15&#x2013;20 RTTL range). We also predicted the trend of absolute edit percentages in Anzalone <italic>et al.</italic>&#x2019;s data as a function of the RTTL (<xref ref-type="fig" rid="F2">Figure 2C</xref>). Overall, the model trained on Anzalone <italic>et al.</italic>&#x2019;s data was able to reproduce the observed trend that the edit percentage decreased whenever the last templated nucleotide was G (green line in <xref ref-type="fig" rid="F2">Figure 2C</xref>). The models trained on Kim <italic>et al.</italic>&#x2019;s data were able to capture the observed trend in longer RTTLs, but not so well in shorter RTTLs (magenta line in <xref ref-type="fig" rid="F2">Figure 2C</xref>), consistent with the fact that our predicted <inline-formula id="inf78">
<mml:math id="m82">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> was more accurate in the long RTTL range [15,20].</p>
<p>There were some discrepancies between the two independent datasets regarding the regression coefficients trained on shorter RTTL designs. For example, Anzalone <italic>et al.</italic> observed a decrease in editing efficiency whenever the last templated nucleotide was G throughout all RTTLs. By contrast, Kim <italic>et al.</italic> observed that, on average, editing efficiency was highest when the last templated nucleotide was G at RTTLs &#x3d; 10 and 12, while it was lowest for the last templated G only at RTTL &#x3d; 20 (Kim <italic>et al.</italic>&#x2019;s Figure 2F). Further investigation is needed to understand why G has an opposite effect on editing efficiency at short RTTLs and why this phenomenon is not universal across independent datasets. We shall revisit the position-specific effects of G on editing efficiency in subsequent sections.</p>
</sec>
<sec id="s3-4">
<title>Elastic net regression accurately predicts editing efficiency using the sequence content of target DNA and flanking regions</title>
<p>The OLS linear regression analysis yielded insight into the preferred RTTLs of pegRNAs for a given target site, but it could not reveal the optimal target, given multiple PAM candidates in the region of interest. We therefore built an elastic net model (<xref ref-type="bibr" rid="B60">Zou and Hastie, 2005</xref>), a linear regression model with <inline-formula id="inf79">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> shrinkage to impose sparsity of features and <inline-formula id="inf80">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> shrinkage to reduce overfitting, for each combination of PBSL and RTTL, separately (PBSLs &#x3d; 7, 9, 11, 13, 15, and 17 and RTTLs &#x3d; 10, 12, 15, and 20) (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>) and predicted the absolute editing efficiency of a target site using only the sequence information of the PBS, RT template, and its flanking regions (Methods). The predictive variables were the indicator variables encoding the four nucleotides at each position of the 47-bp-long target site sequence, except for the fixed GG dinucleotide in the NGG PAM (Methods). Since this dataset did not contain information about where the target sites were integrated in the genome, features involving relative distance to epigenomic modifications were not included in our model. Our approach was much simpler and more interpretable than the previously reported model pooling together different PBSL and RTTL designs and using 1,766 features (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>), many of which might have been highly correlated and redundant.</p>
<p>The elastic net predictions significantly correlated with the observed edit percentages in each pair of PBSL and RTTL (Pearson <inline-formula id="inf81">
<mml:math id="m85">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.60</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>; <inline-formula id="inf82">
<mml:math id="m86">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>150</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; <xref ref-type="sec" rid="s11">Supplementary Table S10</xref>). <xref ref-type="fig" rid="F3">Figure 3A</xref> shows the regression coefficients for PBSL &#x3d; 13 and RTTL &#x3d; 15 (other combinations of PBSL and RTTL are given in <xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>). In general, the higher the GC content in the PBS region, the more likely the target site was to have high editing efficiency (<xref ref-type="sec" rid="s11">Supplementary Figure S5</xref>), consistent with the previous report (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>). However, the extent of this preference depended on the PBSL, that is, the high-performance designs with shorter PBSL required higher GC content in the PBS region than those with longer PBSL (<xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>). Considering that G:C bonds are stronger than A:T bonds, this result suggested that a short PBS lacking a sufficient number of G/C nucleotides may not stably hybridize to the 3&#x2019; end of pegRNA, thereby yielding low editing efficiency. To further investigate this idea, we computed the length-normalized RNA-DNA hybridization energy between the pegRNA PBS region and the corresponding DNA and found a significant negative correlation between PBS-DNA hybridization energy and edit percentage for all combinations of PBSLs and RTTLs (<xref ref-type="fig" rid="F3">Figures 3B, C</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S6</xref>; Methods). Additionally, we confirmed again that the shorter the PBSL, the larger the magnitude of the anti-correlation between editing efficiency and PBS-DNA hybridization energy (<xref ref-type="fig" rid="F3">Figure 3B</xref> vs. <xref ref-type="fig" rid="F3">Figure 3C</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S6</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Elastic net regression model learns sequence features important for predicting PE2 efficiency. <bold>(A)</bold> Elastic net regression coefficients for all edited-strand nucleotides in the range [&#x2212;21,&#x2b;26] relative to the nick site for PBSL &#x3d; 13 and RTTL &#x3d; 15. The model was trained on the editing data for integrated target sites from Kim <italic>et al.</italic> The nucleotide heights represent the absolute value of the corresponding regression coefficients. The vertical lines denote the borders of the PBS and RT template regions, and the PAM site is shaded. <bold>(B)</bold> Scatter plot of the edit percentage <italic>versus</italic> PBS&#x2013;DNA hybridization energy for PBSL &#x3d; 7 and RTTL &#x3d; 10, where the colors represent the density of data points. Pearson <inline-formula id="inf83">
<mml:math id="m87">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.45</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>78</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. <bold>(C)</bold> Same as in <bold>(B)</bold>, but for PBSL &#x3d; 17 and RTTL &#x3d; 20. Pearson <inline-formula id="inf84">
<mml:math id="m88">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.13</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2.8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. <bold>(D)</bold> Violin plot of edit percentages for each PBSL in two groups of target sites, divided based on the GC composition of the 9-bp-long sequence upstream of the nick site. The first group contains target sites that have only G or C in the range [&#x2212;7,&#x2212;1] and only A or T at positions &#x2212;9 and &#x2212;8. The second group contains target sites that have fewer than six&#xa0;G or C nucleotides in the range [&#x2212;7, &#x2212;1]. The white dots represent the median edit percentage, and the thick black lines represent the portion of the data between the first and third quartiles of edit percentage. <bold>(E)</bold> Bar plot denoting the fraction of target sites in each quantile of edit percentages. Blue (or orange) bars are for target sites with (or without) G at position &#x2b;10 for PBSL &#x3d; 13 and RTTL &#x3d; 15. <bold>(F)</bold> Same as in <bold>(E),</bold> but for A at position &#x2b;17.</p>
</caption>
<graphic xlink:href="fgene-14-1222112-g003.tif"/>
</fig>
<p>The regression coefficients also showed that the presence of G as the last templated nucleotide tended to increase the editing efficiency when RTTL &#x3d; 10, regardless of the PBSL, or when RTTL &#x3d; 12 and PBSL <inline-formula id="inf85">
<mml:math id="m89">
<mml:mrow>
<mml:mo>&#x2264;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 9 (<xref ref-type="fig" rid="F3">Figure 3A</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>). This finding recapitulated the observation made by Kim <italic>et al.</italic> that the editing efficiency was, on average, highest when the last templated nucleotide was G for RTTL &#x3d; 10 or 12 (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>); however, our analysis clearly highlighted that this effect depended on the PBSL being sufficiently short for the case RTTL &#x3d; 12. By contrast, when PBSL <inline-formula id="inf86">
<mml:math id="m90">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 11 and RTTL <inline-formula id="inf87">
<mml:math id="m91">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 15, the presence of G as the last templated nucleotide decreased the editing efficiency (<xref ref-type="fig" rid="F3">Figure 3A</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>), consistent with the findings of our aforementioned OLS model and <xref ref-type="bibr" rid="B2">Anzalone et al. (2019</xref>); for shorter PBSL, however, the last templated nucleotide had only a minor effect on editing efficiency, and the GC content in the PBS region instead had a pronounced effect.</p>
<p>The large effect size of the GC content in the PBS region for short PBSL designs, together with the accompanying reduction in the negative effect of the last templated nucleotide, suggested that optimal choices of pegRNA for target loci containing high GC content in the PBS region would involve short PBSLs. Supporting this idea, we observed that when the pegRNA contained only G/C in the range [&#x2212;7,&#x2212;1] and A/T at &#x2212;9 and &#x2212;8, the edit percentage was, on average, highest for PBSL &#x3d; 7 (<xref ref-type="fig" rid="F3">Figure 3D</xref>; highest median edit percentage of 20.5% when PBSL &#x3d; 7); here, we considered A/T at the &#x2212;9 and &#x2212;8 positions based on the regression coefficients learned by the elastic net for PBSL &#x3d; 7 (<xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>). When fewer than 6&#xa0;G/Cs were found in the [&#x2212;7,&#x2212;1] range, PBSLs between 11 and 15 had, on average, higher editing efficiency than other PBSLs (<xref ref-type="fig" rid="F3">Figure 3D</xref>; highest median edit percentage of 9.8% when PBSL &#x3d; 13). Given a target site, these findings thus provided a general guideline for determining the optimal PBSL based solely on the 9-bp-long sequence upstream of the nick site in the edited strand.</p>
<p>Some patterns of regression coefficients were shared among most combinations of PBSL and RTTL. Based on the magnitude of regression coefficients, the most salient nucleotides were those positioned around the nick site (<xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>): C and G nucleotides at locations from &#x2212;1 to &#x2b;3, immediately 5&#x2019; to the PAM site, led to high editing efficiency, whereas T in the range from &#x2212;2 to &#x2b;3 was to be avoided. G/A at the &#x2212;17 position was positively correlated with editing efficiency, while T at the same position was negatively correlated. In general, C at the &#x2b;20 position was positively correlated with editing efficiency across designs. For the PAM site, AGG was favored and CGG disfavored for optimizing editing efficiency. Immediately after the PAM site, G at the &#x2b;7 position was strongly anti-correlated with editing efficiency. Surprisingly, the presence of G at the &#x2b;10 position was significant and tended to increase editing efficiency, not only for RTTL &#x3d; 10, in which case G would be the last templated nucleotide as described previously, but also for RTTL &#x3d; 15 or 20 when PBSL <inline-formula id="inf88">
<mml:math id="m92">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 11 (<xref ref-type="fig" rid="F3">Figure 3E</xref>). Considering that Cas9 variants similar to SpCas9 in size were known to recognize PAM sequences of varying length between 2 and 8&#xa0;nts (<xref ref-type="bibr" rid="B1">Anders et al., 2014</xref>) and that the &#x2b;10 position was only 4 bps downstream of the NGG PAM site, the fact that the presence of G at the &#x2b;10 position consistently increased the editing efficiency across multiple RTTLs suggested that SpCas9 preferentially interacted with this specific nucleotide. Another unexpected finding was that select edited strand nucleotides even outside the regions of the protospacer, PBS, and RT template seemed to modulate editing efficiency. For example, when PBSL &#x3d; 13 and RTTL &#x3d; 15, the presence of A at the &#x2b;17 position, lying outside the range of the RT templated region, tended to decrease the editing efficiency (<xref ref-type="fig" rid="F3">Figure 3F</xref>).</p>
<p>In summary, the elastic net regression models trained on each combination of PBSL and RTTL learned distinct and common sequence features modulating editing efficiency. The effect of the last templated G nucleotide on editing efficiency depended on both PBSL and RTTL of pegRNA designs. Moreover, pegRNAs with short PBSLs highly depended on the GC content of the PBS region, perhaps to help stabilize the hybridization to the complementary DNA. The PBS RNA-DNA hybridization energy was significantly anti-correlated with edit percentage in all combinations of PBSL and RTTL, but pegRNAs with shorter PBSLs showed more pronounced dependence. Finally, the presence of G/C nucleotides was most critical immediately 5&#x2019; to the PAM site for all combinations of PBSL and RTTL.</p>
</sec>
<sec id="s3-5">
<title>Deep neural network accurately predicts PE2 efficiency and yields interpretable features</title>
<p>It is challenging to model nonlinear effects of coupled nucleotides at multiple positions using elastic net regression. To extend our linear regression approach, we thus trained a DNN model on Kim <italic>et al.</italic>&#x2019;s data (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>) to predict the edit percentage of a pegRNA design given its length and the sequence of the target site and flanking regions (<xref ref-type="fig" rid="F4">Figure 4A</xref>; Methods). The Pearson correlation coefficient between the observed and predicted edit percentages in the test set was 0.73 (<xref ref-type="fig" rid="F4">Figure 4B</xref>; Methods).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>DNN learns marginal and multi-nucleotide sequence features. <bold>(A)</bold> DNN architecture. The first four columns of the input matrix indicate the presence of a particular nucleotide in the range [&#x2212;21,&#x2b;26] relative to the nick site via one-hot encoding; the 5th column indicates whether a particular position resides in the union of the PBS and RT template regions of the pegRNA being considered. The input is passed through a layer of kernels and two dense layers to yield an output value between 0 and 100. <bold>(B)</bold> Scatter plot of observed <italic>versus</italic> predicted edit percentages of the target sites in the test set (Pearson <inline-formula id="inf89">
<mml:math id="m93">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.73</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>300</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>). <bold>(C)</bold> Position-wise KL divergence of MaxEnt output nucleotide distributions with respect to the uniform nucleotide distribution (<xref ref-type="sec" rid="s11">Supplementary Method S4</xref>). The vertical lines denote the borders of the PBS and RT template regions, and the PAM site is shaded. <bold>(D)</bold> Histograms of marginal edit percentages of the target sites containing T at &#x2b;9 (blue) <italic>versus</italic> A, C, and G at &#x2b;9 (orange). <bold>(E)</bold> Same as in <bold>(D)</bold>, but the target sites are conditioned to have T at &#x2b;8 and G at &#x2b;10.</p>
</caption>
<graphic xlink:href="fgene-14-1222112-g004.tif"/>
</fig>
<p>To understand optimal sequences associated with high edit percentages, we applied a simulated annealing (SA) method for maximizing the prediction of a trained DNN over its input space via Markov Chain Monte Carlo (MCMC) sampling (<xref ref-type="bibr" rid="B16">Finnegan et al., 2020</xref>). We accelerated the simulation by using a broadened sampling distribution stemming from nonextensive statistical mechanics (<xref ref-type="bibr" rid="B55">Tsallis and Stariolo, 1996</xref>) and obtained the target sequences maximizing the predicted edit percentages for each combination of PBSL and RTTL (<xref ref-type="sec" rid="s11">Supplementary Method S3</xref>; <xref ref-type="sec" rid="s11">Supplementary Tables S11&#x2013;S14</xref>). To identify the DNN-learned salient features in these optimal sequences, we next applied MaxEnt, another MCMC sampling method based on the maximum entropy principle, for generating new input sequences that produce similar DNN predictions as those of the initial input sequence (<xref ref-type="bibr" rid="B17">Finnegan and Song, 2017</xref>). Upon initializing the MaxEnt chains at the optimal target sequences, nucleotide preferences important for maximizing PE efficiency were extracted by calculating the position-wise Kullback&#x2013;Leibler (KL) divergence between the nucleotide distribution in the sampled sequences and the uniform null distribution; in this formalism, the larger the KL divergence, the more relevant the nucleotide (<xref ref-type="fig" rid="F4">Figure 4C</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S7</xref>; <xref ref-type="sec" rid="s11">Supplementary Method S4</xref>).</p>
<p>The DNN confirmed the sequence features previously detected by the elastic net model. For example, it learned that C at the &#x2b;1 position, G at the &#x2b;3 position, and G at the &#x2212;17 position relative to the nick site were associated with high edit percentage; other similarities with the elastic net results included the preferences for A as the first nucleotide in the PAM sequence, for C or G in the PBS region, and for C at the &#x2b;20 position. In addition, the DNN also discovered new features. For example, although the elastic net model detected no strong preference for any particular nucleotide at position &#x2b;9, except for perhaps a weak preference for G (<xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>), the DNN found T at this position to be associated with optimal editing. This discrepancy in the feature importance arose from the difference in model architectures, that is, the elastic net model learned only the independent effects of individual nucleotides, whereas the DNN accounted for aggregate effects of nucleotides captured by the filters. When considering only the marginal effect of single nucleotides at the &#x2b;9 position, target sequences with T at that position actually had a lower edit percentage on average than those with other nucleotides at that same position (9.00% for T vs. 9.72% for A, C, and G at &#x2b;9; <xref ref-type="fig" rid="F4">Figure 4D</xref>). However, imposing the &#x2b;8 position to be T and the &#x2b;10 position to be G showed that target sequences with T at &#x2b;9 had a higher edit percentage on average than those with other nucleotides at &#x2b;9 (14.33% for T vs. 10.72% for A, C, and G at &#x2b;9; <xref ref-type="fig" rid="F4">Figure 4E</xref>; <xref ref-type="sec" rid="s11">Supplementary Figures S8A, B</xref>). 
Similarly, the marginal effect of T at the &#x2212;4 position, residing in the PBS region, was negligible; however, the DNN uncovered a role of T in the context of a GC-rich background: for sequences with 100% GC content in the range [&#x2212;7,&#x2212;1], except at the &#x2212;4 position, the presence of T at &#x2212;4 substantially increased the edit percentage by 6.90% on average (<xref ref-type="sec" rid="s11">Supplementary Figures S8C, D</xref>). Finally, the DNN found a preference for CAG at [&#x2b;1,&#x2b;3] in optimal target sequences, whereas the elastic net model found only a marginal effect of G/C at &#x2b;2 and a less pronounced trinucleotide effect of C (G/C) G in the same region (<xref ref-type="sec" rid="s11">Supplementary Figures S8E, F</xref>). In summary, our DNN was able to predict the observed edit percentage with high accuracy and yielded interpretable results regarding both marginal and context-dependent sequence features of optimal target sites.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>We have demonstrated that both regional heterochromatin and local nucleosome occlusion of target sites may decrease PE2 editing efficiency, with a more pronounced effect observed for the H3K9me3 modification, perhaps because unwinding DNA may be particularly difficult in a heterochromatin environment where multiple nucleosomes are condensed together. These results are consistent with those of a recent report showing that inducing an open chromatin state improves PE and base editor efficiencies (<xref ref-type="bibr" rid="B39">Liu et al., 2022</xref>). Our study thus provides evidence for the hypothesis that chromatin structure modulates genome editing efficiency by interfering with the accessibility of target sites to the Cas9 protein and guide RNA. Unlike the significant predictive power of H3K9me3 in classifying strongly and weakly editable target sites, however, the feature of H3K27me3 was insignificant (<xref ref-type="sec" rid="s11">Supplementary Figures S2A, B</xref>; <xref ref-type="sec" rid="s11">Supplementary Table S3</xref>). While both H3K9me3 and H3K27me3 are associated with heterochromatin formation, H3K9me3 is associated with constitutive heterochromatin, whereas H3K27me3 is associated with facultative heterochromatin (<xref ref-type="bibr" rid="B50">Saksouk et al., 2014</xref>), which depends on the presence of certain stimuli and thus remains conducive to dynamic changes (<xref ref-type="bibr" rid="B47">Oberdoerffer and Sinclair, 2007</xref>). Further investigation is needed to decipher the precise differences in folding pattern that make constitutive heterochromatin more resistant to editing than facultative heterochromatin.</p>
<p>Our work shows that, in addition to the chromatin environment, the local sequence content of target sites can also modulate editing efficiency. For example, strategic positioning of G and C nucleotides in the PBS and RT regions increases the editing efficiency, indicating that G:C base pairing between DNA and pegRNA helps anchor the pegRNA to the target site prior to editing. A similar observation regarding the GC content in the PBS region was previously reported (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>); however, our study further clarifies that 1) the presence of G or C is most important in the RTT region, immediately downstream of the PBS region, at positions &#x2b;1 and &#x2b;3; 2) within the PBS region itself, the positive effect of G or C is most pronounced near the nick site and rapidly decays away from the nick site for PBSL <inline-formula id="inf90">
<mml:math id="m94">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 11 (<xref ref-type="fig" rid="F3">Figure 3A</xref>; <xref ref-type="sec" rid="s11">Supplementary Figure S4</xref>); and 3) the importance of GC content depends on PBSL (<xref ref-type="sec" rid="s11">Supplementary Figure S5</xref>), with the effect size being particularly strong for pegRNAs with shorter PBSL, which may need more G:C base pairs to compensate for weak overall PBS&#x2013;DNA hybridization interactions of only a small number of bases. As previously observed (<xref ref-type="bibr" rid="B2">Anzalone et al., 2019</xref>), we find that having G as the last templated nucleotide may decrease the editing efficiency; however, we show that this effect holds only in pegRNAs with sufficiently long PBSL and RTTL. Kim <italic>et al.</italic> similarly observed that for RTTL <inline-formula id="inf91">
<mml:math id="m95">
<mml:mrow>
<mml:mo>&#x2264;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 12, editing efficiency is, on average, the highest if the last templated nucleotide is G (<xref ref-type="bibr" rid="B29">Kim et al., 2021</xref>); however, we show that this positive effect may be unrelated to G being the last templated nucleotide, as G at the &#x2b;10 position has a positive correlation with editing efficiency regardless of the RTTL. Considering that the &#x2b;10 position is only 4 bp downstream of the canonical NGG PAM site, it is plausible that the G nucleotide at this location affects the binding of Cas9.</p>
<p>When choosing a target site, our results recommend selecting target sequences that 1) have AGG in the PAM site; 2) have high GC content in the PBS region, especially when the PBS is short; 3) avoid G as the last templated nucleotide if the RT template is longer than 12 nt; and 4) have G at the &#x2212;17 position relative to the nick site on the edited strand (<xref ref-type="fig" rid="F5">Figure 5</xref>). Once the target site has been chosen, a PBSL in the [11,13] nt range is generally recommended, unless the target site has high GC content that provides stable hybridization between the pegRNA and target DNA, in which case a shorter PBSL is recommended. As for the effect of RTTL, pegRNAs with shorter RTTL have, on average, higher editing efficiency than those with longer RTTL; when the available PAM site is far away from the desired site, such that having RTTL &#x3e;15&#xa0;nt is unavoidable, we recommend the following design: prioritize the last templated nucleotide in the order T <inline-formula id="inf92">
<mml:math id="m96">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> C&#x3e;A&#x3e;G. Short pegRNAs are generally advantageous, as long as the PBS region has sufficiently high GC content to hybridize stably with its target DNA (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Flowchart for designing an optimal pegRNA for the selected target site. The yellow boxes represent the target site, which is defined to be the union of edited-strand nucleotides spanning the protospacer region, PBS region, and RT region. The green boxes represent the DNA flanking the target site. All nucleotide numbers are relative to the nick site.</p>
</caption>
<graphic xlink:href="fgene-14-1222112-g005.tif"/>
</fig>
<p>Our computational analysis has identified several epigenetic and sequence features that need to be considered when designing pegRNAs to optimize PE efficiency. Future availability of additional high-throughput genome editing data and biophysical studies investigating how PEs search and bind target sequences will help further improve our understanding and make this technology feasible for effective biomedical applications. In particular, our current models do not take into account biochemical and biophysical processes involving reverse transcriptase activity, priming, flap equilibration, and DNA repair. Unfortunately, relevant parameters required for investigating these processes are currently limited. Incorporating such features obtained from detailed biophysical experiments and molecular dynamics simulations in the future could further improve the optimization of prime editors for practical applications.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Author&#x2019;s note</title>
<p>Part of this project was completed while JS was on sabbatical leave at the Center for Theoretical Physics at the Massachusetts Institute of Technology and the Department of Statistics at Harvard University.</p>
</sec>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found at: <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/">https://www.ncbi.nlm.nih.gov/</ext-link>, PRJNA949853.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>PP-P and JS conceived and supervised the project. SK and JY carried out the computational analyses, aided by DN. WW performed the validation experiments. SK, JY, and JS wrote the manuscript with contributions from other authors. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>This work was supported by the National Institutes of Health (R01CA163336 to JS, R01GM141296 to PP-P and JS) and the Grainger Engineering Breakthroughs Initiative (to JS).</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2023.1222112/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2023.1222112/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<sec id="s12">
<title>Abbreviations</title>
<p>bp: base pair; CRISPR: clustered regularly interspaced short palindromic repeats; Cas9: CRISPR-associated protein 9; ChIP-seq: chromatin immunoprecipitation sequencing; DNN: deep neural network; EP: edit percentage; MCMC: Markov Chain Monte Carlo; OLS: ordinary least squares; PAM: protospacer adjacent motif; PBS: primer-binding site; PBSL: primer-binding site length; PE: prime editor; pegRNA: prime editor guide RNA; RT: reverse transcription; RMSE: root mean square error; RTTL: reverse transcription template length; SA: simulated annealing.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anders</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Niewoehner</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Duerst</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jinek</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Structural basis of PAM-dependent target DNA recognition by the Cas9 endonuclease</article-title>. <source>Nature</source> <volume>513</volume>, <fpage>569</fpage>&#x2013;<lpage>573</lpage>. <pub-id pub-id-type="doi">10.1038/nature13579</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anzalone</surname>
<given-names>A. V.</given-names>
</name>
<name>
<surname>Randolph</surname>
<given-names>P. B.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Sousa</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Koblan</surname>
<given-names>L. W.</given-names>
</name>
<name>
<surname>Levy</surname>
<given-names>J. M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Search-and-replace genome editing without double-strand breaks or donor DNA</article-title>. <source>Nature</source> <volume>576</volume>, <fpage>149</fpage>&#x2013;<lpage>157</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1711-4</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bae</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J. S.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Cas-OFFinder: A fast and versatile algorithm that searches for potential off-target sites of Cas9 RNA-guided endonucleases</article-title>. <source>Bioinformatics</source> <volume>30</volume>, <fpage>1473</fpage>&#x2013;<lpage>1475</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btu048</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barrett</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wilhite</surname>
<given-names>S. E.</given-names>
</name>
<name>
<surname>Ledoux</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Evangelista</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>I. F.</given-names>
</name>
<name>
<surname>Tomashevsky</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>NCBI GEO: Archive for functional genomics data sets&#x2014;update</article-title>. <source>Nucleic Acids Res.</source> <volume>41</volume>, <fpage>D991</fpage>&#x2013;<lpage>D995</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gks1193</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barski</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cuddapah</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Roh</surname>
<given-names>T. Y.</given-names>
</name>
<name>
<surname>Schones</surname>
<given-names>D. E.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>High-resolution profiling of histone methylations in the human genome</article-title>. <source>Cell</source> <volume>129</volume>, <fpage>823</fpage>&#x2013;<lpage>837</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2007.05.009</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Broche</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kungulovski</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Bashtrykov</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Rathert</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jeltsch</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Genome-wide investigation of the dynamic changes of epigenome modifications after global DNA methylation editing</article-title>. <source>Nucleic Acids Res.</source> <volume>49</volume>, <fpage>158</fpage>&#x2013;<lpage>176</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaa1169</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Burrows</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Muller</surname>
<given-names>J. G.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Oxidative nucleobase modifications leading to strand scission</article-title>. <source>Chem. Rev.</source> <volume>98</volume>, <fpage>1109</fpage>&#x2013;<lpage>1152</lpage>. <pub-id pub-id-type="doi">10.1021/cr960421s</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chapman</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Boulton</surname>
<given-names>S. J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Playing the end game: DNA double-strand break repair pathway choice</article-title>. <source>Mol. Cell</source> <volume>47</volume>, <fpage>497</fpage>&#x2013;<lpage>510</lpage>. <pub-id pub-id-type="doi">10.1016/j.molcel.2012.07.029</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Hussmann</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Knipping</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ravisankar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P. F.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Enhanced prime editing systems by manipulating cellular determinants of editing outcomes</article-title>. <source>Cell</source> <volume>184</volume>, <fpage>5635</fpage>&#x2013;<lpage>5652.e29</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2021.09.018</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cong</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ran</surname>
<given-names>F. A.</given-names>
</name>
<name>
<surname>Cox</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Barretto</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Habib</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Multiplex genome engineering using CRISPR/Cas systems</article-title>. <source>Science</source> <volume>339</volume>, <fpage>819</fpage>&#x2013;<lpage>823</lpage>. <pub-id pub-id-type="doi">10.1126/science.1231143</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Daer</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Cutts</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Brafman</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Haynes</surname>
<given-names>K. A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The impact of chromatin dynamics on Cas9-mediated genome editing in human cells</article-title>. <source>ACS Synth. Biol.</source> <volume>6</volume>, <fpage>428</fpage>&#x2013;<lpage>438</lpage>. <pub-id pub-id-type="doi">10.1021/acssynbio.5b00299</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Doudna</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Charpentier</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Genome editing. The new frontier of genome engineering with CRISPR-Cas9</article-title>. <source>Science</source> <volume>346</volume>, <fpage>1258096</fpage>. <pub-id pub-id-type="doi">10.1126/science.1258096</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<collab>ENCODE Project Consortium</collab> (<year>2012</year>). <article-title>An integrated encyclopedia of DNA elements in the human genome</article-title>. <source>Nature</source> <volume>489</volume>, <fpage>57</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1038/nature11247</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>Y. H.</given-names>
</name>
<name>
<surname>Storey</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>A conserved BAH module within mammalian BAHD1 connects H3K27me3 to Polycomb gene silencing</article-title>. <source>Nucleic Acids Res.</source> <volume>49</volume>, <fpage>4441</fpage>&#x2013;<lpage>4455</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkab210</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z. M.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>Y. H.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>BAHCC1 binds H3K27me3 via a conserved BAH module to mediate gene silencing and oncogenesis</article-title>. <source>Nat. Genet.</source> <volume>52</volume>, <fpage>1384</fpage>&#x2013;<lpage>1396</lpage>. <pub-id pub-id-type="doi">10.1038/s41588-020-00729-3</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Finnegan</surname>
<given-names>A. I.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gapinske</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Woods</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Perez-Pinera</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Epigenetic engineering of yeast reveals dynamic molecular adaptation to methylation stress and genetic modulators of specific DNMT3 family members</article-title>. <source>Nucleic Acids Res.</source> <volume>48</volume>, <fpage>4081</fpage>&#x2013;<lpage>4099</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaa161</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Finnegan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>J. S.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Maximum entropy methods for extracting the learned features of deep neural networks</article-title>. <source>PLoS Comput. Biol.</source> <volume>13</volume>, <fpage>e1005836</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1005836</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gaffney</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>McVicker</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Pai</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Fondufe-Mittendorf</surname>
<given-names>Y. N.</given-names>
</name>
<name>
<surname>Lewellen</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Michelini</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>Controls of nucleosome positioning in the human genome</article-title>. <source>PLoS Genet.</source> <volume>8</volume>, <fpage>e1003036</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pgen.1003036</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gapinske</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Luu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Winter</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Woods</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Kostan</surname>
<given-names>K. A.</given-names>
</name>
<name>
<surname>Shiva</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>CRISPR-SKIP: Programmable gene splicing with single base editors</article-title>. <source>Genome Biol.</source> <volume>19</volume>, <fpage>107</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-018-1482-5</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gaudelli</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>Komor</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Rees</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Badran</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Bryson</surname>
<given-names>D. I.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Programmable base editing of A&#x2022;T to G&#x2022;C in genomic DNA without DNA cleavage</article-title>. <source>Nature</source> <volume>551</volume>, <fpage>464</fpage>&#x2013;<lpage>471</lpage>. <pub-id pub-id-type="doi">10.1038/nature24644</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dennis</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Thurman</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Kingston</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Stamatoyannopoulos</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Noble</surname>
<given-names>W. S.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Predicting human nucleosome occupancy from primary sequence</article-title>. <source>PLoS Comput. Biol.</source> <volume>4</volume>, <fpage>e1000134</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1000134</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hinz</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Laughery</surname>
<given-names>M. F.</given-names>
</name>
<name>
<surname>Wyrick</surname>
<given-names>J. J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Nucleosomes inhibit Cas9 endonuclease activity <italic>in vitro</italic></article-title>. <source>Biochemistry</source> <volume>54</volume>, <fpage>7063</fpage>&#x2013;<lpage>7066</lpage>. <pub-id pub-id-type="doi">10.1021/acs.biochem.5b01108</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hsu</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Lander</surname>
<given-names>E. S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Development and applications of CRISPR-Cas9 for genome engineering</article-title>. <source>Cell</source> <volume>157</volume>, <fpage>1262</fpage>&#x2013;<lpage>1278</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2014.05.010</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ioshikhes</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bolshoy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Derenshteyn</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Borodovsky</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Trifonov</surname>
<given-names>E. N.</given-names>
</name>
</person-group> (<year>1996</year>). <article-title>Nucleosome DNA sequence pattern revealed by multiple alignment of experimentally mapped sequences</article-title>. <source>J. Mol. Biol.</source> <volume>262</volume>, <fpage>129</fpage>&#x2013;<lpage>139</lpage>. <pub-id pub-id-type="doi">10.1006/jmbi.1996.0503</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Rube</surname>
<given-names>H. T.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>J. S.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Categorical spectral analysis of periodicity in nucleosomal DNA</article-title>. <source>Nucleic Acids Res.</source> <volume>44</volume>, <fpage>2047</fpage>&#x2013;<lpage>2057</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkw101</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jinek</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chylinski</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Fonfara</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hauer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Doudna</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Charpentier</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>A programmable dual-RNA-guided DNA endonuclease in adaptive bacterial immunity</article-title>. <source>Science</source> <volume>337</volume>, <fpage>816</fpage>&#x2013;<lpage>821</lpage>. <pub-id pub-id-type="doi">10.1126/science.1225829</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jinek</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>East</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Doudna</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>RNA-programmed genome editing in human cells</article-title>. <source>eLife</source> <volume>2</volume>, <fpage>e00471</fpage>. <pub-id pub-id-type="doi">10.7554/eLife.00471</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>H. K.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Min</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bae</surname>
<given-names>J. Y.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>J. W.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>SpCas9 activity prediction by DeepSpCas9, a deep learning-based model with high generalization performance</article-title>. <source>Sci. Adv.</source> <volume>5</volume>, <fpage>eaax9249</fpage>. <pub-id pub-id-type="doi">10.1126/sciadv.aax9249</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>H. K.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Min</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yoon</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Predicting the efficiency of prime editing guide RNAs in human cells</article-title>. <source>Nat. Biotechnol.</source> <volume>39</volume>, <fpage>198</fpage>&#x2013;<lpage>206</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-020-0677-y</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>Y. B.</given-names>
</name>
<name>
<surname>Komor</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Levy</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>K. T.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D. R.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Increasing the genome-targeting scope and precision of base editing with engineered Cas9-cytidine deaminase fusions</article-title>. <source>Nat. Biotechnol.</source> <volume>35</volume>, <fpage>371</fpage>&#x2013;<lpage>376</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.3803</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Komor</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y. B.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Zuris</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D. R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Programmable editing of a target base in genomic DNA without double-stranded DNA cleavage</article-title>. <source>Nature</source> <volume>533</volume>, <fpage>420</fpage>&#x2013;<lpage>424</lpage>. <pub-id pub-id-type="doi">10.1038/nature17946</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Komor</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>K. T.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Gaudelli</surname>
<given-names>N. M.</given-names>
</name>
<name>
<surname>Waterbury</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Koblan</surname>
<given-names>L. W.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Improved base excision repair inhibition and bacteriophage Mu Gam protein yields C:G-to-T:A base editors with higher efficiency and product purity</article-title>. <source>Sci. Adv.</source> <volume>3</volume>, <fpage>eaao4774</fpage>. <pub-id pub-id-type="doi">10.1126/sciadv.aao4774</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kweon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hwang</surname>
<given-names>H. Y.</given-names>
</name>
<name>
<surname>Ryu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jang</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Targeted genomic translocations and inversions generated using a paired prime editing strategy</article-title>. <source>Mol. Ther.</source> <volume>31</volume>, <fpage>249</fpage>&#x2013;<lpage>259</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymthe.2022.09.008</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lamb</surname>
<given-names>K. N.</given-names>
</name>
<name>
<surname>Bsteh</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Dishman</surname>
<given-names>S. N.</given-names>
</name>
<name>
<surname>Moussa</surname>
<given-names>H. F.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Stuckey</surname>
<given-names>J. I.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Discovery and characterization of a cellular potent positive allosteric modulator of the polycomb repressive complex 1 chromodomain, CBX7</article-title>. <source>Cell Chem. Biol.</source> <volume>26</volume>, <fpage>1365</fpage>&#x2013;<lpage>1379.e22</lpage>. <pub-id pub-id-type="doi">10.1016/j.chembiol.2019.07.013</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Langmead</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Salzberg</surname>
<given-names>S. L.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Fast gapped-read alignment with Bowtie 2</article-title>. <source>Nat. Methods</source> <volume>9</volume>, <fpage>357</fpage>&#x2013;<lpage>359</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.1923</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leistico</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Saini</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Futtner</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Hejna</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Omura</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Soni</surname>
<given-names>P. N.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Epigenomic tensor predicts disease subtypes and reveals constrained tumor evolution</article-title>. <source>Cell Rep.</source> <volume>34</volume>, <fpage>108927</fpage>. <pub-id pub-id-type="doi">10.1016/j.celrep.2021.108927</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Handsaker</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wysoker</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fennell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ruan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Homer</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2009</year>). <article-title>The sequence alignment/map format and SAMtools</article-title>. <source>Bioinformatics</source> <volume>25</volume>, <fpage>2078</fpage>&#x2013;<lpage>2079</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btp352</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>J. L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Modulating chromatin accessibility by transactivation and targeting proximal dsgRNAs enhances Cas9 editing efficiency <italic>in vivo</italic></article-title>. <source>Genome Biol.</source> <volume>20</volume>, <fpage>145</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-019-1762-8</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jiao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>HDAC inhibitors improve CRISPR-Cas9 mediated prime editing and base editing</article-title>. <source>Mol. Ther. Nucleic Acids</source> <volume>29</volume>, <fpage>36</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1016/j.omtn.2022.05.036</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>S. Q.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Mintzer</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y. G.</given-names>
</name>
<name>
<surname>Ponnienselvan</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2021a</year>). <article-title>Improved prime editors enable pathogenic allele correction and cancer modelling in adult mice</article-title>. <source>Nat. Commun.</source> <volume>12</volume>, <fpage>2121</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-22295-w</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kao</surname>
<given-names>H. I.</given-names>
</name>
<name>
<surname>Bambara</surname>
<given-names>R. A.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Flap endonuclease 1: A central component of DNA metabolism</article-title>. <source>Annu. Rev. Biochem.</source> <volume>73</volume>, <fpage>589</fpage>&#x2013;<lpage>615</lpage>. <pub-id pub-id-type="doi">10.1146/annurev.biochem.73.012803.092453</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2021b</year>). <article-title>Enhancing prime editing by Csy4-mediated processing of pegRNA</article-title>. <source>Cell Res.</source> <volume>31</volume>, <fpage>1134</fpage>&#x2013;<lpage>1136</lpage>. <pub-id pub-id-type="doi">10.1038/s41422-021-00520-x</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hitz</surname>
<given-names>B. C.</given-names>
</name>
<name>
<surname>Gabdank</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hilton</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Kagda</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Lam</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>New developments on the Encyclopedia of DNA Elements (ENCODE) data portal</article-title>. <source>Nucleic Acids Res.</source> <volume>48</volume>, <fpage>D882</fpage>&#x2013;<lpage>D889</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkz1062</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mali</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Esvelt</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Aach</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Guell</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>DiCarlo</surname>
<given-names>J. E.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>RNA-guided human genome engineering via Cas9</article-title>. <source>Science</source> <volume>339</volume>, <fpage>823</fpage>&#x2013;<lpage>826</lpage>. <pub-id pub-id-type="doi">10.1126/science.1232033</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moreno-Mateos</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Vejnar</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Beaudoin</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Fernandez</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Mis</surname>
<given-names>E. K.</given-names>
</name>
<name>
<surname>Khokha</surname>
<given-names>M. K.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>CRISPRscan: Designing highly efficient sgRNAs for CRISPR-Cas9 targeting <italic>in vivo</italic></article-title>. <source>Nat. Methods</source> <volume>12</volume>, <fpage>982</fpage>&#x2013;<lpage>988</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.3543</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nelson</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>Randolph</surname>
<given-names>P. B.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>S. P.</given-names>
</name>
<name>
<surname>Everette</surname>
<given-names>K. A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Anzalone</surname>
<given-names>A. V.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Engineered pegRNAs improve prime editing efficiency</article-title>. <source>Nat. Biotechnol.</source> <volume>40</volume>, <fpage>402</fpage>&#x2013;<lpage>410</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-021-01039-7</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oberdoerffer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Sinclair</surname>
<given-names>D. A.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>The role of nuclear architecture in genomic instability and ageing</article-title>. <source>Nat. Rev. Mol. Cell Biol.</source> <volume>8</volume>, <fpage>692</fpage>&#x2013;<lpage>702</lpage>. <pub-id pub-id-type="doi">10.1038/nrm2238</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Park</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Jeong</surname>
<given-names>T. Y.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Yoon</surname>
<given-names>D. E.</given-names>
</name>
<name>
<surname>Lim</surname>
<given-names>S. Y.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S. P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Targeted mutagenesis in mouse cells and embryos using an enhanced prime editor</article-title>. <source>Genome Biol.</source> <volume>22</volume>, <fpage>170</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-021-02389-w</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Petri</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schmidts</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Horng</surname>
<given-names>J. E.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>CRISPR prime editing with ribonucleoprotein complexes in zebrafish and primary human cells</article-title>. <source>Nat. Biotechnol.</source> <volume>40</volume>, <fpage>189</fpage>&#x2013;<lpage>193</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-021-00901-y</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saksouk</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Barth</surname>
<given-names>T. K.</given-names>
</name>
<name>
<surname>Ziegler-Birling</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Olova</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Nowak</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rey</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Redundant mechanisms to form silent chromatin at pericentromeric regions rely on BEND3 and DNA methylation</article-title>. <source>Mol. Cell</source> <volume>56</volume>, <fpage>580</fpage>&#x2013;<lpage>594</lpage>. <pub-id pub-id-type="doi">10.1016/j.molcel.2014.10.001</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Segal</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Fondufe-Mittendorf</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Thastrom</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Field</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>I. K.</given-names>
</name>
<etal/>
</person-group> (<year>2006</year>). <article-title>A genomic code for nucleosome positioning</article-title>. <source>Nature</source> <volume>442</volume>, <fpage>772</fpage>&#x2013;<lpage>778</lpage>. <pub-id pub-id-type="doi">10.1038/nature04979</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Spencer</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Deep mutational scanning of <italic>S. pyogenes</italic> Cas9 reveals important functional domains</article-title>. <source>Sci. Rep.</source> <volume>7</volume>, <fpage>16836</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-17081-y</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sugimoto</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Nakano</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Katoh</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Matsumura</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nakamuta</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ohmichi</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>1995</year>). <article-title>Thermodynamic parameters to predict stability of RNA/DNA hybrid duplexes</article-title>. <source>Biochemistry</source> <volume>34</volume>, <fpage>11211</fpage>&#x2013;<lpage>11216</lpage>. <pub-id pub-id-type="doi">10.1021/bi00035a029</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tarjan</surname>
<given-names>D. R.</given-names>
</name>
<name>
<surname>Flavahan</surname>
<given-names>W. A.</given-names>
</name>
<name>
<surname>Bernstein</surname>
<given-names>B. E.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Epigenome editing strategies for the functional annotation of CTCF insulators</article-title>. <source>Nat. Commun.</source> <volume>10</volume>, <fpage>4258</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-019-12166-w</pub-id>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tsallis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Stariolo</surname>
<given-names>D. A.</given-names>
</name>
</person-group> (<year>1996</year>). <article-title>Generalized simulated annealing</article-title>. <source>Phys. A</source> <volume>233</volume>, <fpage>395</fpage>&#x2013;<lpage>406</lpage>. <pub-id pub-id-type="doi">10.1016/s0378-4371(96)00271-3</pub-id>
</citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Winter</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Luu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gapinske</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Manandhar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shirguppe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Woods</surname>
<given-names>W. S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Targeted exon skipping with AAV-mediated split adenine base editors</article-title>. <source>Cell Discov.</source> <volume>5</volume>, <fpage>41</fpage>. <pub-id pub-id-type="doi">10.1038/s41421-019-0109-7</pub-id>
</citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wright</surname>
<given-names>A. V.</given-names>
</name>
<name>
<surname>Nunez</surname>
<given-names>J. K.</given-names>
</name>
<name>
<surname>Doudna</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Biology and applications of CRISPR systems: Harnessing nature&#x27;s toolbox for genome engineering</article-title>. <source>Cell</source> <volume>164</volume>, <fpage>29</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2015.12.035</pub-id>
</citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Scott</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Kriz</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Chiu</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Hsu</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Dadon</surname>
<given-names>D. B.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Genome-wide binding of the CRISPR endonuclease Cas9 in mammalian cells</article-title>. <source>Nat. Biotechnol.</source> <volume>32</volume>, <fpage>670</fpage>&#x2013;<lpage>676</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.2889</pub-id>
</citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yarrington</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Verma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Schwartz</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Trautman</surname>
<given-names>J. K.</given-names>
</name>
<name>
<surname>Carroll</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Nucleosomes inhibit target cleavage by CRISPR-Cas9 <italic>in vivo</italic></article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>115</volume>, <fpage>9351</fpage>&#x2013;<lpage>9358</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1810062115</pub-id>
</citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hastie</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Regularization and variable selection via the elastic net</article-title>. <source>J. R. Stat. Soc. Ser. B Stat. Methodol.</source> <volume>67</volume>, <fpage>301</fpage>&#x2013;<lpage>320</lpage>. <pub-id pub-id-type="doi">10.1111/j.1467-9868.2005.00503.x</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>