<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Immunol.</journal-id>
<journal-title>Frontiers in Immunology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Immunol.</abbrev-journal-title>
<issn pub-type="epub">1664-3224</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fimmu.2021.739179</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Immunology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Novel Allele Detection Tool Benchmark and Application With Antibody Repertoire Sequencing Dataset</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Xiujia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1430821"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhu</surname>
<given-names>Yan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chen</surname>
<given-names>Sen</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zeng</surname>
<given-names>Huikun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Guan</surname>
<given-names>Junjie</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1430885"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Qilong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/648139"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lan</surname>
<given-names>Chunhong</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Deqiang</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yu</surname>
<given-names>Xueqing</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhang</surname>
<given-names>Zhenhai</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1173309"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Center for Precision Medicine, Guangdong Provincial People&#x2019;s Hospital, Guangdong Academy of Medical Sciences</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Guangdong-Hong Kong Joint Laboratory on Immunological and Genetic Kidney Diseases, Guangdong Provincial People&#x2019;s Hospital, Guangdong Academy of Medical Sciences</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>State Key Laboratory of Organ Failure Research, National Clinical Research Center for Kidney Disease, Division of Nephrology, Nanfang Hospital, Southern Medical University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Bioinformatics, School of Basic Medical Sciences, Southern Medical University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Department of Center Laboratory, The Fifth Affiliated Hospital of Guangzhou Medical University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Division of Nephrology, Guangdong Provincial People&#x2019;s Hospital, Guangdong Academy of Medical Sciences</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff7">
<sup>7</sup>
<institution>Key Laboratory of Mental Health of the Ministry of Education, Guangdong-Hong Kong-Macao Greater Bay Area Center for Brain Science and Brain-Inspired Intelligence, Southern Medical University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Masaki Hikida, Akita University, Japan</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Konrad Krawczyk, NaturalAntibody, Poland; Victor Greiff, University of Oslo, Norway</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Zhenhai Zhang, <email xlink:href="mailto:zhangzhenhai@gdph.org.cn">zhangzhenhai@gdph.org.cn</email>; <email xlink:href="mailto:zhenhaismu@163.com">zhenhaismu@163.com</email>; Xueqing Yu, <email xlink:href="mailto:yuxueqing@gdph.org.cn">yuxueqing@gdph.org.cn</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work and share first authorship</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to B Cell Biology, a section of the journal Frontiers in Immunology</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>10</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>739179</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>07</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>10</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Yang, Zhu, Chen, Zeng, Guan, Wang, Lan, Sun, Yu and Zhang</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Yang, Zhu, Chen, Zeng, Guan, Wang, Lan, Sun, Yu and Zhang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Detailed knowledge of the diverse immunoglobulin germline genes is critical for the study of humoral immunity. Hundreds of alleles have been discovered by analyzing antibody repertoire sequencing (Rep-seq or Ig-seq) data <italic>via</italic> multiple novel allele detection tools (NADTs). However, the performance of these NADTs through antibody sequences with intrinsic somatic hypermutations (SHMs) is unclear. Here, we developed a tool to simulate repertoires by integrating the full spectrum features of an antibody repertoire such as germline gene usage, junctional modification, position-specific SHM and clonal expansion based on 2152 high-quality datasets. We then systematically evaluated these NADTs using both simulated and genuine Ig-seq datasets. Finally, we applied these NADTs to 687 Ig-seq datasets and identified 43 novel allele candidates (NACs) using defined criteria. Twenty-five alleles were validated through findings of other sources. In addition to the NACs detected, our simulation tool, the results of our comparison, and the streamline of this process may benefit further humoral immunity studies <italic>via</italic> Ig-seq.</p>
</abstract>
<kwd-group>
<kwd>tools benchmarking</kwd>
<kwd>novel allele</kwd>
<kwd>antibody repertoire</kwd>
<kwd>high-throughput sequencing</kwd>
<kwd>Ig-seq</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="6"/>
<equation-count count="0"/>
<ref-count count="30"/>
<page-count count="16"/>
<word-count count="9416"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<title>Introduction</title>
<p>Genetic variations of antibody germline genes play a pivotal role in humoral immunity. For instance, the allele variants of IGHV1-69 greatly impact the ability to develop broadly neutralizing antibodies (bNAbs) against influenza virus (<xref ref-type="bibr" rid="B1">1</xref>), and modulate IGHV germline gene utilization (<xref ref-type="bibr" rid="B2">2</xref>). In addition, the polymorphism in IGHV4-61 is associated with the risk of rheumatic heart disease (<xref ref-type="bibr" rid="B3">3</xref>). In fundamental research, accurately assigning germline genes to antibody sequences is also critical. It affects the analysis of clonotype, somatic hypermutation (SHM), and the maturation pathway of antibody clones. Therefore, germline alleles are essential for delineating the ontogeny and evolution of antibody responses specific to antigens or vaccines. Despite this need, a comprehensive collection of novel alleles has not yet been achieved (<xref ref-type="bibr" rid="B4">4</xref>).</p>
<p>The advent of antibody repertoire sequencing (Rep-seq or Ig-seq) technology allows the acquisition of millions of antibody sequences and these unprecedented data facilitate the discovery of novel alleles through tools with specific aims (i.e. novel allele detection tools, NADTs) (<xref ref-type="bibr" rid="B5">5</xref>&#x2013;<xref ref-type="bibr" rid="B9">9</xref>). As antibody sequences undergo extensive SHMs along with B cell proliferation once activated by an antigen, novel allele detection for antibody genes is more challenging than traditional mutation detection in conventional genes where only base errors caused by PCR and high-throughput sequencing (HTS) need to be considered (<xref ref-type="bibr" rid="B6">6</xref>). To distinguish SHMs and base errors from real polymorphisms, NADTs use distinct algorithms and are supposed to be effective in typical scenarios.</p>
<p>Algorithm-wise, <italic>TIgGER</italic> (<xref ref-type="bibr" rid="B6">6</xref>), <italic>LymAnalyzer</italic> (<xref ref-type="bibr" rid="B8">8</xref>), and <italic>Partis</italic> (<xref ref-type="bibr" rid="B7">7</xref>) employ a SNP-based approach. Novel alleles are predicted by identifying SNPs in the reference germlines. For example, <italic>TIgGER</italic> and <italic>Partis</italic> employ mutation accumulation plots to identify SNPs. Therefore, the major challenge for these NADTs is to distinguish SNPs from SHMs. In contrast, <italic>IgDiscover</italic> (<xref ref-type="bibr" rid="B5">5</xref>) annotates the input sequences with an initial germline database to form clusters and subsequently predicts novel alleles based on consensus building within clusters. This sequence-based approach circumvents the SNP set determination procedure encountered by the SNP-based approach and can easily output the novel germline sequences regardless of the distances to their nearest counterparts. Nevertheless, it heavily relies on repertoire types and is suggested to work efficiently only on na&#xef;ve repertoires characterized by a substantial fraction of unmutated sequences. <italic>IMPre</italic> (<xref ref-type="bibr" rid="B9">9</xref>) uses a seed-based approach. It starts with a seed sequence and extends the sequence in both directions if defined requirements are met. It is worth mentioning that both the sequence-based approach and the seed-based extension approach can identify novel alleles that have insertions and deletions compared to the known germlines.</p>
<p>Despite these algorithm differences, it remains unclear how the NADTs above compare with each other in practice. A previous study presented a comparison among 3 NADTs (i.e. <italic>IgDiscover</italic>, <italic>TIgGER</italic> and <italic>Partis</italic>) (<xref ref-type="bibr" rid="B7">7</xref>), but the study was not comprehensive as to both the number of included NADTs and the kind of challenges that need to be overcome in novel allele detection. To evaluate the five NADTs <italic>TIgGER</italic>, <italic>LymAnalyzer</italic>, <italic>Partis, IgDiscover</italic> and <italic>IMPre</italic> objectively, we used a repertoire simulation tool that incorporates the full spectrum of repertoire features extrapolated from 2152 datasets, including germline gene usage, junctional modification, position-specific SHM and clonal expansion. We then systematically evaluated these NADTs using both the simulated datasets and paired genuine bulk and single-cell repertoire sequencing datasets. We identified 43 novel allele candidates (NACs) from 683 datasets using the criterion set based on the comparison result. This systematic evaluation, together with the NACs we present here, may aid future novel allele identification and thus achieve a better interpretation of adaptive immune receptor repertoire sequencing (AIRR-seq) datasets.</p>
</sec>
<sec id="s2" sec-type="results">
<title>Results</title>
<sec id="s2_1">
<title>An Overview of 5 NADTs and the Study Design</title>
<p>To perform a solid and comprehensive comparison of currently available NADTs, we employed <italic>TIgGER</italic> (<xref ref-type="bibr" rid="B6">6</xref>), <italic>IMPre</italic> (<xref ref-type="bibr" rid="B9">9</xref>), <italic>IgDiscover</italic> (<xref ref-type="bibr" rid="B5">5</xref>), <italic>LymAnalyzer</italic> (<xref ref-type="bibr" rid="B8">8</xref>) and <italic>Partis</italic> (<xref ref-type="bibr" rid="B7">7</xref>). Their basic information is summarized in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. As these five NADTs were developed using various programming languages, their installations are subject to various dependencies. With respect to their applications, <italic>IMPre</italic> and <italic>LymAnalyzer</italic> work on both T cell receptor (TCR) and B cell receptor (BCR) while the other three only work on BCR. All NADTs support both heavy chain (IGH) and light chain (IGK and IGL) of BCR, while <italic>IMPre</italic> and <italic>LymAnalyzer</italic> also support TRB and TRA. <italic>TIgGER</italic> and <italic>Partis</italic> only support V genes, <italic>IMPre</italic> and <italic>LymAnalyzer</italic> support V and J genes, while <italic>IgDiscover</italic> supports V, D, and J genes. Except for <italic>IgDiscover</italic> and <italic>LymAnalyzer</italic>, all other NADTs underwent <italic>in silico</italic> benchmarking during development. <italic>Partis</italic> developers compared their NADT with others, but no systematic third-party comparison has been performed among them. Therefore, a comprehensive and systematic comparison would benefit the field for novel allele detection using antibody repertoire datasets.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>The basic information for 5 NADTs.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">NADTs</th>
<th valign="top" align="center">Year</th>
<th valign="top" align="center"># Citation<sup>*</sup>
</th>
<th valign="top" align="center">Programming language(s)</th>
<th valign="top" align="center">Supported receptor type(s)</th>
<th valign="top" align="center">Supported chain type(s)</th>
<th valign="top" align="center">Supported gene type(s)</th>
<th valign="top" align="center">Nonhuman species supported</th>
<th valign="top" align="center">Comparison with other tools</th>
<th valign="top" align="center">
<italic>in silico</italic> Benchmark</th>
<th valign="top" align="center">Algorithm</th>
<th valign="top" align="center">Authors</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">2015</td>
<td valign="top" align="center">104</td>
<td valign="top" align="left">R</td>
<td valign="top" align="left">BCR</td>
<td valign="top" align="left">IGH, IGK, IGL</td>
<td valign="top" align="left">V</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">No</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Mutation accumulation models</td>
<td valign="top" align="left">Gadala-Maria et&#xa0;al. (<xref ref-type="bibr" rid="B6">6</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">2016</td>
<td valign="top" align="center">20</td>
<td valign="top" align="left">C, Perl</td>
<td valign="top" align="left">BCR, TCR</td>
<td valign="top" align="left">IGH, IGK, IGL, TRB, TRA</td>
<td valign="top" align="left">V, J</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">No</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Seed_Clust</td>
<td valign="top" align="left">Zhang et&#xa0;al. (<xref ref-type="bibr" rid="B9">9</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">2016</td>
<td valign="top" align="center">81</td>
<td valign="top" align="left">Python</td>
<td valign="top" align="left">BCR</td>
<td valign="top" align="left">IGH, IGK, IGL</td>
<td valign="top" align="left">V, D, J</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">No</td>
<td valign="top" align="left">No</td>
<td valign="top" align="left">Windowed cluster analysis, Linkage cluster analysis</td>
<td valign="top" align="left">Corcoran et&#xa0;al. (<xref ref-type="bibr" rid="B5">5</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">2016</td>
<td valign="top" align="center">41</td>
<td valign="top" align="left">Java</td>
<td valign="top" align="left">BCR, TCR</td>
<td valign="top" align="left">IGH, IGK, IGL, TRB, TRA</td>
<td valign="top" align="left">V, J</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">No</td>
<td valign="top" align="left">No</td>
<td valign="top" align="left">Mismatch quality control</td>
<td valign="top" align="left">Yu et&#xa0;al. (<xref ref-type="bibr" rid="B8">8</xref>)</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">2019</td>
<td valign="top" align="center">12</td>
<td valign="top" align="left">C, C++, Perl, Python</td>
<td valign="top" align="left">BCR</td>
<td valign="top" align="left">IGH, IGK, IGL</td>
<td valign="top" align="left">V</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Mutation accumulation models</td>
<td valign="top" align="left">Ralph et&#xa0;al. (<xref ref-type="bibr" rid="B7">7</xref>)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>*The citation statistics were obtained on 2020/4/13 according to Google Scholar (<uri xlink:href="https://scholar.google.com/">https://scholar.google.com/</uri>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>When we compared the supportive features of these NADTs, we found <italic>IMPre</italic> to be the most versatile and user-friendly NADT before considering its performance for novel allele detection (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;1</bold>
</xref>). To gain more insights into these NADTs, we evaluated their performance with both simulated and real-world Ig-seq datasets (<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>). The benchmark result was then summarized and translated into knowledge-based filtration criteria used to obtain credible NACs from collected bulk sequencing dataset.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Schematic overview of the study design. In this study, both <italic>in silico</italic> simulated and genuine Ig-seq datasets were employed as benchmark datasets that serve as the input of all five NADTs independently. The performances of these NADTs were then summarized and integrated, and translated into filtration criteria capable of facilitating the evaluation of NACs. Among all NACs reported based on the collected bulk sequencing dataset, we retained only those credible NACs passing the defined filtration criteria.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g001.tif"/>
</fig>
</sec>
<sec id="s2_2">
<title>A Flexible Immune Repertoire Sequencing Dataset Simulation Tool and the Benchmark Dataset</title>
<p>Generating <italic>in silico</italic> Ig-seq datasets is a challenging task. An ideal Ig-seq simulating tool should reflect the preferential gene usage, junctional nucleotide insertion and deletion, phylogenetic clonal structure, various allele ratio, and the base errors intrinsic to PCR amplification and next-generation sequencing (NGS). Although several repertoire simulation tools exist (<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B10">10</xref>&#x2013;<xref ref-type="bibr" rid="B13">13</xref>), none of them incorporate the full features of Ig-seq dataset mentioned above. Therefore, we built <italic>IMPlAntS</italic> (<bold>I</bold>ntegrated and <bold>M</bold>odular <bold>P</bold>ipe<bold>l</bold>ine for <bold>Ant</bold>ibody Repertoire <bold>S</bold>imulation) and it enables both one-stop repertoire simulation and modular calls for adaption to customized pipelines.</p>
<p>Briefly, <italic>IMPlAntS</italic> consists of three consecutive steps, i) generation of independent V(D)J rearrangements; ii) generation of SHM with phylogenetic structure within clones; and iii) generation of NGS reads incorporating base errors (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>). These steps can be implemented individually or collectively using the corresponding scripts.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>
<italic>IMPlAntS</italic>, an <italic>I</italic>ntegrated and <italic>M</italic>odular <italic>P</italic>ipeline for <italic>Ant</italic>ibody Repertoire <italic>S</italic>imulation. <italic>IMPlAntS</italic> consists of three consecutive steps: i) individual rearrangement simulation; ii) SHM and clonal expansion simulation; and iii) next generation sequencing simulation. In steps i and ii, V(D)J gene usage, junctional modification and position-specific SHM were learned from a previous large-scale study encompassing 2152 high-quality Ig-seq datasets. After SHMs were simulated, the power law was used to simulate clonal size distribution. Finally, an NGS read simulator, ART, was exploited to produce sequencing reads (Illumina MiSeq, PE250).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g002.tif"/>
</fig>
<p>In the first step, a series of key parameters can be specified in the configuration files. These parameters include V(D)J gene usage, allele ratio, the distribution of insertion and deletion length, and the percentage of productive rearrangements. In the second step, we generated SHMs in rearranged sequences in a way similar to that reported by Yermanos et&#xa0;al. (<xref ref-type="bibr" rid="B13">13</xref>) to create the phylogenetic sequences as in the real repertoire. The resultant repertoire with SHMs comes from several iterations of introducing SHMs to the selected sequences based on the positional mutability and substitutability models. These two models, together with the parameters involved in the first step, derive from our previous large-scale study (<xref ref-type="bibr" rid="B14">14</xref>). Finally, we employed a popular NGS simulation tool, ART, to produce NGS reads (<xref ref-type="bibr" rid="B15">15</xref>). More details for <italic>IMPlAntS</italic> can be found in the <italic>Materials and Methods</italic> section.</p>
<p>With this pipeline, we generated four datasets: DEXPR, DSNP, DALLELE, and DSHM (<xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>). Noteworthy is that only DSHM was generated with all three steps mentioned above. In contrast, the other three datasets were generated with only the first and the final step as they contain no SHMs. Each of the four datasets was comprised of 20 repertoires, except for DSHM (n=10). The constituent repertoires within each dataset contained variation only in the studied variable. Except for DSHM, each dataset contained four groups (two groups for DSHM). Each group is represented by five repertoire replicates and has a distinct level of the studied variable. Other variables were set identically among groups within each dataset and to a level theoretically most favorable to novel allele detection. For each repertoire, we generated 1 million reads to avoid the read number limitation mentioned in the <italic>IgDiscover</italic> manual (at least 750,000 was recommended). The only exception was with DALLELE, in which repertoires in different groups had varying numbers of reads to make the novel alleles represented by the same number of reads. Lastly, we artificially created &#x201c;novel&#x201d; alleles by random selection of the positions and SNPs in germline sequences. The resultant &#x201c;novel&#x201d; alleles together with known ones then served as the initial germline database for NADTs&#x2019; benchmarking (Materials and Methods).</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Characterization of four simulated datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Dataset</th>
<th valign="top" align="center">Studied variable</th>
<th valign="top" align="center"># Repertoires</th>
<th valign="top" align="center"># Reads (million)</th>
<th valign="top" align="center">Gene expression</th>
<th valign="top" align="center">Minor allele frequency</th>
<th valign="top" align="center"># SNPs</th>
<th valign="top" align="center">SHM frequency</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">DEXPR</td>
<td valign="top" align="left">gene expression</td>
<td valign="top" align="center">20 (5, 5, 5, 5)</td>
<td valign="top" align="center">1, 1, 1, 1</td>
<td valign="top" align="center">~5%, ~1%, ~0.1%, ~0.01%</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">DALLELE</td>
<td valign="top" align="left">minor allele frequency</td>
<td valign="top" align="center">20 (5, 5, 5, 5)</td>
<td valign="top" align="center">0.1, 0.16, 0.5, 1</td>
<td valign="top" align="center">~5%</td>
<td valign="top" align="center">50%, 30%, 10%, 5%</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">DSNP</td>
<td valign="top" align="left"># SNPs</td>
<td valign="top" align="center">20 (5, 5, 5, 5)</td>
<td valign="top" align="center">1, 1, 1, 1</td>
<td valign="top" align="center">~5%</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">1, 3, 5, 7</td>
<td valign="top" align="center">0</td>
</tr>
<tr>
<td valign="top" align="left">DSHM</td>
<td valign="top" align="left">SHM</td>
<td valign="top" align="center">10 (5, 5)</td>
<td valign="top" align="center">1, 1</td>
<td valign="top" align="center">~5%</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0, ~6%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Each of the first three datasets above consists of 20 simulated repertoires, corresponding to four groups with equal sample size (n=5) varying from each other with respect to the studied variable. DSHM, in contrast, contains 10 repertoires from two groups with equal size (n=5). In addition, repertoires from DEXPR, DSNP and DSHM do not contain allelic diversity, and thus the &#x2018;minor allele frequency&#x2019; column does not apply to them. Comma-separated percentages or numbers in the last four columns describe the features of simulated novel alleles in repertoires of different groups within a certain dataset (see also Results).</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s2_3">
<title>Evaluation of the 5 NADTs Using <italic>In Silico</italic> Simulated Benchmark Dataset</title>
<p>To compare the sensitivity and specificity of the 5 NADTs in detecting novel alleles (allele level) (<italic>LymAnalyzer</italic> was excluded as it reported only SNPs) as well as SNPs (SNP level) (Materials and Methods), we used our <italic>in silico</italic> simulated datasets (<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>). As expected, lower gene or allele expression and more SNPs or SHMs hampered both sensitivities and specificities for at least one NADT in the detection of novel alleles and SNPs in general (<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>). We found <italic>TIgGER</italic> to work well with respect to both sensitivities and specificities with DEXPR and DSNP, although it did not identify alleles in DALLELE (<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>). <italic>IMPre</italic>, though exhibiting lower sensitivities and specificities, identified novel alleles in the datasets with all four variables. <italic>IgDiscover</italic> manifested very good specificities although it identified fewer alleles than <italic>TIgGER</italic>. The performance of <italic>Partis</italic> was less optimal in DSNP than that of <italic>TIgGER</italic> but excelled in DALLELE and higher SHM datasets. As <italic>LymAnalyzer</italic> only reports SNPs, it was excluded from allele level comparisons. However, it also showed high sensitivities in all situations at the SNP level, although the specificities were less ideal. The performance of the other NADTs at the SNP level was similar to that at the allele level.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Sensitivity and specificity of novel allele detection for 5 NADTs based on four simulated datasets.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" rowspan="3" align="left">Type</th>
<th valign="top" rowspan="3" align="center">Measurement</th>
<th valign="top" rowspan="3" align="center">Tool</th>
<th valign="top" colspan="14" align="center">Dataset</th>
</tr>
<tr>
<th valign="top" colspan="4" align="center">DEXPR</th>
<th valign="top" colspan="4" align="center">DALLELE</th>
<th valign="top" colspan="4" align="center">DSNP</th>
<th valign="top" colspan="2" align="center">DSHM</th>
</tr>
<tr>
<th valign="top" align="center">~5%</th>
<th valign="top" align="center">~1%</th>
<th valign="top" align="center">~0.1%</th>
<th valign="top" align="center">~0.01%</th>
<th valign="top" align="center">50%</th>
<th valign="top" align="center">30%</th>
<th valign="top" align="center">10%</th>
<th valign="top" align="center">5%</th>
<th valign="top" align="center">1</th>
<th valign="top" align="center">3</th>
<th valign="top" align="center">5</th>
<th valign="top" align="center">7</th>
<th valign="top" align="center">0%</th>
<th valign="top" align="center">6%</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="10" align="left">
<bold>Allele level</bold>
</td>
<td valign="top" rowspan="5" align="left">
<bold>Sensitivity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.44</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.52</td>
<td valign="top" align="center">0.56</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.52</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.44</td>
<td valign="top" align="center">0.16</td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.52</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.64</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.32</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.48</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.92</td>
</tr>
<tr>
<td valign="top" rowspan="5" align="left">
<bold>Specificity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.96</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.17</td>
<td valign="top" align="center">0.44</td>
<td valign="top" align="center">0.24</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">0.33</td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.25</td>
<td valign="top" align="center">0.30</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.21</td>
<td valign="top" align="center">0.13</td>
<td valign="top" align="center">0.56</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.30</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.67</td>
</tr>
<tr>
<td valign="top" rowspan="10" align="left">
<bold>SNP level</bold>
</td>
<td valign="top" rowspan="5" align="left">
<bold>Sensitivity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.99</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.44</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.56</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.52</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.74</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.56</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.64</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.32</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.48</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.44</td>
<td valign="top" align="center">0.30</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.92</td>
</tr>
<tr>
<td valign="top" rowspan="5" align="left">
<bold>Specificity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.23</td>
<td valign="top" align="center">0.31</td>
<td valign="top" align="center">0.15</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.39</td>
<td valign="top" align="center">0.16</td>
<td valign="top" align="center">0.17</td>
<td valign="top" align="center">0.31</td>
<td valign="top" align="center">0.59</td>
<td valign="top" align="center">0.64</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.25</td>
<td valign="top" align="center">0.45</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.94</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.11</td>
<td valign="top" align="center">0.10</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.08</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.23</td>
<td valign="top" align="center">0.34</td>
<td valign="top" align="center">0.31</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.97</td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">0.10</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.39</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Heatmap of two-tailed p-values of paired t-tests between different subgroups with regard to sensitivity and specificity for different tools in four simulated datasets. Each row and column in the heatmap represents a subgroup. P-values below 0.05 are shown in red and p-values above 0.05 in blue.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g003.tif"/>
</fig>
<p>Taken together, <italic>TIgGER</italic>, <italic>IgDiscover</italic>, and <italic>Partis</italic> showed comparably high specificities and therefore the alleles identified were more reliable. <italic>IMPre</italic> and <italic>LymAnalyzer</italic> provided more allele candidates, but none of the NADTs performed well in all situations. However, each of these datasets was simulated with only one variable with a particular quantity to evaluate the effect of these quantitative measures on the performance of NADTs whereas real-world repertoires always consist of combinations of all variables in multiple quantitative measures.</p>
</sec>
<sec id="s2_4">
<title>Evaluation of 5 NADTs Using a Combination of Single-Cell and Bulk Sequencing Dataset</title>
<p>An ideal situation to test NADTs is to genotype all the V alleles in a genome and then compare them with NADTs&#x2019; predictions. However, given the high similarities of V alleles and other interspersed tandem sequences among them, sequencing this peculiar region of the genome alone is a challenging task (<xref ref-type="bibr" rid="B16">16</xref>). Therefore, we took an alternative approach by acquiring germline V allele sequences from single-cell repertoire sequencing of na&#xef;ve B cells (scRep-seq) and then conducted novel allele identification on the bulk Ig-seq datasets from the same donor. The na&#xef;ve state of antibody sequences and the super-high depth of the scRep-seq data ensured the accuracy of acquired germline sequences. Thus, this evaluation represents the real-world situation.</p>
<p>Three healthy donors were included in the real-world dataset benchmark. Briefly, we isolated na&#xef;ve B cells from the peripheral blood of these 3 donors with specific cell surface markers. Then sequencing libraries were constructed according to the protocols of Chromium Single Cell Human BCR Amplification Kit (10X Genomics) and sequenced on an Illumina platform (Materials and Methods). Each single-cell sequencing sample contained around 85 million to 91 million paired-end reads. With the single na&#xef;ve B cell sequencing dataset, we identified 4 unique NACs from 3 donors using a customized pipeline (<xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>, Materials and Methods). All identified NACs are minor alleles of the involved genes, with expression ratios to the major ones ranging from 0.19 to 0.89. Moreover, each of them only harbors one SNP compared to their nearest known alleles.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>NACs identified based on single na&#xef;ve B cell sequencing dataset from 3 donors.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Nearest known allele<sup>a</sup>
</th>
<th valign="top" align="center">Known allele</th>
<th valign="top" align="center"># Supportive contigs<sup>b</sup>
</th>
<th valign="top" align="center">Length (bp)</th>
<th valign="top" align="center">Start</th>
<th valign="top" align="center">End</th>
<th valign="top" align="center">SNP loci<sup>c</sup>
</th>
<th valign="top" align="center">Individual</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">IGHV7-4-1*02</td>
<td valign="top" align="center">IGHV7-4-1*02</td>
<td valign="top" align="center">44 (136, 0.32)</td>
<td valign="top" align="center">296</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">296</td>
<td valign="top" align="center">G92A</td>
<td valign="top" align="center">Donor1</td>
</tr>
<tr>
<td valign="top" align="left">IGHV3-30*18<sup>T</sup>
</td>
<td valign="top" align="center">IGHV3-30*18</td>
<td valign="top" align="center">126 (492, 0.26)</td>
<td valign="top" align="center">296</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">296</td>
<td valign="top" align="center">C72G</td>
<td valign="top" align="center">Donor2</td>
</tr>
<tr>
<td valign="top" align="left">IGHV3-7*03<sup>T</sup>
</td>
<td valign="top" align="center">IGHV3-7*03</td>
<td valign="top" align="center">96 (108, 0.89)</td>
<td valign="top" align="center">296</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">296</td>
<td valign="top" align="center">G46A</td>
<td valign="top" rowspan="2" align="center">Donor3</td>
</tr>
<tr>
<td valign="top" align="left">IGHV3-53*04<sup>T,G</sup>
</td>
<td valign="top" align="center">IGHV3-53*01</td>
<td valign="top" align="center">24 (126, 0.19)</td>
<td valign="top" align="center">293</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">293</td>
<td valign="top" align="center">T261C</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>
<bold>a</bold>, NACs identified by TIgGER using bulk sequencing of IgM sequences are marked with &#x201c;T&#x201d; while IgDiscover with &#x201c;G&#x201d;. <bold>b</bold>, The numbers in the parentheses denote the number of contigs supportive of its known germline variant in the second column and the ratio of the two germline variants. <bold>c</bold>, The indexes in SNP loci are 1-based. IGHV7-4-1*02_G92A is not included in the collected germline sequences (see <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;8</bold>
</xref>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>We then applied NADTs to the bulk sequencing datasets and compared their novel allele predictions. <italic>TIgGER</italic> identified three and <italic>IgDiscover</italic> identified one out of the four NACs (<xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>) while <italic>IMPre</italic> and <italic>Partis</italic> missed all of them. Although <italic>LymAnalyzer</italic> identified two positive SNPs from two NACs, it also falsely predicted 14 and 6 SNPs in these two alleles, respectively. In addition, we found two possible novel germline sequences that harbor a considerable number of mismatches with their nearest known germline sequences (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;2</bold>
</xref>). Notably, the novel germline sequence nearest to IGHV1-NL1*01 was identified in 2 of 3 enrolled donors.</p>
<p>Because of the limited number of <italic>bona fide</italic> novel alleles (4 NACs in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref> were deemed as <italic>bona fide</italic> novel alleles here), this evaluation was less comprehensive. We thus exploited the genuine Ig-seq dataset (donor1 and donor3) in another way. As many germline sequences were known through the single na&#xef;ve B cell sequencing dataset, we artificially generated &#x201c;novel&#x201d; alleles as those in the simulated dataset mentioned above and evaluated these NADTs in the same way (Materials and Methods). However, because there were not enough genes expressing at around 0.01%, we did not generate novel alleles at this level. Moreover, the allele ratio is hard to precisely infer even with the single na&#xef;ve B cell sequencing dataset and was thus left unstudied. We denoted the genuine dataset with different initial databases as GD-EXPR, GD-SNP, and GD-SHM.</p>
<p>The genuine dataset-based benchmark result exhibited a similar performance spectrum as that based on the simulated dataset (<xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>). These similarities included, <bold>i)</bold> each of the three studied factors was found to be influential for at least one NADT for novel allele detection (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;1</bold>
</xref>), <bold>ii)</bold> <italic>TIgGER</italic> and <italic>IgDiscover</italic> were superior to <italic>IMPre</italic> and <italic>Partis</italic> in both sensitivity and specificity for detecting novel alleles with multiple SNPs (i.e., 3, 5, and 7) (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;2</bold>
</xref>), <bold>iii)</bold> the SNP-level performance spectrum in general resembled that of allele level, <bold>iv)</bold> <italic>IMPre</italic> and <italic>Partis</italic> presented higher sensitivity and specificity for identifying SNPs in DSNP than for alleles, and <bold>v)</bold> <italic>LymAnalyzer</italic> remained the most sensitive but least specific NADT in identifying SNPs.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Sensitivity and specificity of novel allele detection for 5 NADTs based on genuine Ig-seq dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" rowspan="3" align="left">Type</th>
<th valign="top" rowspan="3" align="center">Measurement</th>
<th valign="top" rowspan="3" align="center">Tool</th>
<th valign="top" colspan="9" align="center">Dataset</th>
</tr>
<tr>
<th valign="top" colspan="3" align="center">GD-EXPR</th>
<th valign="top" colspan="4" align="center">GD-SNP</th>
<th valign="top" colspan="2" align="center">GD-SHM</th>
</tr>
<tr>
<th valign="top" align="center">~5%</th>
<th valign="top" align="center">~1%</th>
<th valign="top" align="center">~0.1%</th>
<th valign="top" align="center">1</th>
<th valign="top" align="center">3</th>
<th valign="top" align="center">5</th>
<th valign="top" align="center">7</th>
<th valign="top" align="center">IgM</th>
<th valign="top" align="center">IgG</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="5" align="left">
<bold>Allele level</bold>
</td>
<td valign="top" rowspan="5" align="left">
<bold>Sensitivity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.20</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.64</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.36</td>
<td valign="top" align="center">0.08</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.80</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">0.56</td>
<td valign="top" align="center">0.48</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.32</td>
</tr>
<tr>
<td valign="top" rowspan="5" align="left"/>
<td valign="top" rowspan="5" align="left">
<bold>Specificity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">0.57</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.14</td>
<td valign="top" align="center">0.57</td>
<td valign="top" align="center">0.41</td>
<td valign="top" align="center">0.52</td>
<td valign="top" align="center">0.31</td>
<td valign="top" align="center">0.57</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.27</td>
<td valign="top" align="center">0.33</td>
<td valign="top" align="center">0.03</td>
<td valign="top" align="center">0.28</td>
<td valign="top" align="center">0.21</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.23</td>
<td valign="top" align="center">0.27</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.33</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">0.17</td>
<td valign="top" align="center">0.12</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.18</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.17</td>
<td valign="top" align="center">0.43</td>
</tr>
<tr>
<td valign="top" rowspan="5" align="left">
<bold>SNP level</bold>
</td>
<td valign="top" rowspan="5" align="left">
<bold>Sensitivity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.04</td>
<td valign="top" align="center">0.48</td>
<td valign="top" align="center">0.71</td>
<td valign="top" align="center">0.18</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.80</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.96</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">0.56</td>
<td valign="top" align="center">0.48</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.17</td>
<td valign="top" align="center">0.11</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.32</td>
</tr>
<tr>
<td valign="top" rowspan="5" align="left"/>
<td valign="top" rowspan="5" align="left">
<bold>Specificity</bold>
</td>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">0.57</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.18</td>
<td valign="top" align="center">0.57</td>
<td valign="top" align="center">0.49</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">0.53</td>
<td valign="top" align="center">0.57</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.03</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.10</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.12</td>
<td valign="top" align="center">0.05</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.43</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">1.00</td>
</tr>
<tr>
<td valign="top" align="left">LymAnalyzer</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.01</td>
<td valign="top" align="center">0.02</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.00</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">0.08</td>
<td valign="top" align="center">0.08</td>
<td valign="top" align="center">0.00</td>
<td valign="top" align="center">0.09</td>
<td valign="top" align="center">0.08</td>
<td valign="top" align="center">0.11</td>
<td valign="top" align="center">0.07</td>
<td valign="top" align="center">0.08</td>
<td valign="top" align="center">0.34</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>However, several variations were also remarkable and they included, <bold>i)</bold> for genuine datasets, <italic>TIgGER</italic> and <italic>IgDiscover</italic> performed better in identifying novel alleles expressed at a low level (i.e. ~0.1%) than for the simulated dataset; both were thus superior to <italic>IMPre</italic> and <italic>Partis</italic>, <bold>ii)</bold> although <italic>Partis</italic> remained excellent in overcoming SHM noise, it was outperformed by <italic>IgDiscover</italic> (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;2</bold>
</xref>), which exhibited a surprisingly high sensitivity of 0.80 and specificity of 1.00 at both SNP and allele levels, <bold>iii)</bold> <italic>IgDiscover</italic> manifested significantly higher sensitivities and specificities than <italic>TIgGER</italic> in three datasets, and <bold>iv)</bold> <italic>LymAnalyzer</italic> displayed low and negligible specificities.</p>
<p>To seek the underlying reasons accounting for these discrepancies, we assessed the output of these NADTs as well as the properties of each input dataset. We found that the inferior performance of <italic>IgDiscover</italic> and <italic>TIgGER</italic> on DEXPR in the simulated dataset was caused by low sequence identities to the germline (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;3</bold>
</xref>), which was caused by sequencing errors simulated by ART, the NGS simulator. Similarly, the failure in DSHM for <italic>IgDiscover</italic> could also be attributed to the paucity of unmutated sequences as a consequence of the simulation of SHMs and NGS errors (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;4</bold>
</xref>). In contrast, the success in GD-SHM for <italic>IgDiscover</italic> indicated that a number of unmutated sequences also exist in IgG dataset. We therefore determined the frequency of such sequences for each simulated novel allele and found that it ranged from 0.07% to 1.11%, with a median of 0.31% (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;4</bold>
</xref>), which agrees with reported values of previous studies (<xref ref-type="bibr" rid="B17">17</xref>, <xref ref-type="bibr" rid="B18">18</xref>). Interestingly, <italic>TIgGER</italic> failed to detect any novel alleles from either simulated or genuine datasets containing SHMs, even though its algorithm is expected to be more robust to datasets with SHMs. Our in-depth analyses showed that <italic>TIgGER</italic> failed to identify novel alleles for DSHM and GD-SHM for different reasons. For DSHM, the simulated SHMs created an overly-diversified repertoire, in which plurality sequences for each novel allele were too rare to pass the threshold <italic>min_seqs</italic>. In GD-SHM, by contrast, the diversity of sequences perfectly matching novel alleles failed to meet the default threshold <italic>j_max</italic>. In addition, we noted a remarkable difference in the diversity filtration criterion between <italic>TIgGER</italic> and <italic>IgDiscover</italic>: <italic>TIgGER</italic> employs a quantitative filtration (<italic>j_max</italic>) whereas <italic>IgDiscover</italic> uses a qualitative filtration (<italic>CDR3_exact</italic>). When considering only the diversity criterion, <italic>TIgGER</italic> is stricter than <italic>IgDiscover</italic>, and this explains the compromised performance of <italic>TIgGER</italic>. Finally, the lower specificity of <italic>LymAnalyzer</italic> in the genuine dataset may result from the non-independent mutation events in a genuine dataset that tend to be interpreted as SNPs according to its algorithm.</p>
<p>Together, we concluded that <bold>i)</bold> <italic>TIgGER</italic> and <italic>IgDiscover</italic> outperform all other NADTs considering both sensitivity and specificity in most situations, <bold>ii)</bold> <italic>Partis</italic> is characterized by remarkable robustness in overcoming the challenge imposed by SHMs, <bold>iii)</bold> <italic>IMPre</italic> is outstanding in detecting minor alleles, and <bold>iv)</bold> <italic>LymAnalyzer</italic> is sensitive at the cost of specificity.</p>
</sec>
<sec id="s2_5">
<title>Forty-Three NACs Are Identified From a Total Number of 687 Ig-Seq Datasets</title>
<p>With the knowledge obtained above, we designed a scheme to identify reliable NACs using 4 NADTs (excluding <italic>LymAnalyzer</italic>) from bulk Ig-seq datasets. As intrinsic features (i.e. expression level, allele ratio, and number of SNPs to the nearest allele) of novel alleles were unknown, we took into account the overall performance of each NADT summarized above and gave more credit to <italic>TIgGER</italic> and <italic>IgDiscover</italic>. We classified all Ig-seq datasets into two groups with regard to the SHM richness according to the isotypes (Materials and Methods). For IgM datasets, NACs found by at least 2 NADTs with at least one being either <italic>TIgGER</italic> or <italic>IgDiscover</italic> were retained. For datasets in which SHMs were expected to be enriched, only NACs called by two out of three NADTs, namely <italic>TIgGER</italic>, <italic>IgDiscover</italic> and <italic>Partis</italic>, were retained.</p>
<p>We then explored the efficiency of this scheme in identifying NACs from a total number of 424 Ig-seq datasets either generated in-house or from the public resource (Materials and Methods). The selected datasets stemmed from 382 donors and were all obtained from RNA samples amplified with RACE (rapid amplification of cDNA ends) protocols. Detailed metadata for these datasets are outlined in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;3</bold>
</xref>. According to the dataset classification criteria (Materials and Methods), we obtained 336 (79.2%) SHM-rich datasets (enriched for IgG sequences) and 88 (20.8%) SHM-sparse datasets (enriched for IgM sequences) (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>). Despite the lower fraction in overall datasets, IgM datasets contain more reads than IgG datasets (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4B</bold>
</xref>). Applying the selected 4 NADTs to these datasets, we found clear differences between the four NADTs in both the number of samples identified with NACs and the number of unique NACs (<xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>). <italic>IMPre</italic> discerned NACs for 71.0% of the datasets, whereas the other three NADTs found NACs for only 16.3% to 18.2% of the datasets. Moreover, the other three NADTs reported NACs for a sharply lower (over 10-fold) percentage of SHM-rich datasets than SHM-sparse datasets, which was likely due to more SHMs and a lower number of input reads, both of which reduced the confidence of these NADTs in making novel calls. In contrast, <italic>IMPre</italic> reported NACs for a large fraction of IgG datasets (63.4%) and also more NACs overall for individual samples (<xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref> and <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4C</bold>
</xref>), which likely reflects its higher sensitivity to those underrepresented sequences (<xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>). However, the genuine sensitivity and specificity for the NADTs were elusive through these bulk sequencing datasets, for which we have no access to the genotype information.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Quantitative characterization of Ig-seq datasets and NACs identified by 4 NADTs. <bold>(A)</bold> Composition of IgM (SHM-sparse) and IgG (SHM-enriched) datasets. <bold>(B)</bold> Density of Ig-seq datasets with different numbers of reads. <bold>(C)</bold> Correlation between the number of NACs for each dataset and the number of reads. Note that only datasets reported with NACs by a certain tool are included.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g004.tif"/>
</fig>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Quantitative summary of NACs identified from Ig-seq datasets by 4 NADTs.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">NADTs</th>
<th valign="top" align="center"># Datasets (IgG)</th>
<th valign="top" align="center"># Datasets (IgM)</th>
<th valign="top" align="center"># Unique novels (IgG)</th>
<th valign="top" align="center"># Unique novels (IgM)</th>
<th valign="top" align="center"># Datasets (total)</th>
<th valign="top" align="center"># Unique novels (total)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">TIgGER</td>
<td valign="top" align="center">1 (0.3)</td>
<td valign="top" align="center">68 (77.3)</td>
<td valign="top" align="center">6 (0.8)</td>
<td valign="top" align="center">57 (4.8)</td>
<td valign="top" align="center">69 (16.3)</td>
<td valign="top" align="center">57 (4.8)</td>
</tr>
<tr>
<td valign="top" align="left">IMPre</td>
<td valign="top" align="center">213 (63.4)</td>
<td valign="top" align="center">88 (100.0)</td>
<td valign="top" align="center">740 (96.1)</td>
<td valign="top" align="center">1033 (86.5)</td>
<td valign="top" align="center">301 (71.0)</td>
<td valign="top" align="center">1033 (86.5)</td>
</tr>
<tr>
<td valign="top" align="left">IgDiscover</td>
<td valign="top" align="center">15 (4.5)</td>
<td valign="top" align="center">62 (70.5)</td>
<td valign="top" align="center">16 (2.1)</td>
<td valign="top" align="center">50 (4.2)</td>
<td valign="top" align="center">77 (18.2)</td>
<td valign="top" align="center">50 (4.2)</td>
</tr>
<tr>
<td valign="top" align="left">Partis</td>
<td valign="top" align="center">4 (1.2)</td>
<td valign="top" align="center">65 (73.9)</td>
<td valign="top" align="center">12 (1.6)</td>
<td valign="top" align="center">101 (8.5)</td>
<td valign="top" align="center">69 (16.3)</td>
<td valign="top" align="center">101 (8.5)</td>
</tr>
<tr>
<td valign="top" align="left">
<bold>Total</bold>
</td>
<td valign="top" align="center">
<bold>215 (64.0)</bold>
</td>
<td valign="top" align="center">
<bold>88 (100.0)</bold>
</td>
<td valign="top" align="center">
<bold>770</bold>
</td>
<td valign="top" align="center">
<bold>1194</bold>
</td>
<td valign="top" align="center">
<bold>303 (71.5)</bold>
</td>
<td valign="top" align="center">
<bold>1194</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The number in each pair of parentheses indicates the corresponding percentage (%) of each item. For columns indicating the number of datasets, the associated percentages were calculated based on the total number of datasets (or of a specific type, see <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4A</bold>
</xref>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Applying this scheme to 424 Ig-seq datasets, we identified 23 and 2 reliable NACs from SHM-sparse and SHM-rich group, respectively (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;4</bold>
</xref>). One NAC, IGHV3-33*01_G72C, was identified in both groups. Three of the 24 unique NACs were found to harbor more than one SNP relative to their corresponding nearest alleles, while eleven were found in more than one donor (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>). The most frequent NAC was found in 29 donors. Notably, 17 of the 24 NACs can also be identified from public databases or independent reports in the literature (Materials and Methods) (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;4</bold>
</xref>), which demonstrated the high efficiency of our scheme. To enlarge the knowledge database of NACs, we also included 263 multiplex datasets we collected in a previous study into our analysis (<xref ref-type="bibr" rid="B14">14</xref>). These latter datasets were derived from 71 donors and consisted of 186 SHM-rich datasets and 77 SHM-sparse datasets. Considering the degenerate primers designed against framework region 1 (FR1) of V genes, we considered only the sequence downstream of FR1 for each NAC for these multiplex datasets. Applying the same scheme to these datasets as to RACE datasets, we identified in total 22 NACs (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;5</bold>
</xref>) and found that 21 of them were from SHM-sparse datasets. Eleven of the 22 NACs were cross-validated in previous public or published resources. Combining the two NAC sets, we identified 43 unique NAC sequences from a total number of 687 Ig-seq datasets (3 NACs were found in both RACE and multiplex dataset).</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Twenty-four NACs identified from 424 Ig-seq datasets amplified using RACE protocol. The top bar graph shows the number of supportive samples and donors. The bottom scatter plot shows the set of tools identifying a typical NAC. Numbers in the x-axis labels are 1-based positions of SNPs (refer also to <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;4</bold>
</xref>).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g005.tif"/>
</fig>
<p>We then characterized these 43 NACs and found that all NACs derive from core V genes we defined in a previous study according to their prevalence in antibody repertoires (<xref ref-type="bibr" rid="B14">14</xref>) (<xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6A</bold>
</xref>). This result further suggested that gene usage is critical in novel allele identification through Ig-seq dataset. Furthermore, the number of NACs did not correlate with the number of known alleles for a typical gene (Pearson correlation coefficient: 0.43) (<xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6B</bold>
</xref>). However, IGHV1-69, the gene with the second largest number of known polymorphisms in IMGT, was found with up to 10 additional NACs. Since germline V gene polymorphisms have been implicated in immune response capability (<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B3">3</xref>), these NACs will facilitate the elucidation of the role of germline variants in disease susceptibility. Finally, we classified all identified SNPs (n=75) into two categories, replacement (R) SNPs and silent (S) SNPs, according to the variation of encoded amino acids. Overall, the R/S ratio for these SNPs was around 2 (1.88) (<xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6C</bold>
</xref>). Nevertheless, the R/S ratio was larger for complementarity-determining region (CDR) SNPs (2.78) than framework region (FR) SNPs (1.41), which indicated a varied selection pressure between FRs and CDRs.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Characterization of 43 unique NACs identified from Ig-seq datasets. <bold>(A)</bold> Overlap between genes identified with NACs and 52 core genes defined in a previous study (Yang <italic>et al.</italic>, 2021). <bold>(B)</bold> Correlation between the number of NACs and the number of known alleles (from IMGT/GENE-DB) for a typical gene. The Pearson correlation coefficient is 0.43. Note only 48 of 52 core genes were included in the germline reference sequences. <bold>(C)</bold> Comparisons of R/S (Replacement/Silent) ratio of SNPs in the framework regions (FRs), complementarity-determining regions (CDRs) and both kinds of regions (all). Numbers at the top of the doughnut chart denote R/S ratios.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g006.tif"/>
</fig>
</sec>
</sec>
<sec id="s3" sec-type="discussion">
<title>Discussion</title>
<p>In this study, we comprehensively compared 5 NADTs with an emphasis on their performance in different scenarios. We identified 43 credible NACs through the filtration criteria informed by our benchmark results. We found that these NADTs possess a varied array of functionalities and distinct algorithms implemented in different languages (<xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> and <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;1</bold>
</xref>). By exploiting a combination of <italic>in silico</italic> simulated and genuine Ig-seq datasets, we provided scenario-specific performance spectrums for these NADTs. As summarized in the <italic>Results</italic> section, both <italic>TIgGER</italic> and <italic>IgDiscover</italic> hit a greater balance between sensitivity and specificity in most scenarios than the other NADTs. In contrast, <italic>LymAnalyzer</italic> reported the greatest number of polymorphisms among NADTs, achieving the highest sensitivity in all scenarios, however, at a great cost of specificity. <italic>Partis</italic> and <italic>IMPre</italic> were superior in overcoming challenges brought by SHMs and scarcity of minor alleles, respectively.</p>
<p>Counterintuitively, in our study <italic>IgDiscover</italic> rather than <italic>TIgGER</italic> exhibited higher efficiency in detecting novel alleles from SHM noise in GD-SHM. After careful examination, we identified the difference in candidate novel allele filtration between them. The quantitative filtration employed by <italic>TIgGER</italic> by default is mathematically stricter than the qualitative filtration used by <italic>IgDiscover</italic>. Given this, together with the fact that a non-negligible fraction of unmutated sequences is also present in IgG repertoires (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure&#xa0;4</bold>
</xref>), <italic>IgDiscover</italic> outperformed <italic>TIgGER</italic> even in SHM-enriched scenarios. However, ample sequencing depth is a prerequisite, because it favors the presence of enough unmutated sequences needed by <italic>IgDiscover</italic> to detect novel alleles. This is also true for all other NADTs because gene expression level was confirmed to be a general limitation for all NADTs in both simulated and genuine Ig-seq datasets (<xref ref-type="table" rid="T3">
<bold>Tables&#xa0;3</bold>
</xref> and <xref ref-type="table" rid="T5">
<bold>5</bold>
</xref>).</p>
<p>Although the unexpected observation above was not obtained by Ralph et&#xa0;al. (<xref ref-type="bibr" rid="B7">7</xref>), they provided evidence that <italic>TIgGER</italic> can be completely compromised when handling datasets with typically high SHM, which was possibly due to either the rarity of plurality sequences or the insufficient diversity of unmutated sequences. To maintain the originality of each NADT, we did not alter the suggested parameters and the results here may thus not represent the optimal performance for them. It is very likely that one can obtain greatly improved results when some key parameters are fine-tuned, a strategy that has been employed by Mikocziova et&#xa0;al. (<xref ref-type="bibr" rid="B19">19</xref>). Despite the compromised sensitivity for <italic>TIgGER</italic> on particular datasets in this study, we agree with Ralph et&#xa0;al. that <italic>IgDiscover</italic> and <italic>TIgGER</italic> are more specific in novel allele detection than other NADTs, a major consideration in assigning more weight to them in the filtration scheme. We also noted some differences from Ralph <italic>et al.</italic> In their study, the number of SNPs by which a novel allele departs from its nearest known allele (within a range from 1 to 3) was shown to exert negligible influence on <italic>Partis</italic>&#x2019;s performance. However, our result revealed remarkable performance variance in detecting novel alleles separated from their nearest counterparts by different numbers of SNPs (i.e. 1 <italic>vs</italic> 3). This variance is probably caused by an error-prone procedure that <italic>Partis</italic> tries to manage &#x2013; &#x201c;comparing multiple hypotheses&#x201d; (through which a complete set of individual SNPs contributing to a novel allele is determined). Noteworthy is that the step of initial removal of less-likely alleles in some cases can worsen the detection task for <italic>Partis</italic> because it can remove those less-likely but <italic>bona fide</italic> novel alleles that appear to harbor more than one SNP.</p>
<p>Given all these findings, we suggest that future studies exploit the strengths of different NADTs and present NACs based on the consensus of more than one NADT whenever genomic validation is unavailable, since none of the NADTs excels in all scenarios.</p>
<p>It should be noted that we considered only a single variable at a time. However, in real-world scenarios, a mixture of challenges represented by these studied variables coexists and thus further complicates novel allele detection tasks. Moreover, we left polymorphisms of nucleotide insertion and deletion (INDEL) unaddressed because the algorithms employed by some NADTs are intrinsically incapable of capturing them (<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;1</bold>
</xref>). Nevertheless, INDELs cannot be neglected, especially in species whose germline sets are far from complete. In such cases, <italic>IgDiscover</italic> and <italic>IMPre</italic> are the only choices currently. Finally, this study only focused on evaluation of NADTs&#x2019; performance based on antibody heavy chain repertoire datasets. Their efficiency with light chain and TCR repertoire datasets may vary due to differences inherent to these sequences (e.g. absence of SHM for TCR sequences).</p>
<p>Despite these limitations, our study based on a composite benchmark dataset provides insights into the performance of different NADTs and thus can guide bioinformaticians and immunologists in tool selection in future novel allele detection through these NADTs. Together with the flexible simulation tool and the NACs identified, our study may serve as a valuable reference and resource for research on immunoglobulin loci germline diversity as well as Ig-seq-based studies.</p>
</sec>
<sec id="s4" sec-type="materials|methods">
<title>Materials and Methods</title>
<sec id="s4_1">
<title>Samples From Human Subjects</title>
<p>A total of 28 samples from peripheral blood, tumor and normal tissues, and bone marrow were collected. Of these, 7 peripheral blood samples were derived from healthy individuals (without recent infection events), 6 peripheral blood samples were from hepatitis B virus-infected patients, 1 bone marrow sample and 2 peripheral blood samples were from graft-<italic>versus</italic>-host disease (GvHD) patients, 4 peripheral blood samples, 1 normal intestine sample and 2 intestine tumor samples were from colorectal cancer (CRC) patients, 2 peripheral blood samples were from individuals involved in traffic accidents, and 3 peripheral blood samples were from patients with adolescent idiopathic scoliosis, sore throat, and chronic pharyngitis, respectively. Peripheral blood mononuclear cells (PBMCs) and bone marrow mononuclear cells were isolated using Ficoll (TBD Science) density-gradient centrifugation. The tissues were cut into small pieces and ground with liquid nitrogen. These experiments were handled under the guidelines of the Ethics Committee of Southern Medical University. For human na&#xef;ve B cell isolation, PBMCs were counted and washed with DPBS supplemented with 1% bovine serum albumin (BSA), and then were stained with a cocktail of fluorescent conjugated antibodies, including ECD-CD19 (Beckman Coulter, A07770), FITC-IgD (Beckman Coulter, B30652), APC-CD27 (BD Bioscience, 561400), and 7-AAD (BD Bioscience, 559925). Human na&#xef;ve B cells (CD19+IgD+CD27-7-AAD-) were sorted using a cell sorter (MoFlo XDP, Beckman Coulter) and collected for single-cell V(D)J sequencing.</p>
</sec>
<sec id="s4_2">
<title>Library Preparation and High-Throughput Sequencing</title>
<p>RNA purification was carried out using the RNeasy Mini Kit (Qiagen, 74106) according to the manufacturer&#x2019;s instructions. Total RNA was used as a template to synthesize cDNA with a SMARTer RACE (Rapid Amplification of cDNA Ends) cDNA Amplification Kit (Clontech, 634928) according to the manufacturer&#x2019;s protocol. Heavy chain variable regions were amplified using 1 &#x3bc;l of RT reaction product and 10 pmol of each primer in a 50 &#x3bc;l total reaction volume (KAPA HiFi HotStart ReadyMix, Roche) using the following thermal cycling program: 95&#xb0;C for 3&#xa0;min; 30 cycles of 98&#xb0;C for 20 s, 60&#xb0;C for 15 s, and 72&#xb0;C for 15 s; 72&#xb0;C for 5&#xa0;min. PCR products were purified using the Nucleospin Gel &amp; PCR Clean-up kit (Macherey-Nagel, 704609.25) and subjected to library preparation using VAHTS Universal DNA Library Prep Kit (Vazyme, ND607-01). Libraries were quantified by capillary electrophoresis (Bio-Fragment analyzer, Bioptic). After quantification, libraries were pooled and sequenced on an Illumina platform (MiSeq PE300). All primers are listed in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table&#xa0;6</bold>
</xref>.</p>
</sec>
<sec id="s4_3">
<title>10X Genomics Single Cell Processing and Next Generation Sequencing</title>
<p>The concentration of the single cell suspension was counted and adjusted to 1000 cells/&#x3bc;l. The single cell suspensions were loaded onto the Chromium Controller microfluidics device (10X Genomics) and processed using Chromium Next GEM Single Cell 5&#x2019; Kits v2 according to manufacturer&#x2019;s protocol. The remaining procedures, including library construction, were performed according to the protocols of the Chromium Single Cell Human BCR Amplification Kit (10X Genomics). Following library construction, the BCR libraries were sequenced on an Illumina platform (NovaSeq 6000) using 2&#xd7;150bp kit.</p>
</sec>
<sec id="s4_4">
<title>Integrated and Modular Pipeline for Antibody Repertoire Simulation</title>
<p>
<italic>IMPlAntS</italic> (<bold>I</bold>ntegrated and <bold>M</bold>odular <bold>P</bold>ipe<bold>l</bold>ine for <bold>Ant</bold>ibody Repertoire <bold>S</bold>imulation) was developed to mimic real-world antibody repertoires as closely as possible and to meet the requirements of this study (i.e. minor allele frequency control and NGS data simulation).</p>
<p>As mentioned in Results, <italic>IMPlAntS</italic> consists of three consecutive steps, i) generation of independent V(D)J rearrangements; ii) generation of BCRs with SHMs of proper phylogenetic structure within clones; and iii) generation of NGS reads incorporating base errors (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>). These steps can be implemented individually or collectively using the corresponding scripts hosted on github (<uri xlink:href="https://github.com/Xiujia-Yang/IMPlAntS">https://github.com/Xiujia-Yang/IMPlAntS</uri>).</p>
<p>In the first step, a customizable number of independent rearranged sequences are <italic>in silico</italic> simulated by considering two major features of the real-world rearrangement repertoire: preferential gene usage and junctional nucleotide modification (P and N nucleotide insertions and deletions). To investigate the influence of allelic diversity on novel allele identification, we equipped <italic>IMPlAntS</italic> with the ability to simulate alleles of a certain gene with varied ratios (only two alleles are supported), which can be customized by modifying the gene usage configuration file. Moreover, <italic>IMPlAntS</italic> also allows simulation of nonproductive rearrangements and their percentages in antibody repertoire can be specified by users for specific aims. Notably, the four simulated datasets in this study include only productive rearrangements.</p>
<p>The second step can be further divided into two stages: generation of clonally related sequences with proper phylogenetic structure, and generation of various numbers of replicates for each sequence. Clonally related sequences are created by a certain number of iterations (to mimic the affinity maturation of real-world antibody sequences) where SHMs are induced for randomly selected sequences across the variable region based on the positional mutability and substitution models similar to Yermanos et&#xa0;al. (<xref ref-type="bibr" rid="B13">13</xref>). In each iteration, a fraction of sequences in the current sequence pool are randomly selected for SHM simulation and new sequences with simulated SHMs will be added into the current sequence pool that will be subjected to random selection in the next iteration. Independent rearranged sequences serve as the input in the first iteration. Because the positional mutability model stores mutation probabilities for different positions observed in end repertoires (repertoires containing sequences that have already undergone multiple rounds of maturation), a parameter named &#x2018;--mut_ability_fold&#x2019; (less than 1) is introduced here to prevent the generation of hyper-mutated sequences after a number of iterations. Iterations above produce nonredundant clonally related sequences. Then selected sequences will be populated according to the power law (<xref ref-type="bibr" rid="B20">20</xref>) to mimic the clonal expansion of B cells with various numbers of replicates. The key parameters in this step, including the number of iterations, the maximum number of sequences, the alpha value of the power law, and the largest size of sequences, are customizable. ART is employed in the last step to produce NGS data with Illumina MiSeq system settings.</p>
<p>For the above steps, parameters of gene usage, junctional modification, positional mutability and substitution models, were obtained from a population-level antibody repertoire study (<xref ref-type="bibr" rid="B14">14</xref>) and are set as defaults of <italic>IMPlAntS</italic>. Gene usage is calculated as the percentage of clones (sets of sequences sharing the same V and J gene and CDR3 nucleotide sequence) in a repertoire recombined from a certain gene. In this study, V, D and J gene usages are taken from normalized medians of gene usages from 2152 antibody repertoires of 582 donors. Junctional modification parameters consist of 10 entities (i.e. V3D, V3P, N1, D5D, D5P, D3D, D3P, N2, J5D and J5P (D, deleted nucleotide; P, palindromic nucleotide; N, nontemplated nucleotide), as demonstrated also in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>). The probabilities of modification lengths for each of these entities are derived from the observation of a combination of 2152 antibody repertoires of 582 donors. The positional mutability and substitution models were obtained from IgG repertoires of PBMC from 353 healthy donors. All parameters above can be found on GitHub and are set as defaults by <italic>IMPlAntS</italic>. <xref ref-type="supplementary-material" rid="SF1">
<bold>Supplementary Figure&#xa0;5</bold>
</xref> and <xref ref-type="supplementary-material" rid="SF1">
<bold>Supplementary Figure&#xa0;3B</bold>
</xref> show the approximation of the real-world repertoire for repertoires in the four simulated datasets in this study.</p>
</sec>
<sec id="s4_5">
<title>Customization of Reference Sequences With Artificially &#x2018;Novel&#x2019; V Alleles</title>
<p>In this study, the germline reference sequences for V, D, and J genes were obtained from IMGT/GENE-DB and are provided as <xref ref-type="supplementary-material" rid="SM7">
<bold>Supplementary Table&#xa0;7</bold>
</xref>. The artificially &#x201c;novel&#x201d; alleles for V genes were created for both the simulated and the real Ig-seq datasets. Only germline reference sequences used in the simulation were extracted to serve as the initial reference sequences for the simulated dataset. The set of alleles subject to the artificial SNP generation for each dataset was selected according to the criteria defined in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. We randomly created SNPs in the sequence of selected alleles. These artificial SNPs were set to locate in the first 280 bp of V genes at the 5&#x2019; ends to avoid the possible failure in novel allele detection caused by junctional modification. A pitfall here is that there exists a possibility that the rearranged sequences fail to be best aligned against the artificially novel sequences, and this brings challenges in the evaluation of novel allele identification for NADTs. Therefore, we performed pairwise alignment between customized reference sequences and the germline sequences contained in each dataset and removed those unaltered allele sequences that were found to be more similar to the germline sequences than the &#x201c;novel&#x201d; allele sequences. The novel alleles identified by NADTs were in fact the real-world germline sequences, while &#x201c;novel&#x201d; is just a concept relative to the altered germline reference sequences.</p>
</sec>
<sec id="s4_6">
<title>Pipeline and Parameters Employed by 5 NADTs</title>
<p>The paired-end simulated dataset and bulk sequencing dataset were first assembled using <italic>PEAR</italic> (v0.9.6). The successfully assembled sequences were then taken as the input for <italic>IgDiscover</italic> and <italic>LymAnalyzer</italic>. As <italic>TIgGER</italic> can only accept a formatted database of well-annotated sequences as input, we further annotated and formatted the assembled sequences with <italic>IgBLAST</italic> (v2.8.0+) and <italic>Change-O</italic> toolkits (v0.4.4), respectively (<italic>IgBLAST</italic> was selected for its excellent performance (<xref ref-type="bibr" rid="B20">20</xref>) and easy output format conversion through <italic>Change-O</italic> toolkits). For <italic>IMPre</italic> and <italic>Partis</italic>, the input assembled sequences were first corrected to the forward orientation. The script employed by <italic>IMPre</italic> (&#x2018;<italic>IMPre.pl</italic>&#x2019;) was modified to enable germline reference customization. The revised script, &#x2018;<italic>IMPre_revised.pl</italic>&#x2019;, can be found on GitHub (<uri xlink:href="https://github.com/Xiujia-Yang/IMPlAntS">https://github.com/Xiujia-Yang/IMPlAntS</uri>). All parameters used by the five NADTs were set to their default or suggested values. The detailed command-line parameters are provided below:</p>
<list list-type="simple">
<list-item>
<p>&#x2022; <italic>TIgGER (v0.4.0)</italic>:</p>
</list-item>
<list-item>
<p>&gt;<bold>findNovelAlleles</bold>(SampleDb, GermlineIGHV, nproc=4)</p>
</list-item>
<list-item>
<p>&#x2022; <italic>IMPre (v1.1.0)</italic>:</p>
</list-item>
<list-item>
<p>&gt;<bold>perl IMPre_revised.pl</bold> -i *.fasta -n sample_name -o output_directory -v_min_e 1 -j_min_e 1 -vm 50 -jm 60 -v_seed 200 -vn 300 -jf_ave 2 -known_v customized_v_reference -known_j customized_j_reference</p>
</list-item>
<list-item>
<p>&#x2022; <italic>IgDiscover (v0.12.3)</italic>:</p>
</list-item>
<list-item>
<p>&gt;<bold>igdiscover init</bold> --db customized_database --single-reads *.fastq sample_name</p>
</list-item>
<list-item>
<p>&gt;cd sample_name &amp;&amp; <bold>igdiscover run</bold>
</p>
</list-item>
<list-item>
<p>&#x2022; <italic>LymAnalyzer (v1.2.2)</italic>:</p>
</list-item>
<list-item>
<p>&gt;java -jar -Xmx8g <bold>LymAnalyzer_cmd_1.2.2.jar</bold> *.fastq result_folder IGH hs sample_name Yes No 5 reference_directory</p>
</list-item>
<list-item>
<p>&#x2022; <italic>Partis (v0.16.0)</italic>:</p>
</list-item>
<list-item>
<p>&gt;<bold>/partis/bin/partis cache-parameters</bold> --infname *.fasta --parameter-dir parameter_directory --n-procs 16</p>
</list-item>
</list>
</sec>
<sec id="s4_7">
<title>Sensitivity and Specificity Calculation</title>
<p>Sensitivity is defined as the proportion of true positives that are correctly identified among all true positives, whereas specificity is defined as the proportion of true positives among all the identified positives. For individual SNPs (SNP level), a hit is considered as a true positive only when its nearest allele (same as the allele selected for artificial SNP generation), loci and nucleotide variant are correct at the same time. For individual sequences (allele level), a hit is considered a true positive only when it covers all the genuine SNPs and contains no mismatches with the genuine novel sequence in all other reported loci. A schematic diagram is provided here to demonstrate the cases of true positive and false positive in identifying individual sequences for novel alleles (<xref ref-type="fig" rid="f7">
<bold>Scheme 1</bold>
</xref>).</p>
<fig id="f7" position="float">
<label>Scheme&#xa0;1</label>
<caption>
<p>Schematic diagram of true positive and false positive in novel allele detection. The top sequence in bold represents the genuine novel sequence while the bottom sequences represent the partial/full-length sequences discovered by NADTs. The nucleotides marked in green represent the genuine SNPs while those in red are mismatches with the genuine novel sequence either in SNP loci or non-SNP loci. An identified sequence is accepted as a true positive only when it covers all the genuine SNPs and contains no mismatch with the genuine novel sequence in all other loci.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-12-739179-g007.tif"/>
</fig>
</sec>
<sec id="s4_8">
<title>Germline V Allele Identification Through Single Na&#xef;ve B Cell Sequencing Dataset</title>
<p>
<italic>Cell Ranger</italic> (v3.1.0) was used to preprocess the raw single na&#xef;ve B cell sequencing dataset. Contig assembly, annotation, and clonotype analysis were performed using &#x201c;cellranger vdj&#x201d; with the <italic>Cell Ranger</italic> V(D)J compatible reference (refdata-cellranger-vdj-GRCh38-alts-ensembl-3.1.0). Then the assembled <italic>contig</italic> sequences (&#x201c;all_contig.fasta&#x201d;) of the two replicates for each donor were pooled and then annotated using <italic>IgBLAST</italic> (v2.8.0+) with the germline references obtained from IMGT/GENE-DB (refer to above). Afterwards, the V segment (or allele) sequence was extracted from each annotated sequence and then each unique V segment sequence was counted. It is worth mentioning here that short V segment sequences were merged into longer ones provided that they had the same V allele annotation as the longer ones and were contained within them. The counts for the short V segment sequences were also added to the longer ones. We discarded those with a length less than 290 bp or with a count less than 10 and determined the most frequent V segment sequence for each gene as the most confident germline sequence for a gene. Apart from that, we also retained the second most frequent V segment sequence for a gene provided that its abundance was at least one tenth of that of the most frequent one (<xref ref-type="bibr" rid="B21">21</xref>).</p>
</sec>
<sec id="s4_9">
<title>Ig-Seq Dataset Classification Criteria</title>
<p>All enrolled Ig-seq datasets (i.e. 424 RACE datasets and 263 multiplex datasets mentioned in the <italic>Results</italic> section) were analyzed using MiXCR (v3.0.7) per the method in our previous study (<xref ref-type="bibr" rid="B14">14</xref>). After clonotype assembly, a constant gene will be assigned for each clone if antibody sequences from this clone cover the constant region. The isotype (i.e. IgM, IgD, IgG, IgA, and IgE) was extracted for each clone and the clone-level isotype frequency was calculated for each dataset. IgM and IgD are deemed as SHM-sparse isotypes while IgG, IgA and IgE are deemed as SHM-enriched isotypes (<xref ref-type="bibr" rid="B22">22</xref>). Datasets will be classified as IgM datasets if they contain more SHM-sparse isotypes than SHM-enriched isotypes, and as IgG datasets otherwise. Constant genes were required to be assigned for more than half of the clones in each dataset. All 687 Ig-seq datasets we enrolled in this study met this requirement.</p>
</sec>
<sec id="s4_10">
<title>V Allele Sequences From Public Databases and Independent Reports</title>
<p>To double-check NACs we identified through NADTs, we collected antibody heavy chain V allele sequences from five public databases (IMGT/GENE-DB, <uri xlink:href="http://www.imgt.org/genedb/">http://www.imgt.org/genedb/</uri>; IgPdb, <uri xlink:href="https://cgi.cse.unsw.edu.au/~ihmmune/IgPdb/information.php">https://cgi.cse.unsw.edu.au/~ihmmune/IgPdb/information.php</uri>; VBASE2 (<xref ref-type="bibr" rid="B23">23</xref>), <uri xlink:href="http://www.vbase2.org/">http://www.vbase2.org/</uri>; Lym1K (<xref ref-type="bibr" rid="B24">24</xref>), <uri xlink:href="http://maths.nuigalway.ie/biocluster/database/">http://maths.nuigalway.ie/biocluster/database/</uri>; OGRDB (<xref ref-type="bibr" rid="B4">4</xref>), <uri xlink:href="https://ogrdb.airr-community.org/">https://ogrdb.airr-community.org/</uri>) and nine independent reports (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B25">25</xref>&#x2013;<xref ref-type="bibr" rid="B30">30</xref>) and compared them with identified NACs. Before the sequence comparison, degenerate bases or N nucleotides in collected allele sequences were substituted with &#x2018;A&#x2019;, &#x2018;C&#x2019;, &#x2018;G&#x2019;, or &#x2018;T&#x2019;, accordingly. NACs whose sequences were identical to any of the sequences from a source were considered cross-validated NACs. As the set of V allele sequences used as germline reference to identify novel alleles is not as complete as those in the later release of IMGT/GENE-DB, several NAC sequences were included in the later release of IMGT/GENE-DB and thus were also cross-validated in it. All collected V allele sequences are outlined in <xref ref-type="supplementary-material" rid="SM8">
<bold>Supplementary Table&#xa0;8</bold>
</xref>.</p>
</sec>
</sec>
<sec id="s5" sec-type="data-availability">
<title>Data Availability Statement</title>
<p>In-house sequencing data, including the paired single na&#xef;ve B cell and bulk sequencing dataset and the unpaired bulk sequencing dataset, as well as the simulated dataset, are stored in the NCBI SRA database under accession number PRJNA732986. The code used in this study, including all scripts involved in the IMPlAntS simulation pipeline and an edited script (IMPre_revised.pl), has been deposited on GitHub (<uri xlink:href="https://github.com/Xiujia-Yang/IMPlAntS">https://github.com/Xiujia-Yang/IMPlAntS</uri>).</p>
</sec>
<sec id="s6" sec-type="ethics-statement">
<title>Ethics Statement</title>
<p>The studies involving human participants were reviewed and approved by Ethics Committee of Southern Medical University. The patients/participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author Contributions</title>
<p>XJY, YZ, SC, HZ, and CL performed bioinformatics analyses on the data. QW and JG collected samples and conducted the biological experiments. XJY, XQY, and ZZ wrote the manuscript. ZZ conceived the project. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>This study was supported by the National Natural Science Foundation of China (NSFC) (31771479, 81991511 and 81991510 to ZZ), NSFC Projects of International Cooperation and Exchanges of NSFC (61661146004 to ZZ), the Local Innovative and Research Teams Project of Guangdong Pearl River Talents Program (2017BT01S131 to ZZ) and Guangdong-Hong Kong-Macao-Joint Labs Program from Guangdong Science and Technology (2019B121205005 to XQY).</p>
</sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<sec id="s11" sec-type="supplementary-material">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fimmu.2021.739179/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fimmu.2021.739179/full#supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Image_1.pdf" id="SF1" mimetype="application/pdf"/>
<supplementary-material xlink:href="Table_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
<supplementary-material xlink:href="Table_2.docx" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document"/>
<supplementary-material xlink:href="Table_3.xlsx" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"/>
<supplementary-material xlink:href="Table_4.pdf" id="SM4" mimetype="application/pdf"/>
<supplementary-material xlink:href="Table_5.pdf" id="SM5" mimetype="application/pdf"/>
<supplementary-material xlink:href="Table_6.xlsx" id="SM6" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"/>
<supplementary-material xlink:href="Table_7.xlsx" id="SM7" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"/>
<supplementary-material xlink:href="Table_8.xlsx" id="SM8" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lingwood</surname> <given-names>D</given-names>
</name>
<name>
<surname>McTamney</surname> <given-names>PM</given-names>
</name>
<name>
<surname>Yassine</surname> <given-names>HM</given-names>
</name>
<name>
<surname>Whittle</surname> <given-names>JRR</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>X</given-names>
</name>
<name>
<surname>Boyington</surname> <given-names>JC</given-names>
</name>
<etal/>
</person-group>. <article-title>Structural and Genetic Basis for Development of Broadly Neutralizing Influenza Antibodies</article-title>. <source>Nature</source> (<year>2012</year>) <volume>489</volume>:<page-range>566&#x2013;70</page-range>. doi: <pub-id pub-id-type="doi">10.1038/nature11371</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Avnir</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Watson</surname> <given-names>CT</given-names>
</name>
<name>
<surname>Glanville</surname> <given-names>J</given-names>
</name>
<name>
<surname>Peterson</surname> <given-names>EC</given-names>
</name>
<name>
<surname>Tallarico</surname> <given-names>AS</given-names>
</name>
<name>
<surname>Bennett</surname> <given-names>AS</given-names>
</name>
<etal/>
</person-group>. <article-title>IGHV1-69 Polymorphism Modulates Anti-Influenza Antibody Repertoires, Correlates With IGHV Utilization Shifts and Varies by Ethnicity</article-title>. <source>Sci Rep</source> (<year>2016</year>) <volume>6</volume>:<fpage>20842</fpage>. doi: <pub-id pub-id-type="doi">10.1038/srep20842</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Parks</surname> <given-names>T</given-names>
</name>
<name>
<surname>Mirabel</surname> <given-names>MM</given-names>
</name>
<name>
<surname>Kado</surname> <given-names>J</given-names>
</name>
<name>
<surname>Auckland</surname> <given-names>K</given-names>
</name>
<name>
<surname>Nowak</surname> <given-names>J</given-names>
</name>
<name>
<surname>Rautanen</surname> <given-names>A</given-names>
</name>
<etal/>
</person-group>. <article-title>Association Between a Common Immunoglobulin Heavy Chain Allele and Rheumatic Heart Disease Risk in Oceania</article-title>. <source>Nat Commun</source> (<year>2017</year>) <volume>8</volume>:<fpage>14946</fpage>. doi: <pub-id pub-id-type="doi">10.1038/ncomms14946</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lees</surname> <given-names>W</given-names>
</name>
<name>
<surname>Busse</surname> <given-names>CE</given-names>
</name>
<name>
<surname>Corcoran</surname> <given-names>M</given-names>
</name>
<name>
<surname>Ohlin</surname> <given-names>M</given-names>
</name>
<name>
<surname>Scheepers</surname> <given-names>C</given-names>
</name>
<name>
<surname>Matsen</surname> <given-names>FA</given-names>
</name>
<etal/>
</person-group>. <article-title>OGRDB: A Reference Database of Inferred Immune Receptor Genes</article-title>. <source>Nucleic Acids Res</source> (<year>2020</year>) <volume>48</volume>:<page-range>D964&#x2013;70</page-range>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkz822</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Corcoran</surname> <given-names>MM</given-names>
</name>
<name>
<surname>Phad</surname> <given-names>GE</given-names>
</name>
<name>
<surname>Bernat</surname> <given-names>NV</given-names>
</name>
<name>
<surname>Stahl-Hennig</surname> <given-names>C</given-names>
</name>
<name>
<surname>Sumida</surname> <given-names>N</given-names>
</name>
<name>
<surname>Persson</surname> <given-names>MAA</given-names>
</name>
<etal/>
</person-group>. <article-title>Production of Individualized V Gene Databases Reveals High Levels of Immunoglobulin Genetic Diversity</article-title>. <source>Nat Commun</source> (<year>2016</year>) <volume>7</volume>:<fpage>13642</fpage>. doi: <pub-id pub-id-type="doi">10.1038/ncomms13642</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gadala-Maria</surname> <given-names>D</given-names>
</name>
<name>
<surname>Yaari</surname> <given-names>G</given-names>
</name>
<name>
<surname>Uduman</surname> <given-names>M</given-names>
</name>
<name>
<surname>Kleinstein</surname> <given-names>SH</given-names>
</name>
</person-group>. <article-title>Automated Analysis of High-Throughput B-Cell Sequencing Data Reveals a High Frequency of Novel Immunoglobulin V Gene Segment Alleles</article-title>. <source>Proc Natl Acad Sci</source> (<year>2015</year>) <volume>112</volume>:<page-range>E862&#x2013;70</page-range>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1417683112</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ralph</surname> <given-names>DK</given-names>
</name>
<name>
<surname>Matsen</surname> <given-names>FA</given-names>
</name>
</person-group>. <article-title>Per-Sample Immunoglobulin Germline Inference From B Cell Receptor Deep Sequencing Data</article-title>. <source>PloS Comput Biol</source> (<year>2019</year>) <volume>15</volume>:<fpage>e1007133</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pcbi.1007133</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Ceredig</surname> <given-names>R</given-names>
</name>
<name>
<surname>Seoighe</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>LymAnalyzer: A Tool for Comprehensive Analysis of Next Generation Sequencing Data of T Cell Receptors and Immunoglobulins</article-title>. <source>Nucleic Acids Res</source> (<year>2016</year>) <volume>44</volume>:<fpage>e31</fpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkv1016</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>W</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>I</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>C</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>L</given-names>
</name>
<name>
<surname>Chai</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>IMPre: An Accurate and Efficient Software for Prediction of T- and B-Cell Receptor Germline Genes and Alleles From Rearranged Repertoire Data</article-title>. <source>Front Immunol</source> (<year>2016</year>) <volume>7</volume>. doi: <pub-id pub-id-type="doi">10.3389/fimmu.2016.00457</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marcou</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Mora</surname> <given-names>T</given-names>
</name>
<name>
<surname>Walczak</surname> <given-names>AM</given-names>
</name>
</person-group>. <article-title>High-Throughput Immune Repertoire Analysis With IGoR</article-title>. <source>Nat Commun</source> (<year>2018</year>) <volume>9</volume>(<issue>1</issue>):<fpage>561</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-018-02832-w</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Safonova</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Lapidus</surname> <given-names>A</given-names>
</name>
<name>
<surname>Lill</surname> <given-names>J</given-names>
</name>
</person-group>. <article-title>IgSimulator: A Versatile Immunosequencing Simulator</article-title>. <source>Bioinformatics</source> (<year>2015</year>) <volume>31</volume>:<page-range>3213&#x2013;5</page-range>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btv326</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weber</surname> <given-names>CR</given-names>
</name>
<name>
<surname>Akbar</surname> <given-names>R</given-names>
</name>
<name>
<surname>Yermanos</surname> <given-names>A</given-names>
</name>
<name>
<surname>Pavlovi&#x107;</surname> <given-names>M</given-names>
</name>
<name>
<surname>Snapkov</surname> <given-names>I</given-names>
</name>
<name>
<surname>Sandve</surname> <given-names>GK</given-names>
</name>
<etal/>
</person-group>. <article-title>immuneSIM: Tunable Multi-Feature Simulation of B- and T-Cell Receptor Repertoires for Immunoinformatics Benchmarking</article-title>. <source>Bioinformatics</source> (<year>2020</year>) <volume>36</volume>:<page-range>3594&#x2013;6</page-range>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa158</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yermanos</surname> <given-names>A</given-names>
</name>
<name>
<surname>Greiff</surname> <given-names>V</given-names>
</name>
<name>
<surname>Krautler</surname> <given-names>NJ</given-names>
</name>
<name>
<surname>Menzel</surname> <given-names>U</given-names>
</name>
<name>
<surname>Dounas</surname> <given-names>A</given-names>
</name>
<name>
<surname>Miho</surname> <given-names>E</given-names>
</name>
<etal/>
</person-group>. <article-title>Comparison of Methods for Phylogenetic B-Cell Lineage Inference Using Time-Resolved Antibody Repertoire Simulations (AbSim)</article-title>. <source>Bioinformatics</source> (<year>2017</year>) <volume>33</volume>:<page-range>3938&#x2013;46</page-range>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btx533</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>M</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>J</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zeng</surname> <given-names>H</given-names>
</name>
<etal/>
</person-group>. <article-title>Large-Scale Analysis of 2,152 Ig-Seq Datasets Reveals Key Features of B Cell Biology and the Antibody Repertoire</article-title>. <source>Cell Rep</source> (<year>2021</year>) <volume>35</volume>:<fpage>109110</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.celrep.2021.109110</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>W</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L</given-names>
</name>
<name>
<surname>Myers</surname> <given-names>JR</given-names>
</name>
<name>
<surname>Marth</surname> <given-names>GT</given-names>
</name>
</person-group>. <article-title>ART: A Next-Generation Sequencing Read Simulator</article-title>. <source>Bioinformatics</source> (<year>2012</year>) <volume>28</volume>:<page-range>593&#x2013;4</page-range>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btr708</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Watson</surname> <given-names>CT</given-names>
</name>
<name>
<surname>Steinberg</surname> <given-names>KM</given-names>
</name>
<name>
<surname>Huddleston</surname> <given-names>J</given-names>
</name>
<name>
<surname>Warren</surname> <given-names>RL</given-names>
</name>
<name>
<surname>Malig</surname> <given-names>M</given-names>
</name>
<name>
<surname>Schein</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Complete Haplotype Sequence of the Human Immunoglobulin Heavy-Chain Variable, Diversity, and Joining Genes and Characterization of Allelic and Copy-Number Variation</article-title>. <source>Am J Hum Genet</source> (<year>2013</year>) <volume>92</volume>:<page-range>530&#x2013;46</page-range>. doi: <pub-id pub-id-type="doi">10.1016/j.ajhg.2013.03.004</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Budeus</surname> <given-names>B</given-names>
</name>
<name>
<surname>Schweigle De Reynoso</surname> <given-names>S</given-names>
</name>
<name>
<surname>Przekopowitz</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hoffmann</surname> <given-names>D</given-names>
</name>
<name>
<surname>Seifert</surname> <given-names>M</given-names>
</name>
<name>
<surname>K&#xfc;ppers</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>Complexity of the Human Memory B-Cell Compartment is Determined by the Versatility of Clonal Diversification in Germinal Centers</article-title>. <source>Proc Natl Acad Sci</source> (<year>2015</year>) <volume>112</volume>:<page-range>E5281&#x2013;9</page-range>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1511270112</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ghraichy</surname> <given-names>M</given-names>
</name>
<name>
<surname>Galson</surname> <given-names>JD</given-names>
</name>
<name>
<surname>Kovaltsuk</surname> <given-names>A</given-names>
</name>
<name>
<surname>von Niederh&#xe4;usern</surname> <given-names>V</given-names>
</name>
<name>
<surname>Pachlopnik Schmid</surname> <given-names>J</given-names>
</name>
<name>
<surname>Recher</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>Maturation of the Human Immunoglobulin Heavy Chain Repertoire With Age</article-title>. <source>Front Immunol</source> (<year>2020</year>) <volume>11</volume>. doi: <pub-id pub-id-type="doi">10.3389/fimmu.2020.01734</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mikocziova</surname> <given-names>I</given-names>
</name>
<name>
<surname>Gidoni</surname> <given-names>M</given-names>
</name>
<name>
<surname>Lindeman</surname> <given-names>I</given-names>
</name>
<name>
<surname>Peres</surname> <given-names>A</given-names>
</name>
<name>
<surname>Snir</surname> <given-names>O</given-names>
</name>
<name>
<surname>Yaari</surname> <given-names>G</given-names>
</name>
<etal/>
</person-group>. <article-title>Polymorphisms in Human Immunoglobulin Heavy Chain Variable Genes and Their Upstream Regions</article-title>. <source>Nucleic Acids Res</source> (<year>2020</year>) <volume>48</volume>:<page-range>5499&#x2013;510</page-range>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkaa310</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>M</given-names>
</name>
<name>
<surname>Ou</surname> <given-names>JX</given-names>
</name>
<etal/>
</person-group>. <article-title>Tools for Fundamental Analysis Functions of TCR Repertoires: A Systematic Comparison</article-title>. <source>Brief Bioinform</source> (<year>2020</year>) <volume>21</volume>:<page-range>1706&#x2013;16</page-range>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbz092</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Boyd</surname> <given-names>SD</given-names>
</name>
<name>
<surname>Gaeta</surname> <given-names>BA</given-names>
</name>
<name>
<surname>Jackson</surname> <given-names>KJ</given-names>
</name>
<name>
<surname>Fire</surname> <given-names>AZ</given-names>
</name>
<name>
<surname>Marshall</surname> <given-names>EL</given-names>
</name>
<name>
<surname>Merker</surname> <given-names>JD</given-names>
</name>
<etal/>
</person-group>. <article-title>Individual Variation in the Germline Ig Gene Repertoire Inferred From Variable Region Gene Rearrangements</article-title>. <source>J Immunol</source> (<year>2010</year>) <volume>184</volume>:<page-range>6986&#x2013;92</page-range>. doi: <pub-id pub-id-type="doi">10.4049/jimmunol.1000445</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kitaura</surname> <given-names>K</given-names>
</name>
<name>
<surname>Yamashita</surname> <given-names>H</given-names>
</name>
<name>
<surname>Ayabe</surname> <given-names>H</given-names>
</name>
<name>
<surname>Shini</surname> <given-names>T</given-names>
</name>
<name>
<surname>Matsutani</surname> <given-names>T</given-names>
</name>
<name>
<surname>Suzuki</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>Different Somatic Hypermutation Levels Among Antibody Subclasses Disclosed by a New Next-Generation Sequencing-Based Antibody Repertoire Analysis</article-title>. <source>Front Immunol</source> (<year>2017</year>) <volume>8</volume>:<fpage>389</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fimmu.2017.00389</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23</label>
<citation citation-type="journal">
<person-group person-group-type="author">
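<name>
<surname>Retter</surname> <given-names>I</given-names>
</name>
<name>
<surname>Althaus</surname> <given-names>HH</given-names>
</name>
<name>
<surname>M&#xfc;nch</surname> <given-names>R</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname> <given-names>W</given-names>
</name>
</person-group>. <article-title>VBASE2, an Integrative V Gene Database</article-title>. <source>Nucleic Acids Res</source> (<year>2005</year>) <volume>33</volume>:<page-range>D671&#x2013;4</page-range>. doi: <pub-id pub-id-type="doi">10.1093/nar/gki088</pub-id>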
</citation>
</ref>
<ref id="B24">
<label>24</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Ceredig</surname> <given-names>R</given-names>
</name>
<name>
<surname>Seoighe</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>A Database of Human Immune Receptor Alleles Recovered From Population Sequencing Data</article-title>. <source>J Immunol</source> (<year>2017</year>) <volume>198</volume>:<page-range>2202&#x2013;10</page-range>. doi: <pub-id pub-id-type="doi">10.4049/jimmunol.1601710</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gadala-Maria</surname> <given-names>D</given-names>
</name>
<name>
<surname>Gidoni</surname> <given-names>M</given-names>
</name>
<name>
<surname>Marquez</surname> <given-names>S</given-names>
</name>
<name>
<surname>Vander Heiden</surname> <given-names>JA</given-names>
</name>
<name>
<surname>Kos</surname> <given-names>JT</given-names>
</name>
<name>
<surname>Watson</surname> <given-names>CT</given-names>
</name>
<etal/>
</person-group>. <article-title>Identification of Subject-Specific Immunoglobulin Alleles From Expressed Repertoire Sequencing Data</article-title>. <source>Front Immunol</source> (<year>2019</year>) <volume>10</volume>:<fpage>129</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fimmu.2019.00129</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gidoni</surname> <given-names>M</given-names>
</name>
<name>
<surname>Snir</surname> <given-names>O</given-names>
</name>
<name>
<surname>Peres</surname> <given-names>A</given-names>
</name>
<name>
<surname>Polak</surname> <given-names>P</given-names>
</name>
<name>
<surname>Lindeman</surname> <given-names>I</given-names>
</name>
<name>
<surname>Mikocziova</surname> <given-names>I</given-names>
</name>
<etal/>
</person-group>. <article-title>Mosaic Deletion Patterns of the Human Antibody Heavy Chain Gene Locus Shown by Bayesian Haplotyping</article-title>. <source>Nat Commun</source> (<year>2019</year>) <volume>10</volume>(<issue>1</issue>):<fpage>628</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-019-08489-3</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Th&#xf6;rnqvist</surname> <given-names>L</given-names>
</name>
<name>
<surname>Ohlin</surname> <given-names>M</given-names>
</name>
</person-group>. <article-title>Critical Steps for Computational Inference of the 3&#x2032;-End of Novel Alleles of Immunoglobulin Heavy Chain Variable Genes - Illustrated by an Allele of IGHV3-7</article-title>. <source>Mol Immunol</source> (<year>2018</year>) <volume>103</volume>:<fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.molimm.2018.08.018</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>V&#xe1;zquez Bernat</surname> <given-names>N</given-names>
</name>
<name>
<surname>Corcoran</surname> <given-names>M</given-names>
</name>
<name>
<surname>Hardt</surname> <given-names>U</given-names>
</name>
<name>
<surname>Kaduk</surname> <given-names>M</given-names>
</name>
<name>
<surname>Phad</surname> <given-names>GE</given-names>
</name>
<name>
<surname>Martin</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>High-Quality Library Preparation for NGS-Based Immunoglobulin Germline Gene Inference and Repertoire Expression Analysis</article-title>. <source>Front Immunol</source> (<year>2019</year>) <volume>10</volume>:<fpage>660</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fimmu.2019.00660</pub-id>
</citation>
</ref>
<ref id="B29">
<label>29</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Jackson</surname> <given-names>KJ</given-names>
</name>
<name>
<surname>G&#xe4;eta</surname> <given-names>B</given-names>
</name>
<name>
<surname>Pomat</surname> <given-names>W</given-names>
</name>
<name>
<surname>Siba</surname> <given-names>P</given-names>
</name>
<name>
<surname>Sewell</surname> <given-names>WA</given-names>
</name>
<etal/>
</person-group>. <article-title>Genomic Screening by 454 Pyrosequencing Identifies a New Human IGHV Gene and Sixteen Other New IGHV Allelic Variants</article-title>. <source>Immunogenetics</source> (<year>2011</year>) <volume>63</volume>:<page-range>259&#x2013;65</page-range>. doi: <pub-id pub-id-type="doi">10.1007/s00251-010-0510-8</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wendel</surname> <given-names>BS</given-names>
</name>
<name>
<surname>He</surname> <given-names>C</given-names>
</name>
<name>
<surname>Crompton</surname> <given-names>PD</given-names>
</name>
<name>
<surname>Pierce</surname> <given-names>SK</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>N</given-names>
</name>
</person-group>. <article-title>A Streamlined Approach to Antibody Novel Germline Allele Prediction and Validation</article-title>. <source>Front Immunol</source> (<year>2017</year>) <volume>8</volume>:<fpage>1072</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fimmu.2017.01072</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>