<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="review-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Electron.</journal-id>
<journal-title>Frontiers in Electronics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Electron.</abbrev-journal-title>
<issn pub-type="epub">2673-5857</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1469802</article-id>
<article-id pub-id-type="doi">10.3389/felec.2025.1469802</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Electronics</subject>
<subj-group>
<subject>Review</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Quantized convolutional neural networks: a hardware perspective</article-title>
<alt-title alt-title-type="left-running-head">Zhang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/felec.2025.1469802">10.3389/felec.2025.1469802</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Li</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2797554/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>Olga</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1239917/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fouda</surname>
<given-names>Mohammed E.</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/990346/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Eltawil</surname>
<given-names>Ahmed M.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1013568/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Salama</surname>
<given-names>Khaled Nabil</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Computer, Electrical and Mathematical Science and Engineering Division</institution>, <institution>King Abdullah University of Science and Technology</institution>, <addr-line>Thuwal</addr-line>, <country>Saudi Arabia</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Rain Neuromorphics</institution>, <institution>San Francisco Inc. CA</institution>, <addr-line>San Francisco</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1344853/overview">Yao Chen</ext-link>, National University of Singapore, Singapore</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2402988/overview">Dhruva Ghai</ext-link>, Oriental University, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1648315/overview">Peter A. Beerel</ext-link>, University of Southern California, United States</p>
<p>You (Dorothy) Qiu, University of Southern California, United States, in collaboration with reviewer PB</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Khaled Nabil Salama, <email>khaled.salama@kaust.edu.sa</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>07</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>6</volume>
<elocation-id>1469802</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>07</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>06</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Zhang, Krestinskaya, Fouda, Eltawil and Salama.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Zhang, Krestinskaya, Fouda, Eltawil and Salama</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>With the rapid development of machine learning, Deep Neural Networks (DNNs) exhibit superior performance in solving complex problems like computer vision and natural language processing compared with classic machine learning techniques. On the other hand, the rise of the Internet of Things (IoT) and edge computing has created a demand for executing those complex tasks on the corresponding devices. As the name suggests, deep neural networks are sophisticated models with complex structures and millions of parameters, which overwhelm the capacity of IoT and edge devices. To facilitate the deployment, quantization, as one of the most promising methods, is proposed to alleviate the challenge in terms of memory usage and computation complexity by quantizing both the parameters and data flow in the DNN model into formats with shorter bit-width. Correspondingly, dedicated hardware accelerators are developed to further boost the execution efficiency of DNN models. In this work, we focus on Convolutional Neural Networks (CNNs) as an example of DNNs and conduct a comprehensive survey on various quantization and quantized training methods. We also discuss various hardware accelerator designs for quantized CNNs (QCNNs). Based on the review of both algorithm and hardware design, we provide general software-hardware co-design considerations. Based on the analysis, we discuss open challenges and future research directions for both algorithms and corresponding hardware designs of quantized neural networks (QNNs).</p>
</abstract>
<kwd-group>
<kwd>convolutional neural networks</kwd>
<kwd>quantization</kwd>
<kwd>hardware</kwd>
<kwd>in-memory computing (IMC)</kwd>
<kwd>FPGA</kwd>
</kwd-group>
<contract-num rid="cn001">URF/1/4704-01-01</contract-num>
<contract-sponsor id="cn001">King Abdullah University of Science and Technology<named-content content-type="fundref-id">10.13039/501100004052</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Integrated Circuits and VLSI</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Convolutional Neural Network (CNN) is one of the fundamental building blocks in modern computer vision systems proven to be effective in image classification, video processing, and object detection. The state-of-the-art CNNs are capable of performing very complex image classification tasks with an accuracy comparable to or even outperforming a human (<xref ref-type="bibr" rid="B62">Krizhevsky et al., 2017</xref>; <xref ref-type="bibr" rid="B111">Simonyan and Zisserman, 2014</xref>; <xref ref-type="bibr" rid="B40">He et al., 2016</xref>; <xref ref-type="bibr" rid="B92">Pham et al., 2021</xref>). However, the size of a state-of-the-art CNN can reach hundreds of megabytes preventing it from being deployed on edge or IoT devices for vision-related applications. Moreover, a 32-bit floating-point format is used for data representation in state-of-the-art CNN models. This leads to the challenges of deployment of these models to edge/IoT devices with restricted memory bandwidth, throughput, computation resources, and battery life, especially for real-time applications. Hence, there is an increasing demand for compact efficient CNN hardware maintaining acceptable performance.</p>
<p>Due to redundant parameters of state-of-the-art CNN models (<xref ref-type="bibr" rid="B37">Han et al., 2015</xref>), pruning and quantization techniques can be used to reduce the size or number of CNN weights (<xref ref-type="bibr" rid="B49">Janowsky, 1989</xref>; <xref ref-type="bibr" rid="B23">Fiesler et al., 1990</xref>; <xref ref-type="bibr" rid="B17">Courbariaux et al., 2015</xref>). The conceptual comparison between quantization and pruning is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. Quantization is a compression technique, that reduces the number of bits used for computation leading to CNN size reduction and hardware-friendly operations, e.g., integer arithmetic or bit-wise operations, rather than full precision floating-point operations (<xref ref-type="bibr" rid="B86">Neill, 2020</xref>; <xref ref-type="bibr" rid="B34">Guo, 2018</xref>; <xref ref-type="bibr" rid="B93">Qin et al., 2020</xref>; <xref ref-type="bibr" rid="B63">Kulkarni et al., 2022</xref>; <xref ref-type="bibr" rid="B97">Rokh et al., 2022</xref>; <xref ref-type="bibr" rid="B27">Gholami et al., 2021</xref>). Both pruning and quantization are important techniques for reducing model size and computational complexity. However, in this work, we specifically focus on quantization techniques and their impact on hardware implementations.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Comparison between the original, quantized and pruned models. <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="bold-italic">w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the weight and activation tensors, respectively.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g001.tif">
<alt-text content-type="machine-generated">Diagram of neural network models. On the left, the original model with full precision neurons and synapses. In the center, the quantized model displays pentagon-shaped neurons and dotted synapses. On the right, the pruned model has fewer neurons and synapses with some inactive connections.</alt-text>
</graphic>
</fig>
<p>In addition to quantization and pruning, which facilitate the hardware implementation of CNNs, dedicated hardware accelerator designs can further improve energy and computation efficiency. The major motivation to develop specific neural network hardware comes from the memory bottleneck of the traditional von Neumann architectures (CPUs/GPUs), especially noticeable when deploying memory-dense applications, e.g., CNNs with millions of parameters. Specific hardware accelerators, e.g., FPGA-based (<xref ref-type="bibr" rid="B125">Umuroglu et al., 2017</xref>; <xref ref-type="bibr" rid="B142">Zhang et al., 2021</xref>), ASIC-based (<xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>; <xref ref-type="bibr" rid="B7">Biswas and Chandrakasan, 2018</xref>) or In-memory computing (IMC) based (<xref ref-type="bibr" rid="B119">Sun et al., 2018b</xref>; <xref ref-type="bibr" rid="B1">Ankit et al., 2019</xref>) designs, help to address von Neumann bottleneck issues and deploy CNNs on low-power devices. Therefore, in this work, we try to provide a comprehensive review of specific hardware designs of quantized CNNs (QCNNs) and connect software-based QCNN methodologies with hardware deployment.</p>
<p>While previous studies have primarily reviewed neural network compression and quantization techniques from an algorithmic perspective (<xref ref-type="bibr" rid="B86">Neill, 2020</xref>; <xref ref-type="bibr" rid="B34">Guo, 2018</xref>; <xref ref-type="bibr" rid="B93">Qin et al., 2020</xref>; <xref ref-type="bibr" rid="B63">Kulkarni et al., 2022</xref>; <xref ref-type="bibr" rid="B97">Rokh et al., 2022</xref>; <xref ref-type="bibr" rid="B27">Gholami et al., 2021</xref>), they pay little attention to hardware implementations and often overlook the critical interplay between these algorithms and their hardware implementations. In contrast, our work bridges this gap by surveying both quantization algorithms and a wide range of QCNN-specific acceleration hardware. Furthermore, we offer insights into the challenges and open problems in QCNN hardware accelerator design, along with general guidelines for effective software-hardware co-design. Our main contributions are as follows:<list list-type="simple">
<list-item>
<p>&#x2022; Integrated Review of Algorithms and Hardware: We survey various quantization techniques for CNNs alongside a detailed review of dedicated hardware accelerators&#x2014;such as ASIC- and FPGA-based designs&#x2014;that implement these methods. This dual perspective highlights how algorithmic choices impact hardware performance and <italic>vice versa</italic>.</p>
</list-item>
<list-item>
<p>&#x2022; Guidelines for Software-Hardware Co-Design: We discuss practical strategies for co-designing quantization algorithms and hardware architectures. By outlining design trade-offs and optimization strategies, we provide a roadmap for developing CNN systems that maintain high performance under strict energy and resource constraints.</p>
</list-item>
</list>
</p>
<p>Quantization algorithms, QCNN acceleration hardware design, and even network structure design are all rapidly evolving research topics. Hence, it is challenging to provide an exhaustive survey of all relevant literature. The focus of this work is specifically narrowed to hardware accelerators for QCNNs that are tailored to maximize energy efficiency (e.g., ASIC- and FPGA-based designs), as opposed to those designed with an emphasis on scalability and peak performance. In alignment with this focus, the review of quantization algorithms and CNN models concentrates mainly on those that are prevalently adopted by energy-efficient QCNN hardware accelerator designs. Given the defined scope of this paper, it is noteworthy that some cutting-edge CNN models (e.g., Vision Transformer &#x2b; CNN (<xref ref-type="bibr" rid="B31">Guo et al., 2022</xref>), RegNet (<xref ref-type="bibr" rid="B135">Xu et al., 2022</xref>), DPN (<xref ref-type="bibr" rid="B12">Chen et al., 2017</xref>), Res2Net (<xref ref-type="bibr" rid="B25">Gao et al., 2019</xref>)), quantization techniques (e.g., codebook quantization, gradient quantization), and QCNN hardware platforms (e.g., GPUs, CPUs) will not be reviewed in extensive detail, as they are not common for energy-efficient on-edge processing. An overview illustrating the scope of this paper is provided in <xref ref-type="fig" rid="F2">Figure 2</xref>. Nonetheless, we acknowledge and value the significant contributions of researchers in both fields whose work is not covered here and extend our apologies to our readers for the inevitable limitations in coverage.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Overview of the scope of this work.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating the scope of this work. The blue triangle identifies CNN architectures like RegNet, ViT+CNN, ResNeXt, and DPN. The pink triangle describes quantization algorithms such as CBQ, MPQ, and Uni Q/Log Q. The green triangle details hardware like FPGA, ASIC, CPU, MCU, and GPU. The intersection of all three triangles forms the scope of this work. Additional text explains acronyms: ViT as Vision Transformer, DPN as Dual Path Network, and IMC as in-memory computing architecture.</alt-text>
</graphic>
</fig>
<p>The remaining part of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> introduces the basics of conventional CNNs and QCNNs. <xref ref-type="sec" rid="s3">Section 3</xref> presents different quantization methods commonly used to quantize CNNs. In <xref ref-type="sec" rid="s4">Section 4</xref>, different methods to generate a QNN are illustrated and benchmarked. In <xref ref-type="sec" rid="s5">Section 5</xref>, various hardware accelerator designs are reviewed. <xref ref-type="sec" rid="s6">Section 6</xref> presents the future outlook on algorithms leading to more efficient QCNNs and corresponding hardware accelerator implementations. <xref ref-type="sec" rid="s7">Section 7</xref> concludes the paper.</p>
</sec>
<sec id="s2">
<title>2 Convolutional neural network</title>
<sec id="s2-1">
<title>2.1 Full precision convolutional neural network</title>
<sec id="s2-1-1">
<title>2.1.1 Convolution layer and fully connected layer</title>
<p>Inspired by the hierarchy model of the visual nervous system, Fukushima proposed the first neural network similar to modern-day convolution layers (<xref ref-type="bibr" rid="B24">Fukushima, 1980</xref>). LeCun et al. introduced the &#x201c;LeNet-5&#x201d; CNN in (<xref ref-type="bibr" rid="B64">LeCun et al., 1989</xref>; <xref ref-type="bibr" rid="B65">Lecun et al., 1998</xref>) for handwritten digit recognition systems, which is referred to as the first modern CNN trained with gradient-based backpropagation including all the essential building blocks in modern CNNs (convolution layers, pooling layers, and fully connected layers). Convolution layers are used to extract spatial features due to their spatial invariance.</p>
<p>Following the convolution layers, the fully connected layers are used to classify the abstracted features from the convolution layers and generate the final classification output.</p>
</sec>
<sec id="s2-1-2">
<title>2.1.2 Other auxiliary layers</title>
<p>In modern CNN architectures, additional auxiliary layers, including pooling, normalization, and dropout layers, are used along with the main convolution and fully connected layers. The pooling layers reduce the sizes of the convolution layers by sub-sampling the output feature maps with max/average operations.</p>
<p>As CNNs become deeper and more complex, they also become harder to train. To solve this problem, Ioffe and Szegedy proposed the batch normalization technique (<xref ref-type="bibr" rid="B46">Ioffe and Szegedy, 2015</xref>). Batch normalization normalizes the data over a mini-batch during training as <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">batch</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the data before and after batch normalization respectively, <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">batch</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the batch size, <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the mean and standard deviation of the inputs over the mini-batch. Two optional trainable parameters <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are called affine parameters and involved in the final output computation enabling an affine transformation on the normalized data, restoring the representation capability of the neural network.</p>
<p>The other challenge in deep CNN training is overfitting. Srivastava et al. proposed the dropout technique in (<xref ref-type="bibr" rid="B115">Srivastava et al., 2014</xref>). By inserting dropout layers after convolution and fully connected layers, a certain portion of neurons is randomly chosen and dropped out during training, equivalent to training an ensemble of networks with different connections.</p>
</sec>
</sec>
<sec id="s2-2">
<title>2.2 Typical convolutional neural networks</title>
<p>The design of CNNs has been widely explored recently. However, due to the limitation of IoT and edge devices, to the best of our knowledge, the latest CNN models like the RegNet family (<xref ref-type="bibr" rid="B135">Xu et al., 2022</xref>) and CNN-transformer hybrid architectures (<xref ref-type="bibr" rid="B31">Guo et al., 2022</xref>) are not implemented on dedicated hardware accelerators. Instead, only limited types of CNNs are referred to in the study of QCNNs. In this section, the commonly used CNNs in the study of hardware-related and mobile device-related QCNNs are introduced rather than the state-of-the-art CNN models.</p>
<sec id="s2-2-1">
<title>2.2.1 LeNet-5</title>
<p>LeNet-5 (<xref ref-type="bibr" rid="B64">LeCun et al., 1989</xref>) is one of the earliest modern CNN architectures. It is designed for handwritten digit recognition with the Modified National Institute of Standards and Technology (MNIST) dataset. The structure of LeNet-5 is shown in <xref ref-type="fig" rid="F3">Figure 3a</xref>. It is often used in studies as an example of a lightweight CNN.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Network structure of <bold>(a)</bold> LeNet-5, <bold>(b)</bold> VGG-small, and <bold>(c)</bold> ResNet-18.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g003.tif">
<alt-text content-type="machine-generated">Diagram comparing three neural network architectures: (a) LeNet5, featuring convolution, pooling, and softmax outputs; (b) VGG-small, with multiple convolution and pooling layers; (c) ResNet-18, showing convolution, pooling, and residual blocks. Color-coded outputs are labeled for clarity: yellow for convolution, red for pooling, purple for linear, and dark purple for softmax.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-2-2">
<title>2.2.2 VGG family</title>
<p>Supported by GPU acceleration, CNNs have become deeper. One such network is AlexNet (<xref ref-type="bibr" rid="B62">Krizhevsky et al., 2017</xref>), which can support the classification of large datasets, e.g., ImageNet. AlexNet adopts a rectified-linear unit (ReLU) as the non-linear activation function. The other example of deep CNN is VGGNet, which achieves high accuracy using small convolution kernel sizes. Such deep networks contain up to a hundred million parameters overwhelming the edge/IoT-based implementations of these networks due to the limited memory and resources. Therefore, a simplified version of VGGNet, VGG-small (VGG-9), is designed for a smaller dataset (CIFAR-10) (<xref ref-type="bibr" rid="B17">Courbariaux et al., 2015</xref>). The structure of the VGG-small is shown in <xref ref-type="fig" rid="F3">Figure 3b</xref>.</p>
</sec>
<sec id="s2-2-3">
<title>2.2.3 ResNet family</title>
<p>The challenges of deep neural networks with simply cascaded layers are vanishing or exploding gradient issues during the training. To address this issue, the ResNet CNN model was proposed, which is divided into small blocks (<xref ref-type="bibr" rid="B40">He et al., 2016</xref>). Each block consists of a few (usually 2 or 3) convolution layers, with an identity bypass connecting the input and output of the block to alleviate vanishing and exploding gradient problems. This allows the implementation of narrower but deeper networks, showing better performance than wider, shallower networks. Due to the modular design, the ResNet architecture can be scaled to up to a thousand layers. <xref ref-type="fig" rid="F3">Figure 3c</xref> shows the network structure of a ResNet-18 modified for the CIFAR-10 dataset.</p>
</sec>
<sec id="s2-2-4">
<title>2.2.4 CNNs for mobile devices and TinyML</title>
<p>To achieve better power efficiency and lower inference latency on modern mobile devices, the complicated CNNs need to be redesigned. MobileNet (<xref ref-type="bibr" rid="B43">Howard et al., 2017</xref>) proposes a network design with depth-wise separable convolution layers (<xref ref-type="bibr" rid="B110">Sifre and Mallat, 2014</xref>). Compared with traditional convolution layers, the depth-wise separable convolution layers reduce both the number of computations and the number of parameters by <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> times. Additionally, most computations in the depth-wise separable convolution layers are contributed by the point-wise convolution with a kernel size of <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, which can be executed by a highly optimized general matrix multiplication (GEMM) function. In contrast, for other convolution layers, the feature map needs to be rearranged in memory before it can be processed by GEMM functions. To improve the performance further, the inverted bottleneck block, which requires less computation than the traditional bottleneck block, was introduced in MobileNetV2 (<xref ref-type="bibr" rid="B103">Sandler et al., 2018</xref>).</p>
<p>To scale up a CNN further, compound scaling factors are introduced in EfficientNet (<xref ref-type="bibr" rid="B120">Tan and Le, 2019</xref>). They scale up the width, depth, and resolution simultaneously leading to higher accuracy when more floating point operations (FLOPs) are allowed by the hardware setup. To generate a baseline CNN model with a certain target number of FLOPs, Neural Architecture Search (NAS) can be used. Then, a grid search can be performed with the generated baseline model to acquire the compound scaling factors for this model.</p>
<p>To further extend AI towards the edge, the concept of TinyML was proposed (<xref ref-type="bibr" rid="B131">Warden and Situnayake, 2019</xref>). TinyML includes hardware, software, and algorithms that enable on-device sensor data analysis on the edge. It also includes network structure design and optimization targeting low-power edge devices like microcontroller units (MCUs). Even though MobileNet and EfficientNet are optimized towards compactness, they still overwhelm the RAM size of typical MCUs. To overcome this memory bottleneck challenge, the MCUNet framework is proposed in (<xref ref-type="bibr" rid="B76">Lin J. et al., 2020</xref>). MCUNet adopts a two-stage NAS to generate an optimized model towards throughput and accuracy while meeting the memory constraint. The two-stage NAS is co-designed with a memory-efficient inference library supporting code generator-based compilation, model-adaptive memory scheduling, computation kernel specialization, and in-place depth-wise convolution. MCUNetV2 (<xref ref-type="bibr" rid="B75">Lin et al., 2021</xref>) introduces patch-by-patch inference scheduling in the inference library and receptive field redistribution in NAS to further reduce peak memory consumption caused by the imbalanced memory distribution in CNNs.</p>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Quantized convolutional neural network</title>
<p>To enable efficient deployment of CNNs on edge or IoT devices, CNNs need to be compressed. Various techniques have been developed for neural network compression, including pruning (<xref ref-type="bibr" rid="B49">Janowsky, 1989</xref>; <xref ref-type="bibr" rid="B37">Han et al., 2015</xref>; <xref ref-type="bibr" rid="B33">Guo et al., 2020</xref>), low-rank tensor approximation (<xref ref-type="bibr" rid="B18">Denton et al., 2014</xref>; <xref ref-type="bibr" rid="B48">Jaderberg et al., 2014</xref>), and quantization (<xref ref-type="bibr" rid="B122">Teng et al., 2019</xref>; <xref ref-type="bibr" rid="B30">Gong et al., 2014</xref>; <xref ref-type="bibr" rid="B17">Courbariaux et al., 2015</xref>; <xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>; <xref ref-type="bibr" rid="B118">Sun et al., 2020</xref>). In this work, we focus on quantization methods, which can be independently applied with other compression methods.</p>
<p>We summarize the achieved accuracies of various full-precision CNNs targeting different datasets in <xref ref-type="fig" rid="F4">Figure 4</xref>. Due to the small memory size of MCUs and FPGAs, full-precision CNNs are too large to be deployed on such devices. To reduce the memory footprint, the parameters in the neural network can be quantized into a format with a shorter bit-width compared with the original 32-bit floating-point format. In (<xref ref-type="bibr" rid="B35">Gupta et al., 2015</xref>; <xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>; <xref ref-type="bibr" rid="B96">Rastegari et al., 2016</xref>), the authors quantize the weights of CNNs into different bit-width formats, achieving up to a 32-fold size reduction at the cost of mild accuracy degradation. To mitigate the issues of limited computation capability and power in IoT/edge devices, activation quantization has been proposed, leading to more efficient computation kernels. For instance, when using binary quantization, multiply-accumulate (MAC) operations in a traditional full-precision neural network can be replaced by an energy-efficient combination of XNOR and bit-counting operations (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>; <xref ref-type="bibr" rid="B96">Rastegari et al., 2016</xref>; <xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>). Similar to weight quantization, the accuracy degradation caused by activation quantization is acceptable.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Comparison of convolutional neural network (CNN) model sizes against the memory capacities of various hardware platforms. For FPGA, memory capacity corresponds to on-chip block random access memory (BRAM), while for the other platforms it corresponds to standard random access memory (RAM).</p>
</caption>
<graphic xlink:href="felec-06-1469802-g004.tif">
<alt-text content-type="machine-generated">Scatter plot showing the accuracy versus size (in bits) for various neural networks across datasets: ImageNet (purple triangles), CIFAR-10 (green squares), and MNIST (brown circles). Accuracy ranges from 70% to 100%, with sizes from 1 million to 10 billion bits. Labels like ResNet, VGG, EfficientNet, MCDNN, and LeNet are plotted. Images of FPGA, MCU, Mobile Device, and GPU appear above to indicate different hardware types related to memory capacity.</alt-text>
</graphic>
</fig>
<p>A QCNN commonly refers to a CNN with a quantized format of parameters (weights) and data flow (activations) to achieve a smaller network size and more efficient computation. Weights in a CNN account for the majority of the size of the neural network: the number of weights in the convolution and fully connected layers is in the order of <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, while the rest of the parameters (biases, affine parameters) are in the order of <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, with <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> being the number of neurons in the layer. Hence, most of the published works focus on quantizing the weights of convolution layers and fully connected layers.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Quantization methods</title>
<p>Quantization maps a continuous interval into a set of discrete values. There are various mapping algorithms, which can be categorized into two subsets: deterministic quantization and stochastic quantization. In deterministic quantization, every original value maps to exactly one fixed quantized value, whereas a stochastically quantized value is sampled from a certain probability distribution parameterized by the original value. Sampling from a distribution requires more computation than a deterministic calculation. Additionally, gradient estimation is difficult with stochastic quantization, leading to training complexity (<xref ref-type="bibr" rid="B4">Bengio, 2013</xref>). Hence, we focus on deterministic quantization algorithms.</p>
<sec id="s3-1">
<title>3.1 Uniform quantization</title>
<p>Uniform quantization is the most commonly used quantization algorithm, which divides an interval into equal sub-intervals, with all the data inside a sub-interval represented by a single value. The sub-intervals thus correspond to a set of linearly and uniformly distributed discrete values. The uniformly quantized value <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is represented as (<xref ref-type="disp-formula" rid="e1">Equation 1</xref>):<disp-formula id="e1">
<mml:math id="m13">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>&#x2309;</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
<mml:mi>x</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mi>x</mml:mi>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
<mml:mi>x</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
<mml:mo>&#x2309;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a round function to the nearest integer, <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf17">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>/<inline-formula id="inf18">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are full-precision value, bias, scale factors and boundary values respectively. The simplest case of uniform quantization is binary quantization (with <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) (<xref ref-type="disp-formula" rid="e2">Equation 2</xref>):<disp-formula id="e2">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="2em"/>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2264;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="2em"/>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>Another widely used uniform quantization is n-bit integer quantization (<xref ref-type="disp-formula" rid="e3">Equation 3</xref>):<disp-formula id="e3">
<mml:math id="m22">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x2309;</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>In practice, the full-precision parameter <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is passed through a bounded non-linear function before quantization. For instance, in (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>), a hard clip function is applied to the parameter to be quantized before actual quantization. In (<xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>), parameters are passed through a hyperbolic tangent function limiting the range of the parameters into the quantization range <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Therefore, the ability to update all the parameters is retained, even if the parameters are outside the quantization range. However, it is not always optimal to update all the parameters; this topic is studied and discussed in detail in (<xref ref-type="bibr" rid="B102">Sakr et al., 2022</xref>).</p>
<p>Generally, uniform quantization is simple, as all operations on the quantized parameters are either integer or bit-wise operations, suitable to be executed by arithmetic logic units (ALUs) in von Neumann systems, gate-level circuits in ASICs, and lookup tables (LUTs) in FPGA. However, uniform quantization inherently exhibits a poor dynamic range. With <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-bit uniform quantization, the ratio <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> between the largest positive value and the smallest positive value can be expressed as <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. To mitigate this drawback, a full precision scale factor is attached to the quantized values (<xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>; <xref ref-type="bibr" rid="B96">Rastegari et al., 2016</xref>), at the cost of computation complexity. For instance, full precision scale factors are multiplied with corresponding quantized values and summed up during convolution operations, which requires full precision floating point arithmetic support on the hardware. Besides, other quantization methods have better dynamic range (discussed in the following section).</p>
</sec>
<sec id="s3-2">
<title>3.2 Low-precision floating point format and logarithmic quantization</title>
<p>One of the straightforward approaches to reduce data precision (reduce bit-width) while maintaining dynamic range is to truncate the commonly used IEEE-754 single-precision floating-point format to half-precision format. Examples of low-precision floating point formats are presented in <xref ref-type="table" rid="T1">Table 1</xref>. To further reduce the bit-width and computation complexity while maximizing the dynamic range, a 4-bit radix-4 logarithmic quantization algorithm (<xref ref-type="bibr" rid="B118">Sun et al., 2020</xref>) can be used in the format of <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mtext>sign,&#x2009;exponent</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e4">
<mml:math id="m29">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2264;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2264;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3c;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2264;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mspace width="1em"/>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
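<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>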
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>Derived from <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, the ratio between the largest and smallest magnitude can be expressed as <inline-formula id="inf26">
<mml:math id="m30">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Compared with 4-bit uniform quantization <inline-formula id="inf27">
<mml:math id="m31">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, the logarithmic quantization has a much larger dynamic range <inline-formula id="inf28">
<mml:math id="m32">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This makes it suitable for the quantization of data with a large dynamic range, e.g., gradients. Neural network training with the 4-bit radix-4 logarithmic quantization can achieve results comparable to those obtained with a 32-bit floating-point format (<xref ref-type="bibr" rid="B118">Sun et al., 2020</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Examples of different floating point formats and a logarithmic quantization format (&#x2a; Log4 is a radix-4 format).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Format</th>
<th align="center">Bit-width</th>
<th align="center">Bit allocation (sign, exponent, fraction)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">IEEE-754 Half precision</td>
<td align="center">16-bit</td>
<td align="center">(1, 5, 10)</td>
</tr>
<tr>
<td align="center">FP8 (<xref ref-type="bibr" rid="B129">Wang et al., 2018</xref>)</td>
<td align="center">8-bit</td>
<td align="center">(1, 5, 2)</td>
</tr>
<tr>
<td align="center">Log4&#x2a; (<xref ref-type="bibr" rid="B118">Sun et al., 2020</xref>)</td>
<td align="center">4-bit</td>
<td align="center">(1, 3, 0)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From the hardware point of view, the multiplication operation of logarithmically quantized data can be simply implemented with shift operations. However, the sum operation puts high requirements on accumulators due to the high dynamic range. Therefore, to support logarithmic quantization, the accumulator usually has a large output bit-width or supports a floating point sum operation.</p>
</sec>
<sec id="s3-3">
<title>3.3 Codebook quantization</title>
<p>The parameters of a well-trained neural network follow a certain distribution, which is neither linear nor logarithmic. To represent these parameters, the quantized value set and corresponding mapping rules can be customized, resulting in a codebook-style quantization. In (<xref ref-type="bibr" rid="B30">Gong et al., 2014</xref>), the quantized value set is found by k-means clustering (<xref ref-type="disp-formula" rid="e5">Equation 5</xref>):<disp-formula id="e5">
<mml:math id="m33">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mspace width="2em"/>
<mml:mi mathvariant="bold">w</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold">c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>Then, each value <inline-formula id="inf29">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be assigned an index to form a codebook. The index requires <inline-formula id="inf30">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>log</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> bits to be represented. With the quantized value set, the parameter can be quantized and represented using indexes, with <inline-formula id="inf31">
<mml:math id="m36">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>B</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> being the inverse codebook mapping (value to index) (<xref ref-type="disp-formula" rid="e6">Equation 6</xref>):<disp-formula id="e6">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>B</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>arg</mml:mi>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>A similar method is adopted and implemented on hardware in (<xref ref-type="bibr" rid="B68">Lee et al., 2017</xref>). In (<xref ref-type="bibr" rid="B37">Han et al., 2015</xref>), a network is quantized by codebook quantization using <xref ref-type="disp-formula" rid="e6">Equation 6</xref> and finetuned. In (<xref ref-type="bibr" rid="B122">Teng et al., 2019</xref>), codebook quantization is applied, where the most frequent values form a quantization value set instead of the cluster centroids. The codebook is updated after every epoch during training. Uniform quantization and logarithmic quantization can be treated as special cases of codebook quantization, with the quantized values following a uniform or logarithmic distribution.</p>
<p>The hardware requirements to implement codebook quantization depend on the values in the codebook. For instance, if these values are floating-point values, the hardware should support floating-point operations. Compared to uniform and logarithmic methods, codebook quantization brings additional overhead of reading the codebook.</p>
</sec>
<sec id="s3-4">
<title>3.4 Mixed-precision quantization</title>
<p>Different parts of the neural network tend to exhibit different levels of abstraction and expression ability (<xref ref-type="bibr" rid="B16">Chu et al., 2021</xref>). Hence, different quantization parameters can be chosen for different parts of the neural network to ensure optimum model size without accuracy degradation, which is named mixed-precision quantization (MPQ) (<xref ref-type="bibr" rid="B95">Rakka et al., 2022</xref>). MPQ can provide a full-precision accuracy while maintaining the same model size as extremely low bit-width quantization (<xref ref-type="bibr" rid="B88">Nguyen et al., 2020</xref>; <xref ref-type="bibr" rid="B53">Kim et al., 2020</xref>). In MPQ, quantization parameters (bit-width, scale factors, quantization boundaries, etc.) for different parts of the neural network can be determined by some specification/metrics of the corresponding part (<xref ref-type="bibr" rid="B79">Ma et al., 2021</xref>; <xref ref-type="bibr" rid="B139">Yao et al., 2021</xref>), by differentiable optimization (<xref ref-type="bibr" rid="B73">Li et al., 2020</xref>; <xref ref-type="bibr" rid="B36">Habi et al., 2020</xref>), or by reinforcement learning (<xref ref-type="bibr" rid="B128">Wang et al., 2020</xref>; <xref ref-type="bibr" rid="B22">Elthakeb et al., 2019</xref>).</p>
<p>MPQ implementation requires additional hardware support and creates hardware overhead to handle the heterogeneity brought by MPQ (<xref ref-type="bibr" rid="B88">Nguyen et al., 2020</xref>; <xref ref-type="bibr" rid="B133">Wu et al., 2021</xref>). Any quantization method, e.g., uniform, logarithmic or codebook, can be used to create a mixed-precision model.</p>
</sec>
</sec>
<sec id="s4">
<title>4 How to generate a quantized neural network?</title>
<p>There are two main approaches to generate a quantized neural network (QNN) model: (1) quantizing a well-trained full-precision model, known as Post-Training Quantization (PTQ), and (2) training or fine-tuning the model with quantization effects incorporated, referred to as Quantization-Aware Training (QAT). PTQ is typically faster and more efficient in terms of runtime, energy consumption, and computation cost because it uses a small calibration dataset without modifying the model weights. However, PTQ often results in lower performance compared to QAT (<xref ref-type="bibr" rid="B50">Jiang et al., 2022</xref>; <xref ref-type="bibr" rid="B27">Gholami et al., 2021</xref>; <xref ref-type="bibr" rid="B97">Rokh et al., 2022</xref>). As discussed in subsequent sections, current edge-oriented hardware accelerators do not fully support neural network training. Consequently, in edge-oriented vision applications (where QCNNs are commonly deployed), models are usually prepared offsite&#x2014;on servers where runtime, energy consumption, and computational cost are less critical&#x2014;making the extra overhead of QAT acceptable in exchange for improved accuracy (<xref ref-type="bibr" rid="B82">Menghani, 2023</xref>). Rather than being applied to edge-oriented vision tasks, PTQ is frequently employed in domains like large language models, where its advantages can be fully exploited because updating weights is prohibitively expensive even with modern computational resources (<xref ref-type="bibr" rid="B106">Shen et al., 2024a</xref>). Given these considerations, we focus on QAT methods in this paper.</p>
<p>The major challenge in training QNNs is the stair-like nature of the quantization function, whose gradient is zero almost everywhere. Therefore, traditional stochastic gradient descent (SGD)-based training methods cannot be applied directly to QNN training, and the key issue becomes how to backpropagate the loss through the quantization function. Based on how the loss is backpropagated, QNN training methods can be categorized into (1) approximated gradient methods with exact quantization and (2) exact gradient methods with gradual quantization (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>General training flow based on approximated gradient and exact gradient.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g005.tif">
<alt-text content-type="machine-generated">Flowchart compares approximated and exact gradient methods in machine learning. In both, SGD update affects weight \(w_c\), followed by quantization (hard for approximated, soft for exact). Weights \(w_q\) undergo inference producing output \(x\), and computed cost \(C\) used in backpropagation. Exact gradient includes a graph transition from smooth to step function.</alt-text>
</graphic>
</fig>
<sec id="s4-1">
<title>4.1 Approximated gradient under exact quantization</title>
<p>One of the solutions to the zero-gradient problem of the quantization function is to generate an approximated gradient to update the weights. The most straightforward approximation strategy is called the straight-through estimator (STE), which offers a simple and efficient way to backpropagate gradients through quantization functions. In STE, the Jacobian matrix <inline-formula id="inf32">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is set to be a diagonal matrix whose diagonal entries are all equal to 1, so that (with <inline-formula id="inf33">
<mml:math id="m39">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> being the cost) <inline-formula id="inf34">
<mml:math id="m40">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>. STE is used in all approximated gradient methods highlighted below.</p>
<sec id="s4-1-1">
<title>4.1.1 Binary-connect and QNN</title>
<p>One of the first CNNs with binarized weights trained using STE is presented in (<xref ref-type="bibr" rid="B17">Courbariaux et al., 2015</xref>) (shown in <xref ref-type="fig" rid="F6">Figure 6</xref>). This model achieved accuracy comparable to that of floating-point models. Following the idea of training QNNs using STE, this method is extended to n-bit uniform quantization of both weights and activations in (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>). In n-bit quantization (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>), constraints are added to the weights and gradient values, shown here for the binary case (<xref ref-type="disp-formula" rid="e7">Equation 7</xref>):<disp-formula id="e7">
<mml:math id="m41">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>clip</mml:mtext>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2207;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1,1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where <inline-formula id="inf35">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf36">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are gradients with respect to continuous activation and quantized activation, <inline-formula id="inf37">
<mml:math id="m44">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the learning rate, and <inline-formula id="inf38">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> equals 1 when the condition is satisfied and 0 otherwise. Constraints on the weights and gradients of the activations avoid extremely large values, improving the training performance.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Visualization of weights before and after quantization during training using different training methods. <bold>(a)</bold> Basic STE. <bold>(b)</bold> DoReFa. <bold>(c)</bold> LNS-Madam. <bold>(d)</bold> ANA. <bold>(e)</bold> nBitQNN. <bold>(f)</bold> ProxQuant. <bold>(g)</bold> Sigmoid QN.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g006.tif">
<alt-text content-type="machine-generated">Comparison of approximated and exact gradients across seven subplots. Subplots (a), (b), and (c) depict approximated gradients using forward and backward steps for different ranges of \(w_c\) and \(w_q\). Subplot (d) shows exact gradients over epochs with varying \(\sigma\) values. Subplot (e) contrasts training and inference gradients. Subplot (f) presents gradient progression over three iterations. Subplot (g) illustrates the effect of temperature changes on epochs, displaying gradients for different temperatures. Each subplot uses plots and arrows to visualize the changes in gradients.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-1-2">
<title>4.1.2 DoReFa-Net</title>
<p>In (<xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>), a method to quantize weights, activations, and gradients is presented (<xref ref-type="fig" rid="F6">Figure 6</xref>) (<xref ref-type="disp-formula" rid="e8">Equation 8</xref>):<disp-formula id="e8">
<mml:math id="m46">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0,1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>max</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0,1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>clip</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mn>0,1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>max</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0,1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>max</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf39">
<mml:math id="m47">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0,1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the k-bit uniform quantization function bounded between 0 and 1. The binary quantization case with <inline-formula id="inf40">
<mml:math id="m48">
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> being the mean of the absolute values of all the weights in the same layer can be formulated as:<disp-formula id="e9">
<mml:math id="m49">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>sign</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>sign</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>As shown in <xref ref-type="disp-formula" rid="e9">Equation 9</xref>, a layer-wise scale factor <inline-formula id="inf41">
<mml:math id="m50">
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is attached to the binarized weights. Therefore, DoReFa-Net cannot achieve fully binarized inference. Similar to (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>), a hyperbolic tangent function and a clip function are applied to the weights and activations, respectively, before quantization to avoid extremely large values and guarantee good performance. The first and last layers in DoReFa-Net are not quantized.</p>
</sec>
<sec id="s4-1-3">
<title>4.1.3 LQ-Net</title>
<p>Even though Binary-Connect and DoReFa-Net are based on uniform quantization, the data distribution of weights and activations in a well-trained neural network is non-uniform. In (<xref ref-type="bibr" rid="B141">Zhang et al., 2018</xref>), to reduce the quantization error, a quantized value is obtained as:<disp-formula id="e10">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold">v</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="bold">b</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1,1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf42">
<mml:math id="m52">
<mml:mrow>
<mml:mi mathvariant="bold">v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a vector representing the basis of the quantized value space, <inline-formula id="inf43">
<mml:math id="m53">
<mml:mrow>
<mml:mi mathvariant="bold">b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a vector where all elements are either <inline-formula id="inf44">
<mml:math id="m54">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> or 1, and <inline-formula id="inf45">
<mml:math id="m55">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of quantization bits. This method brings a more flexible quantized value space while being compatible with the bit-wise operation <inline-formula id="inf46">
<mml:math id="m56">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e11">
<mml:math id="m57">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>Training using this method consists of optimization of the quantizer (vectors <inline-formula id="inf47">
<mml:math id="m58">
<mml:mrow>
<mml:mi mathvariant="bold">v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf48">
<mml:math id="m59">
<mml:mrow>
<mml:mi mathvariant="bold">b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) and optimization of the neural network parameters. Quantizer optimization is performed during the forward pass by minimizing the mean squared quantization error. For better efficiency, the two vectors are optimized in a block coordinate descent fashion (i.e., the two vectors are optimized alternately). During the backward pass, neural network parameters are updated using the traditional SGD method with gradients passed through the quantizer via STE. To avoid adding considerably more parameters to the neural network, the learned quantizer is assigned channel-wise for weights and layer-wise for activations.</p>
</sec>
<sec id="s4-1-4">
<title>4.1.4 LNS-Madam</title>
<p>As logarithmic quantization offers a better dynamic range than uniform quantization, a tailored logarithmic number system (LNS) with fractional exponents is proposed in (<xref ref-type="bibr" rid="B143">Zhao et al., 2022</xref>) and represented as (<xref ref-type="disp-formula" rid="e12">Equation 12</xref>):<disp-formula id="e12">
<mml:math id="m60">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>sign</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>clip</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>round</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>log</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>where <inline-formula id="inf49">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf50">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the values before and after quantization, respectively, <inline-formula id="inf51">
<mml:math id="m63">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the base factor that controls the quantization gap, <inline-formula id="inf52">
<mml:math id="m64">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the quantization bit-width, and <inline-formula id="inf53">
<mml:math id="m65">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a scale factor related to the magnitude of <inline-formula id="inf54">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. By selecting <inline-formula id="inf55">
<mml:math id="m67">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> from powers of 2, the overhead of hardware complexity is reduced while maintaining a variable quantization gap. In this LNS, the multiplication of the quantized values is as easy to implement as in the traditional power-of-2-based LNS. To efficiently perform add operations, the exponents are decomposed as <inline-formula id="inf56">
<mml:math id="m68">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, with <inline-formula id="inf57">
<mml:math id="m69">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> being processed by lookup tables and <inline-formula id="inf58">
<mml:math id="m70">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> being processed by the approximation <inline-formula id="inf59">
<mml:math id="m71">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>A low-precision LNS training framework based on a modified version of the Madam optimizer presented in (<xref ref-type="bibr" rid="B6">Bernstein et al., 2020</xref>) directly optimizes the exponents in the LNS, enabling 8-bit low-precision training. <xref ref-type="fig" rid="F6">Figure 6</xref> visualizes the relation between quantized weight <inline-formula id="inf60">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and original weight <inline-formula id="inf61">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula id="inf62">
<mml:math id="m74">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>b</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s4-1-5">
<title>4.1.5 PACT</title>
<p>In (<xref ref-type="bibr" rid="B15">Choi et al., 2018</xref>), the parameterized clipping activation (PACT) algorithm is proposed to quantize activations to a low bit-width without a significant accuracy drop. One of the challenges of activation quantization is deciding the clipping range of the quantizer. A manually designed clipping range is hard to adapt to the different activation value distributions of various neural network architectures. An excessively small clipping range causes important values to be clipped, whereas an excessively large one causes them to vanish within a quantization step. PACT determines the clipping range automatically via gradient updates (<xref ref-type="disp-formula" rid="e13">Equation 13</xref>):<disp-formula id="e13">
<mml:math id="m75">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">PACT</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>clip</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>round</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>where <inline-formula id="inf63">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf64">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are activation values before and after quantization, <inline-formula id="inf65">
<mml:math id="m78">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a learnable clipping range parameter and <inline-formula id="inf66">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is 1 if <inline-formula id="inf67">
<mml:math id="m80">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and 0 otherwise. PACT replaces the clipped values with <inline-formula id="inf68">
<mml:math id="m81">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in the computation graph so that <inline-formula id="inf69">
<mml:math id="m82">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can be automatically learned via gradients. PACT achieves better accuracy than manually designed clipping ranges on various datasets.</p>
</sec>
<sec id="s4-1-6">
<title>4.1.6 Summary</title>
<p>The relations between the continuous and quantized versions of the weights for both forward and backward passes are visualized in <xref ref-type="fig" rid="F6">Figure 6</xref>. Overall, approximated gradient-based training methods approximate the gradient according to a &#x201c;trend line&#x201d; (red line in <xref ref-type="fig" rid="F6">Figure 6</xref>). Also, these training methods execute the forward pass using quantized values, showing the potential of being deployed to low-end devices. However, most of the methods use full-precision weights to aggregate the full-precision gradients. The possibility of training QNNs using low-precision latent weights (<xref ref-type="bibr" rid="B3">Banner et al., 2018</xref>; <xref ref-type="bibr" rid="B35">Gupta et al., 2015</xref>; <xref ref-type="bibr" rid="B143">Zhao et al., 2022</xref>) and gradients (<xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>; <xref ref-type="bibr" rid="B96">Rastegari et al., 2016</xref>; <xref ref-type="bibr" rid="B118">Sun et al., 2020</xref>) has also been explored. Such methods make approximated gradient-based training a good candidate for deployment on low-end devices.</p>
</sec>
</sec>
<sec id="s4-2">
<title>4.2 Exact gradient with a gradual quantization</title>
<p>Besides approximated gradient methods, the other solution to the zero-gradient problem is a &#x201c;soft&#x201d; quantization using a time-evolving quantization function with non-zero derivatives, which converges to a &#x201c;hard&#x201d; quantization function as training proceeds.</p>
<sec id="s4-2-1">
<title>4.2.1 Additive noise annealing (ANA)</title>
<p>In (<xref ref-type="bibr" rid="B114">Spallanzani et al., 2019</xref>), the expectations of quantized values and corresponding gradients are defined and derived considering a noise being added to the full-precision values. Consider <inline-formula id="inf70">
<mml:math id="m83">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="script">Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> being a multi-step quantization function, <inline-formula id="inf71">
<mml:math id="m84">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> being a zero-mean noise with probability density <inline-formula id="inf72">
<mml:math id="m85">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e14">
<mml:math id="m86">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>&#x3bc;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where <inline-formula id="inf73">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the expectation over <inline-formula id="inf74">
<mml:math id="m88">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf75">
<mml:math id="m89">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2217;</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the convolution operation. <xref ref-type="disp-formula" rid="e14">Equation 14</xref> can be used to perform forward and backward passes of a noise-injected neural network. If <inline-formula id="inf76">
<mml:math id="m90">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a continuously differentiable function, the expectation <inline-formula id="inf77">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of the noise-injected quantization function is no longer a multi-step function but a function with non-zero gradient, which can be expressed as <inline-formula id="inf78">
<mml:math id="m92">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e14">Equation 14</xref>. When the probability density function of the noise is a delta function <inline-formula id="inf79">
<mml:math id="m93">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, the expectation <inline-formula id="inf80">
<mml:math id="m94">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> matching the case of exact quantization. Hence, the key strategy of ANA is to construct a time-dependent noise probability density function <inline-formula id="inf81">
<mml:math id="m95">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, whose gradient is not always zero and which converges to a delta function as training proceeds: <inline-formula id="inf82">
<mml:math id="m96">
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:mi>lim</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. In practice, after a certain number of training steps, a hard quantization function is applied (i.e., no noise is injected) to generate the QNN for inference.</p>
<p>
<xref ref-type="fig" rid="F6">Figure 6</xref> shows an example of different expectations <inline-formula id="inf83">
<mml:math id="m97">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with different <inline-formula id="inf84">
<mml:math id="m98">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf85">
<mml:math id="m99">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> being a ternary quantization function. <inline-formula id="inf86">
<mml:math id="m100">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is considered to be a uniform distribution <inline-formula id="inf87">
<mml:math id="m101">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">U</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Large <inline-formula id="inf88">
<mml:math id="m102">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> at the beginning of the training results in a piece-wise linear function with non-zero gradients, enabling the backpropagation of the gradient through the quantization function. As training proceeds, <inline-formula id="inf89">
<mml:math id="m103">
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> decreases towards zero and the expectation converges to a ternary quantization function.</p>
</sec>
<sec id="s4-2-2">
<title>4.2.2 ProxQuant</title>
<p>Unlike other methods, the ProxQuant algorithm (<xref ref-type="bibr" rid="B2">Bai et al., 2018</xref>) does not modify traditional SGD-based training, being directly compatible with SGD optimizers, e.g., Momentum SGD and Adam. The key point of the ProxQuant algorithm is adding a regularization process after each SGD update (<xref ref-type="disp-formula" rid="e15">Equation 15</xref>):<disp-formula id="e15">
<mml:math id="m104">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:mi>&#x2207;</mml:mi>
<mml:mi>C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>arg min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:munder>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>x</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">&#x2016;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>where <inline-formula id="inf90">
<mml:math id="m105">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a regularizer, which achieves minimum value when <inline-formula id="inf91">
<mml:math id="m106">
<mml:mrow>
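<mml:mover accent="true">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula><mml:math><mml:mrow><mml:mi mathvariant="script">Q</mml:mi></mml:mrow></mml:math></inline-formula> being a set containing quantized values. By applying the <inline-formula id="inf92">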
<mml:math id="m107">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> function, the weight is updated considering both the SGD result and the distance from the quantized set. The weights converge to the values in the quantized set <inline-formula id="inf93">
<mml:math id="m108">
<mml:mrow>
<mml:mi mathvariant="script">Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> after applying <inline-formula id="inf94">
<mml:math id="m109">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> several times. <xref ref-type="fig" rid="F6">Figure 6</xref> shows a special binary quantization (i.e., <inline-formula id="inf95">
<mml:math id="m110">
<mml:mrow>
<mml:mi mathvariant="script">Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
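<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>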
</mml:mrow>
</mml:math>
</inline-formula>) case when the weights are updated by applying <inline-formula id="inf96">
<mml:math id="m111">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> several times. The weight distribution gradually converges to the binary hard quantized case. To force the weights to update towards the quantized set and to improve the performance, the parameter <inline-formula id="inf97">
<mml:math id="m112">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> can be increased as training proceeds.</p>
<p>Since the <inline-formula id="inf98">
<mml:math id="m113">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:mi>&#x2207;</mml:mi>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is applied iteratively, there is no closed form of the equivalent &#x201c;Soft Quant&#x201d; function shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. However, we can still use a generalized time-dependent &#x201c;Soft Quant&#x201d; function, which converges to a hard quantization function, to describe the effect of iteratively applying <inline-formula id="inf99">
<mml:math id="m114">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>prox</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s4-2-3">
<title>4.2.3 nBitQNN</title>
<p>In (<xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>), a QNN training method that mixes the full precision weights and quantized weights to generate mixed-precision QNN is applied as (<xref ref-type="disp-formula" rid="e16">Equation 16</xref>):<disp-formula id="e16">
<mml:math id="m115">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>where <inline-formula id="inf100">
<mml:math id="m116">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is a pseudo-quantization function mixing the full precision weight <inline-formula id="inf101">
<mml:math id="m117">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and the hard quantized weight <inline-formula id="inf102">
<mml:math id="m118">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf103">
<mml:math id="m119">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2260;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> enables the gradient to backpropagate through <inline-formula id="inf104">
<mml:math id="m120">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The parameter <inline-formula id="inf105">
<mml:math id="m121">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> adjusts the ratio between full precision weights and hard quantized weights. After a sufficient number of training steps, the weights converge to the hard quantized values even though <inline-formula id="inf106">
<mml:math id="m122">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a finite constant. The radix-2 logarithmic quantization is used to hard quantize the weights, while the activations are not quantized.</p>
</sec>
<sec id="s4-2-4">
<title>4.2.4 Quantization using sigmoid and hyperbolic tangent</title>
<p>In (<xref ref-type="bibr" rid="B136">Yang et al., 2019</xref>), the quantization function is reformulated as a combination of shifted and scaled step functions:<disp-formula id="e17">
<mml:math id="m123">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold">H</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>where <inline-formula id="inf107">
<mml:math id="m124">
<mml:mrow>
<mml:mi mathvariant="bold">H</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the standard unit step function, <inline-formula id="inf108">
<mml:math id="m125">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the scale factor and shift of the corresponding unit step function, and <inline-formula id="inf109">
<mml:math id="m126">
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a global offset zero-centering the distribution of the quantized parameter <inline-formula id="inf110">
<mml:math id="m127">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. To overcome the zero-gradient problem of the step functions during training, the unit step functions in <xref ref-type="disp-formula" rid="e17">Equation 17</xref> are replaced by temperature-modulated sigmoid functions <inline-formula id="inf111">
<mml:math id="m128">
<mml:mrow>
<mml:mi mathvariant="bold">&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e18">
<mml:math id="m129">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold">&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi mathvariant="bold">&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>where <inline-formula id="inf112">
<mml:math id="m130">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a layer-wise scale factor for the output. With <xref ref-type="disp-formula" rid="e18">Equation 18</xref> being a differentiable function, the gradient can be backpropagated through the neural network. As the training proceeds, the temperature <inline-formula id="inf113">
<mml:math id="m131">
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> grows, reducing the gap between <inline-formula id="inf114">
<mml:math id="m132">
<mml:mrow>
<mml:mi mathvariant="bold">&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and unit step functions (<xref ref-type="fig" rid="F6">Figure 6</xref>). Once the training is finished, the unit step functions are used in inference and validation. To guarantee high accuracy, training is divided into three phases: training a full precision neural network, training with weight quantization only, and training with activation quantization while fixing weight quantization.</p>
<p>In (<xref ref-type="bibr" rid="B29">Gong et al., 2019</xref>), the differentiable soft quantization (DSQ) framework is proposed, which shares the similar idea of quantizing data in a piecewise manner using a series of evolving hyperbolic tangent basis functions. Unlike in (<xref ref-type="bibr" rid="B136">Yang et al., 2019</xref>), the evolution of the basis functions is not explicitly tied to the training progress. Instead, a characteristic variable (describing the error between hard quantization and basis functions) is calculated and minimized during training. Additionally, DSQ adopts trainable clipping ranges similar to PACT (<xref ref-type="bibr" rid="B15">Choi et al., 2018</xref>).</p>
</sec>
<sec id="s4-2-5">
<title>4.2.5 Summary</title>
<p>Exact gradient methods adopt evolving quantization functions to guarantee a feasible gradient during training and reach hard quantization at the end of training. However, this implies that these methods need to be operated with full precision data. Hence, they are more suitable for a cloud-based execution generating QNNs to be deployed on edge devices.</p>
</sec>
</sec>
<sec id="s4-3">
<title>4.3 Fake and real quantization</title>
<p>Most quantization studies implement their algorithms in software (e.g., using PyTorch, TensorFlow) (<xref ref-type="bibr" rid="B72">Li et al., 2021</xref>). Therefore, all the quantization operations and quantized data are simulated in floating-point format, which can be referred to as &#x201c;<italic>Fake Quantization</italic>&#x201d;. In contrast, when the quantized models are deployed on hardware, the quantized data is processed by executors supporting the corresponding format (e.g., 8-bit integer arithmetic unit for 8-bit uniform quantization). This can be referred to as &#x201c;<italic>Real Quantization</italic>&#x201d;. The data format mismatch between fake and real quantization may cause unavoidable data value differences. Adaptation from fake quantization to real quantization considering different hardware architectures is discussed in <xref ref-type="sec" rid="s5">Section 5</xref>.</p>
</sec>
<sec id="s4-4">
<title>4.4 Benchmarking</title>
<p>In this section, the training methods mentioned in <xref ref-type="sec" rid="s4">Section 4</xref> are benchmarked under different configurations using the corresponding open-source code, modified where required.</p>
<sec id="s4-4-1">
<title>4.4.1 Network and training configurations</title>
<p>In the aforementioned works, different configurations of either the network or training are used. In all the original works, the affine operation of the batch-norm layer is enabled and none of the trainable parameters of the affine operation are quantized. In our experiments, by contrast, the affine operation is enabled only where the benchmark configuration specifies it. The affine operation adds two full precision trainable parameters at each output channel. These parameters do not affect the performance during inference once batch normalization is fused; however, during training they degrade the compression rate and increase the computational complexity. To improve generalization ability, all the aforementioned methods involve data augmentation. To analyze the influence of each setting, we benchmark the aforementioned training methods using the settings summarized in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Benchmark settings for training methods.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Index</th>
<th align="center">Bit Width (W,A,G)<sup>&#x2a;</sup>
</th>
<th align="center">Epochs</th>
<th align="center">Bias</th>
<th align="center">BNAffine</th>
<th align="center">DataAug</th>
<th align="center">Notation<sup>&#x2a;2</sup>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="7" align="center">BNN (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>)</td>
</tr>
<tr>
<td align="center">1</td>
<td align="center">(1,1,32)</td>
<td align="center">900</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[1,1,1]</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">(1,1,32)</td>
<td align="center">900</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[0,0,0]</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">(1,1,32)</td>
<td align="center">900</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[1,1,0]</td>
</tr>
<tr>
<td colspan="7" align="center">DoReFa (<xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>)</td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">(1,1,32)</td>
<td align="center">500</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[1,1,1]</td>
</tr>
<tr>
<td align="center">5</td>
<td align="center">(1,1,32)</td>
<td align="center">500</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[0,1,0]</td>
</tr>
<tr>
<td align="center">6</td>
<td align="center">(1,1,32)</td>
<td align="center">500</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[0,0,0]</td>
</tr>
<tr>
<td align="center">7</td>
<td align="center">(1,2,4)</td>
<td align="center">500</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[0,1,0]</td>
</tr>
<tr>
<td colspan="7" align="center">ANA (<xref ref-type="bibr" rid="B114">Spallanzani et al., 2019</xref>)</td>
</tr>
<tr>
<td align="center">8</td>
<td align="center">(<inline-formula id="inf115">
<mml:math id="m133">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>T</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>,T,32)</td>
<td align="center">1000</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[0,1,1]</td>
</tr>
<tr>
<td align="center">9</td>
<td align="center">(T,T,32)</td>
<td align="center">1000</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[1,0,0]</td>
</tr>
<tr>
<td align="center">10</td>
<td align="center">(T,T,32)</td>
<td align="center">1000</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[0,0,0]</td>
</tr>
<tr>
<td colspan="7" align="center">ProxQuant (<xref ref-type="bibr" rid="B2">Bai et al., 2018</xref>)</td>
</tr>
<tr>
<td align="center">11</td>
<td align="center">(1,32,32)</td>
<td align="center">200 &#x2b; 700<sup>&#x2a;4</sup>
</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[0,1,1]</td>
</tr>
<tr>
<td align="center">12</td>
<td align="center">(1,32,32)</td>
<td align="center">200 &#x2b; 700</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[0,1,0]</td>
</tr>
<tr>
<td align="center">13</td>
<td align="center">(1,32,32)</td>
<td align="center">200 &#x2b; 700</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[0,0,0]</td>
</tr>
<tr>
<td colspan="7" align="center">nBQNN (<xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>)</td>
</tr>
<tr>
<td align="center">14</td>
<td align="center">(<inline-formula id="inf116">
<mml:math id="m134">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>L</mml:mtext>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>,32,32)</td>
<td align="center">500</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[1,1,1]</td>
</tr>
<tr>
<td align="center">15</td>
<td align="center">(L2,32,32)</td>
<td align="center">500</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[0,0,0]</td>
</tr>
<tr>
<td align="center">16</td>
<td align="center">(L2,32,32)</td>
<td align="center">500</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">[0,0,1]</td>
</tr>
<tr>
<td colspan="7" align="center">LQ-net (<xref ref-type="bibr" rid="B141">Zhang et al., 2018</xref>)</td>
</tr>
<tr>
<td align="center">17</td>
<td align="center">(1,1,32)</td>
<td align="center">400</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[1,1,1]</td>
</tr>
<tr>
<td align="center">18</td>
<td align="center">(1,1,32)</td>
<td align="center">400</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[0,1,1]</td>
</tr>
<tr>
<td align="center">19</td>
<td align="center">(1,1,32)</td>
<td align="center">400</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">[1,0,1]</td>
</tr>
<tr>
<td align="center">20</td>
<td align="center">(1,1,32)</td>
<td align="center">400</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[1,1,0]</td>
</tr>
<tr>
<td colspan="7" align="center">Sigmoid-QN (<xref ref-type="bibr" rid="B136">Yang et al., 2019</xref>)</td>
</tr>
<tr>
<td align="center">21</td>
<td align="center">(1,32,32)</td>
<td align="center">100 &#x2b; 100<sup>&#x2a;6</sup>
</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[1,1,1]</td>
</tr>
<tr>
<td align="center">22</td>
<td align="center">(1,32,32)</td>
<td align="center">100 &#x2b; 100</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[0,1,1]</td>
</tr>
<tr>
<td align="center">23</td>
<td align="center">(1,32,32)</td>
<td align="center">100 &#x2b; 100</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">[1,0,1]</td>
</tr>
<tr>
<td align="center">24</td>
<td align="center">(1,32,32)</td>
<td align="center">100 &#x2b; 100</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[1,1,0]</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x2a;W/A/G: weights/activations/gradients,<sup>&#x2a;2</sup> (<inline-formula id="inf117">
<mml:math id="m135">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">bias</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Affine</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Aug</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>).</p>
</fn>
<fn>
<p><sup>&#x2a;3</sup> ternary quantization, <sup>&#x2a;4</sup> 200/700 epochs for FP/with quantization.</p>
</fn>
<fn>
<p><sup>&#x2a;5</sup> 2-bit logarithmic quantization.</p>
</fn>
<fn>
<p><sup>&#x2a;6</sup> 100/100 epochs for FP/with quantization.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4-4-2">
<title>4.4.2 Experiments</title>
<p>The dataset and neural network structure adopted in the experiments are CIFAR-10 and VGG-small, respectively. We prioritize the official code provided by the authors during the benchmark and keep modifications to a minimum. For a fair comparison, the data augmentation conducted in the experiments only includes random cropping and random horizontal flipping. The number of training epochs is chosen to be large enough for each case to be well-trained.</p>
<p>The results of different cases are visualized in <xref ref-type="fig" rid="F7">Figure 7</xref>. To compare the relative size of the networks in different cases, we calculate the average bit-width using the following equation (<xref ref-type="disp-formula" rid="e19">Equation 19</xref>):<disp-formula id="e19">
<mml:math id="m136">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Avg</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>B</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>0,0,0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>where <inline-formula id="inf118">
<mml:math id="m137">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the bit-width of weights <inline-formula id="inf119">
<mml:math id="m138">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in layer <inline-formula id="inf120">
<mml:math id="m139">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf121">
<mml:math id="m140">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the number of trainable parameters in the 1-bit QNN without bias and batch normalization affine parameters.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Visualization of the benchmark results, with important cases marked using &#x201c;Index&#x201d; and &#x201c;Notation&#x201d; in <xref ref-type="table" rid="T2">Table 2</xref>. The circle diameter represents the size of the model.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g007.tif">
<alt-text content-type="machine-generated">A scatter plot shows the relationship between accuracy and epochs for different neural network quantization methods. The y-axis represents accuracy (75% to 95%), and the x-axis shows epochs (from 0 to 1000). Data points in various colors represent different methods such as ANA, Dorefa, and ProxQuant. Legends indicate methods' corresponding colors. Some points are labeled with numbers and brackets indicating specific configurations. A dashed line represents a baseline accuracy just above 90%.</alt-text>
</graphic>
</fig>
<p>Overall, all the methods can achieve an accuracy higher than 85<inline-formula id="inf122">
<mml:math id="m141">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The BNN (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>), ProxQuant (<xref ref-type="bibr" rid="B2">Bai et al., 2018</xref>), LQ-net (<xref ref-type="bibr" rid="B141">Zhang et al., 2018</xref>) and Sigmoid-QN (<xref ref-type="bibr" rid="B136">Yang et al., 2019</xref>) methods result in accuracy over 90<inline-formula id="inf123">
<mml:math id="m142">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, close to the full precision model baseline (<xref ref-type="bibr" rid="B66">Lee et al., 2016</xref>). DoReFa and nBitQNN methods are more robust against different bias and affine operation settings. DoReFa introduces scale factors to each layer, and nBitQNN uses logarithmic quantization, which brings them extra representation abilities. ANA, DoReFa, and nBitQNN are not sensitive to the presence of data augmentation. The BNN method is more vulnerable to the absence of data augmentation. Comparing Case 1 and Case 3 (<xref ref-type="table" rid="T2">Table 2</xref>), the BNN method experiences an accuracy drop of 6.92% when data augmentation is removed.</p>
<p>The average bit-width is represented by the diameter of the circles in <xref ref-type="fig" rid="F7">Figure 7</xref>, which shows that the additional parameters from the biases and affine operations occupy a small portion of the network size. Even though shift-based batch normalization is introduced in (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>) and realized in (<xref ref-type="bibr" rid="B144">Zhijie et al., 2020</xref>), batch normalization still brings overhead during training.</p>
<p>LQ-net and Sigmoid-QN achieve the best accuracy, close to the full precision baseline model, at the cost of additional scale factors and of leaving the first and last layers unquantized. The BNN method can achieve an accuracy of over 90% while maintaining the size of a binary neural network with all the layers quantized without scale factors. However, it is sensitive to bias and batch normalization parameters. ProxQuant and nBitQNN also achieve high accuracy, but they do not quantize the activations. The ANA method also demonstrates high accuracy but has a larger model size and is sensitive to configuration changes. DoReFa is robust against configuration changes and supports gradient quantization. However, the first and last layers are not quantized in DoReFa, and layer-wise scaling factors are adopted, bringing overhead in terms of compression rate and computational complexity.</p>
</sec>
</sec>
<sec id="s4-5">
<title>4.5 Strategies to improve QNN performance</title>
<p>This section introduces general strategies helping to improve the performance of QNN, including learning rate scheduling and trade-offs between accuracy and model complexity.</p>
<sec id="s4-5-1">
<title>4.5.1 Learning rate scheduling</title>
<p>For QNNs, the selection of the learning rate is more critical than for full precision networks. A low learning rate leads to slow convergence, while a high learning rate causes unstable weight updates. <xref ref-type="fig" rid="F8">Figure 8</xref> shows the gradient descent process for both full precision and quantized networks. In full precision models, the loss surface is smooth and the gradients shrink as the loss approaches a minimum (<xref ref-type="fig" rid="F8">Figure 8a</xref>). In QNNs, the loss surface is stair-like. If the gradients are calculated with reference to the quantized values (as in the methods using approximated gradients in <xref ref-type="sec" rid="s4-1">Section 4.1</xref>), the gradient keeps a constant value regardless of the current position on a flat plateau of the loss surface, which can cause the update to overshoot the minimum (<xref ref-type="fig" rid="F8">Figure 8b</xref>). Hence, because the gradients do not shrink as in the full precision case, QNNs must be trained with lower learning rates than full precision networks. This phenomenon is confirmed by experiments in (<xref ref-type="bibr" rid="B121">Tang et al., 2017</xref>).</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Gradient descent process for <bold>(a)</bold> full precision and <bold>(b)</bold> quantized neural networks.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g008.tif">
<alt-text content-type="machine-generated">Illustration comparing two concepts. On the left, labeled &#x22;Full precision,&#x22; red spheres sit on a smooth, curved yellow surface. On the right, labeled &#x22;Quantized,&#x22; red spheres are placed on a jagged, stepped blue surface with a green arrow indicating movement.</alt-text>
</graphic>
</fig>
<p>QNN training usually starts with lower learning rates compared to full-precision networks. For instance, the learning rate for training full precision VGG network in (<xref ref-type="bibr" rid="B111">Simonyan and Zisserman, 2014</xref>) starts at <inline-formula id="inf124">
<mml:math id="m143">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, while that for training a small VGG network on the CIFAR-10 dataset in (<xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>) starts at <inline-formula id="inf125">
<mml:math id="m144">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. Besides the difference in the starting learning rates, different learning rate scheduling methods are adopted for QNN training, including exponential decay (<xref ref-type="bibr" rid="B17">Courbariaux et al., 2015</xref>; <xref ref-type="bibr" rid="B45">Hubara et al., 2017</xref>; <xref ref-type="bibr" rid="B114">Spallanzani et al., 2019</xref>; <xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>), and manual assignment (<xref ref-type="bibr" rid="B2">Bai et al., 2018</xref>; <xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>).</p>
</sec>
<sec id="s4-5-2">
<title>4.5.2 Trade for a higher accuracy</title>
<p>The majority of QNN training methods are offline methods, which can be performed on high-performance computation platforms, e.g., cloud servers. Meanwhile, some QNN applications are not strictly constrained by computation power or computation time. Hence, there are some strategies to increase the QNN accuracy at the cost of computation complexity or computation time.</p>
<p>The easiest way to improve accuracy without increasing the model size is to increase activation precision. For example, if the same network is trained with the DoReFa-Net method, a model trained with 2-bit activations achieves an accuracy of 86.5<inline-formula id="inf126">
<mml:math id="m145">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> with SVHN dataset (<xref ref-type="bibr" rid="B87">Netzer et al., 2011</xref>), while the model with 1-bit activations results in 84.1<inline-formula id="inf127">
<mml:math id="m146">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> accuracy. In (<xref ref-type="bibr" rid="B121">Tang et al., 2017</xref>), instead of directly increasing activation precision, multiple-quantization of the activations is performed. This method is more suitable for CPU/GPU-based computation platforms, while directly increasing the number of bits is preferred on FPGA/ASIC-based platforms. In (<xref ref-type="bibr" rid="B78">Liu et al., 2018</xref>), a full precision bypass is introduced in each layer. The activation of the <inline-formula id="inf128">
<mml:math id="m147">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th layer in this architecture can be expressed as (<xref ref-type="disp-formula" rid="e20">Equation 20</xref>):<disp-formula id="e20">
<mml:math id="m148">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>F</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>F</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>where <inline-formula id="inf129">
<mml:math id="m149">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf130">
<mml:math id="m150">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the layer outputs before and after the quantization and activation, <inline-formula id="inf131">
<mml:math id="m151">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents quantization and activation function, and <inline-formula id="inf132">
<mml:math id="m152">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the operation the layer performs parameterized by <inline-formula id="inf133">
<mml:math id="m153">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. By introducing the bypass, a binary ResNet-34 network can achieve a top-1 accuracy of 69.7<inline-formula id="inf134">
<mml:math id="m154">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, showing an accuracy boost compared with the case without the bypass (67.9<inline-formula id="inf135">
<mml:math id="m155">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>).</p>
<p>In (<xref ref-type="bibr" rid="B121">Tang et al., 2017</xref>), a new regularization term substituting the commonly-used L2 norm regularization is proposed. In a binary QNN, the ideal quantized parameters take the values of {-1, 1}. However, the traditional L2 norm regularization term forces the parameters to approach zero, which contradicts the distribution of the parameters in a binarized network, resulting in frequent weight fluctuation during training. Hence, the new regularization term biases the update of parameters toward their designated quantized value (e.g., in the binary quantization case, the parameters are biased toward <inline-formula id="inf136">
<mml:math id="m156">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). The new objective function for QNN training using the proposed regularization term in the binary quantization case can be expressed as (<xref ref-type="disp-formula" rid="e21">Equation 21</xref>):<disp-formula id="e21">
<mml:math id="m157">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="double-struck">L</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>where <inline-formula id="inf137">
<mml:math id="m158">
<mml:mrow>
<mml:mi mathvariant="double-struck">L</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold">W</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf138">
<mml:math id="m159">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf139">
<mml:math id="m160">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the loss term, regularization term, and trade-off hyperparameter respectively.</p>
<p>The other QNN training strategies, including two-stage optimization (TS), progressive quantization (PQ), and guided training, are shown in (<xref ref-type="bibr" rid="B147">Zhuang et al., 2018</xref>). TS consists of two steps: (1) weight quantization during training, and (2) activation quantization with trained weights. TS helps to avoid local minima when training a network from scratch. Moreover, instead of quantizing directly to the fixed bit-width (e.g., 2 bits), the network is quantized progressively (e.g., 32-bits<inline-formula id="inf140">
<mml:math id="m161">
<mml:mrow>
<mml:mo>&#x2192;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>16-bits &#x2026; 2-bits) with the parameters in the higher-precision model being the initial values for the lower-precision model training. Guided training refers to QNN training using guidance loss from a teacher model, which shares the same structure as the quantized model. More specifically, at the output of each layer in the quantized model, the loss from the label (back-propagated from the network output) is combined with the loss between the full-precision and quantized model at the same position. This is called layer-wise knowledge distillation (<xref ref-type="bibr" rid="B42">Hinton et al., 2015</xref>; <xref ref-type="bibr" rid="B41">Heo et al., 2019</xref>; <xref ref-type="bibr" rid="B71">Leroux et al., 2020</xref>). With the proposed set of training strategies, the accuracy of the trained 4-bit AlexNet (<xref ref-type="bibr" rid="B62">Krizhevsky et al., 2017</xref>) outperforms the same network trained using DoReFa-Net (<xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>) by 2.8<inline-formula id="inf141">
<mml:math id="m162">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>The other method to improve QNN accuracy is relaxing the compression rate if the application is not strictly constrained by memory size. In (<xref ref-type="bibr" rid="B16">Chu et al., 2021</xref>), mixed-precision QNN is proposed. As the features propagate along the network, they become more abstract and separable. Therefore, each layer of QNN can be quantized using a set of decreasing bit-widths as the layer goes downstream. For example, VGG-7 is quantized using {8-4-2-1-1-1}-bits for each layer from the input to the output, respectively. For the CIFAR-10 dataset, this network shows an accuracy of 93.22<inline-formula id="inf142">
<mml:math id="m163">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, while the full-precision model achieves an accuracy of 92.48<inline-formula id="inf143">
<mml:math id="m164">
<mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Also, the size of such QNN is 1.06 times the size of the binarized network.</p>
<p>In (<xref ref-type="bibr" rid="B102">Sakr et al., 2022</xref>), tensor clipping during QNN training is considered. Commonly, the tensor values are scaled based on the maximum value in the tensor; however, such scaling results in a large quantization step (especially for uniform quantization). As most of the tensor values are small, maximum scaling makes quantization bit-insufficient. In (<xref ref-type="bibr" rid="B102">Sakr et al., 2022</xref>), an algorithm to determine the optimal clipping and scaling factor for each tensor during each iteration of neural network training is shown. The optimal clipping factor minimizes the overall mean squared error, including quantization error and clipping error.</p>
<p>For conventional quantization-aware training methods, it is common to use STE for gradient estimation. However, STE can cause gradient explosion by assigning constant gradients to clipped values. Though assigning zero gradients to the clipped values can avoid such explosion, this prevents the clipped values from being trained, which is equivalent to shrinking the model size. To mitigate these two challenges, a magnitude-based gradient estimator, which assigns smaller gradients to the clipped values away from the threshold, is proposed in (<xref ref-type="bibr" rid="B102">Sakr et al., 2022</xref>). By applying both the optimal clipping values and gradient estimator, <inline-formula id="inf144">
<mml:math id="m165">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> accuracy degradation on the ImageNet dataset using 4-bit quantization compared to the full precision model is achieved in (<xref ref-type="bibr" rid="B102">Sakr et al., 2022</xref>).</p>
</sec>
</sec>
</sec>
<sec id="s5">
<title>5 Hardware implementation of QNNs</title>
<p>Efficient hardware implementation of a deep neural network for low-resource hardware requires compression techniques, including quantization, pruning, and Huffman coding (<xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>). Usually, reduction of energy consumption, memory access, and data transfers between memory and computation units is achieved by simplifying computationally complex operations, e.g., floating-point computations. In this section, we discuss three main types of QNN implementations: FPGA-based solutions, ASIC solutions, and emerging non-volatile memory (NVM) based CNN implementations. This review focuses on state-of-the-art works considering the studies from the last 5 years with systematic neural network evaluations on hardware.</p>
<p>In QNN hardware, different quantization schemes require different bit configurations (<xref ref-type="bibr" rid="B101">Ryu et al., 2020</xref>). QNN hardware can be divided into 3 main groups based on the quantization scheme: (1) fixed-point arithmetic, (2) power-of-two quantization (logarithmic quantization), and (3) binary representation. Compared to the floating-point representation, fixed-point arithmetic keeps the location of the radix point fixed (<xref ref-type="fig" rid="F9">Figure 9a</xref>). The power-of-two scheme represents the weights in the form of <inline-formula id="inf145">
<mml:math id="m166">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, which allows replacing computationally expensive multipliers with a shift operation. In binary representation, 1-bit weights and activations are used; however, most of the implemented binary hardware still requires multi-bit support for the input layer and weight update (<xref ref-type="bibr" rid="B39">Hashemi et al., 2017</xref>). Compared to floating-point operations, QNNs improve energy efficiency significantly. For example, the 4-bit fixed-point representation and binarized neural networks allow for more than 90% of power savings compared to the 32-bit floating-point representation of weights.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>
<bold>(a)</bold> Floating-point <italic>versus</italic> fixed-point operations. <bold>(b)</bold> Off-chip memory bottleneck. <bold>(c)</bold> An example of moving from floating point operations to approximate operations in a processing element (<xref ref-type="bibr" rid="B132">Wei et al., 2019</xref>). <bold>(d)</bold> Reducing the memory space with low-precision computation. <bold>(e)</bold> Example of QNN accelerator (PE-processing element), modified from (<xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>). <bold>(f)</bold> PE preference for different hardware architecture under different bit-width.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g009.tif">
<alt-text content-type="machine-generated">Diagram illustrating a hardware accelerator for quantized neural networks (QNN), highlighting several components: (a) Data representation shows floating-point to fixed-point conversion for efficiency. (b) Illustrates the off-chip memory bottleneck in QNN accelerators. (c) Displays floating-point and mixed-type processing elements for approximate computations. (d) Depicts memory efficiency using low-precision weights. (e) Shows system architecture with processing elements, SRAM, data bus, and control units. (f) Shows PE preference for different hardware architecture under different bit-width. FPGA implementation favors fully digital PEs designed for binary operations, logarithmic multiplications, and integer operations; ASIC implementation favors both SRAM crossbar-based (shorter bit-width, &#x003c; 4 bit) and fully digital PEs (longer bit-width, &#x003e; 8 bit); analog crossbar implementation favors a bit-width ranging from 2 to over 16 bits.</alt-text>
</graphic>
</fig>
<p>QNN hardware accelerators focus on different issues affecting hardware efficiency, including data movement, energy efficiency, memory consumption, and hardware utilization.</p>
<p>Similar to any neural network hardware design, one of the main challenges of QNN hardware is data movement between the QNN accelerator and off-chip memory storing QNN weights (<xref ref-type="fig" rid="F9">Figure 9b</xref>). Typically, data movement is more energy-consuming than computation (<xref ref-type="bibr" rid="B13">Chen et al., 2016</xref>). This problem is targeted by in-memory computing-based QNN accelerators (<xref ref-type="bibr" rid="B60">Krestinskaya et al., 2023</xref>). The other challenge is to improve the energy efficiency of QNN hardware while preserving the performance accuracy. This can be addressed by combining fixed-point approximate operations with floating-point operations to create mixed-type processing elements (<xref ref-type="fig" rid="F9">Figure 9c</xref>) (<xref ref-type="bibr" rid="B132">Wei et al., 2019</xref>). The memory consumption problem is tackled by lowering data (weights) precision (<xref ref-type="fig" rid="F9">Figure 9d</xref>) (<xref ref-type="bibr" rid="B132">Wei et al., 2019</xref>). Various data reuse strategies are explored to improve memory efficiency (<xref ref-type="bibr" rid="B1">Ankit et al., 2019</xref>; <xref ref-type="bibr" rid="B138">Yao et al., 2020</xref>; <xref ref-type="bibr" rid="B113">Song et al., 2017</xref>; <xref ref-type="bibr" rid="B104">Shafiee et al., 2016</xref>). Hardware efficiency often comes at the cost of hardware utilization, when some processing elements (PEs) remain unused (<xref ref-type="fig" rid="F9">Figure 9e</xref>). Data utilization, data flow complexity, and resource allocation along with resource parallelism should be considered in any QNN design, especially for FPGA-based QNNs (<xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>). Different processing engine designs are summarized in <xref ref-type="fig" rid="F9">Figure 9f</xref>. For FPGA-based accelerators, the &#x201c;on-chip&#x201d; part only refers to the parallel acceleration parts implemented on FPGA fabrics. 
For some FPGA accelerator designs, the host CPU is either implemented using on-chip ARM processors or directly synthesized using resources on FPGA fabrics.</p>
<p>The other challenge of translating QNN to hardware implementation is related to algorithm-hardware co-design challenges (<xref ref-type="bibr" rid="B142">Zhang et al., 2021</xref>). This involves finding the trade-offs between performance accuracy and hardware cost. The transition from software design of QNN to hardware implementation involves loop unrolling in a software algorithm, mapping software computations to particular hardware blocks, array partitioning, matrix decomposition, loop pipelining, etc. In addition, it is also important to accommodate the quantization and processing differences in different types of layers. For example, convolution layers are computational-centric (few parameters requiring many computations), while fully connected layers are memory-centric (many parameters used once requiring loading from external memory in some cases contributing to the hardware efficiency limitations) (<xref ref-type="bibr" rid="B94">Qiu et al., 2016</xref>). The other challenge is to accommodate layer-wise and mixed-precision quantization implemented in software into the hardware, which varies from one QNN design to the other.</p>
<p>This section focuses on QNN architectures implemented on FPGA and ASIC, and related open challenges. Also, designs with emerging non-volatile memory devices are considered. <xref ref-type="table" rid="T3">Table 3</xref> shows the summary of QNN hardware architectures. <xref ref-type="fig" rid="F9">Figure 9</xref> illustrates the area and power efficiency of different QNN architectures with respect to the precision of weights. The performance of RRAM-based architectures and SRAM-based ASIC implementations of QNNs is comparable in terms of power efficiency, while FPGA-based designs compromise energy efficiency due to hardware reconfigurability. This section also provides general guidelines on software-hardware co-design of QNN accelerators.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Summary of QNN hardware.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Work</th>
<th align="center">Main features</th>
<th align="center">Quantization</th>
<th align="center">Implementation and hardware parameters</th>
<th align="center">Performance, power, power/area efficiency</th>
<th align="center">Network architecture (database)</th>
<th align="center">Training support</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="7" align="center">FPGA architectures</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B125">Umuroglu et al. (2017)</xref>
</td>
<td align="left">FINN: Binary inference accelerator</td>
<td align="center">1b-[<inline-formula id="inf146">
<mml:math id="m167">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>U</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
<mml:mn>6</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>,A<sup>&#x2a;7</sup>]</td>
<td align="center">Xilinx Zynq 706<break/>186 BRAM, 200&#xa0;MHz</td>
<td align="center">11.6 TOPS<break/>408 GOPS/s/W</td>
<td align="center">BinaryNet, VGG-16 (CIFAR-10, SVHN)<break/>
</td>
<td align="center">no</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B32">Guo et al. (2018)</xref>
</td>
<td align="left">FBNA: Binarized neural network accelerator</td>
<td align="center">1b-[U,A]</td>
<td align="center">Xilinx Zynq 702<break/>103 BRAM</td>
<td align="center">722 GOPS<break/>219 GOPS/s/W</td>
<td align="center">2 conv and 3 FC (CIFAR-10, SVHN)</td>
<td align="center">no</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B11">Chen et al. (2020)</xref>
</td>
<td align="left">QNN framework for FPGA</td>
<td align="center">
<inline-formula id="inf147">
<mml:math id="m168">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>16</mml:mn>
<mml:mtext>b</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> (W/A)</td>
<td align="center">Xilinx ZCU102 (only conv. for inference)<break/>200&#xa0;MHz</td>
<td align="center">957.4 GOPS, 19.6W<break/>48.85 GOPs/W</td>
<td align="center">ResNet, DenseNet, AlexNet (MNIST, CIFAR-10/100<break/>SVHN, ImageNet)</td>
<td align="center">
<inline-formula id="inf148">
<mml:math id="m169">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>yes</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B9">Chang et al. (2021)</xref>
</td>
<td align="left">Mix and Match: QNN with mixed scheme quantization</td>
<td align="center">4b-[N,A] (W/A)</td>
<td align="center">Zynq XC7Z020<break/>Zynq XC7Z045, 100&#xa0;MHz</td>
<td align="center">77.0-360 GOPS (depends on FPGA)</td>
<td align="center">ResNet-18, MobileNet-v2 (CIFAR10/100, ImageNet)</td>
<td align="center">yes</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B142">Zhang et al. (2021)</xref>
</td>
<td align="left">FracBNN: all binary BNN implementation with fractional activations</td>
<td align="center">2b-[U,A]</td>
<td align="center">Xilinx Ultra96 v2 (for inference), 250&#xa0;MHz</td>
<td align="center">6.1&#xa0;W</td>
<td align="center">MobileNetV2 (ImageNet)</td>
<td align="center">yes, QAT</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B132">Wei et al. (2019)</xref>
</td>
<td align="left">Hybrid-type QNN on FPGA</td>
<td align="center">Hybrid 4-6b-[U,A] (W/A in conv.) <break/>&#x2b; floating in outputs</td>
<td align="center">FPGA Xilinx xc7k325tffg900-2<break/>100&#xa0;MHz, 73.5 36&#xa0;Kb BRAM<break/>11.91 Gbps</td>
<td align="center">-</td>
<td align="center">Lenet-5 (MSTAR)</td>
<td align="center">yes, QAT</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B102">Sun et al. (2022)</xref>
</td>
<td align="left">Intra-layer mixed-precision quantization based accelerator</td>
<td align="center">4b,8b-[U,A] (W)<break/>5b-[U,A] (A)</td>
<td align="center">Xilinx ZCU102<break/>150MHz, 440 BRAM</td>
<td align="center">12W, 24.8/69 GOPs/s/W (ResNet-50/MobileNet), 320/891 GOPS</td>
<td align="center">ResNet-18, ResNet-50<break/>MobileNet-v2</td>
<td align="center">QAT</td>
</tr>
<tr>
<td colspan="7" align="center">ASIC</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B13">Chen et al. (2016)</xref>
</td>
<td align="left">Eyeriss: CNN accelerator based on NoC</td>
<td align="center">16b-[U,-] (W/A)</td>
<td align="center">SIMD, 65nm, 181.5&#xa0;Kb SRAM<break/>12.25mm<sup>2</sup>, 200&#xa0;MHz</td>
<td align="center">153.6 GOPS, 0.82&#x2013;1.17V<break/>278&#xa0;mW (AlexNet), 236&#xa0;mW (VGG-16)</td>
<td align="center">AlexNet, VGG-16 (ImageNet)<break/>
</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B7">Biswas and Chandrakasan (2018)</xref>
</td>
<td align="left">CONV-SRAM: IMC architecture for convolution operation</td>
<td align="center">1b W<break/>6b A-[U,-]</td>
<td align="center">Crossbar array IMC<break/>65nm, 16KB SRAM, 250&#xa0;MHz</td>
<td align="center">51.3 TOPS/s/W<break/>57 GOPS/mm<sup>2</sup>, 4 GOPS</td>
<td align="center">Lenet-5 (CIFAR-10)</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B14">Chen et al. (2019)</xref>
</td>
<td align="left">Eyeriss v2: Hierarchical mesh NoC based accelerator</td>
<td align="center">8b-[U,-] (W/A)<break/>20b <inline-formula id="inf149">
<mml:math id="m170">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>PS</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">SIMD<break/>65nm, 246Kb SRAM, 200&#xa0;MHz</td>
<td align="center">253.2 GOPS/s/W (AlexNet)<break/>193.7 GOPS/s/W (MobileNet)</td>
<td align="center">AlexNet<break/>MobileNet</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B140">Yin et al. (2020)</xref>
</td>
<td align="left">XNOR-SRAM: Mixed-signal<break/>IMC architecture with ternary operation</td>
<td align="center">1b-[N,-] W<sup>&#x2a;3</sup>
</td>
<td align="center">Crossbar array IMC with SRAM<break/>65nm, 256 &#xd7; 64 SRAM</td>
<td align="center">403 TOPS/s/W<break/>5461 GOPS/mm<sup>2</sup>
</td>
<td align="center">CIFAR-10</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B8">Chang and Chang (2019)</xref>
</td>
<td align="left">2D systolic array based QNN</td>
<td align="center">8-16b-[U,-] (W/A)</td>
<td align="center">Systolic array, 144&#xa0;Kb SRAM<break/>1024&#xa0;PEs</td>
<td align="center">-</td>
<td align="center">Yolov3-tiny<break/>VGG-16, AlexNet</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B77">Liu et al. (2021)</xref>
</td>
<td align="left">VWA: Vectorwise CNN kernel accelerator based on systolic array</td>
<td align="center">16b-[U,-] (W/A)</td>
<td align="center">Systolic array<break/>40nm, 1.56&#xa0;mm<sup>2</sup> (core), <inline-formula id="inf150">
<mml:math id="m171">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>267</mml:mn>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (logic)<break/>191Kb SRAM, 500&#xa0;MHz,168PEs</td>
<td align="center">168 GOPS, 1.084 TOPS/s/W<break/>154&#xa0;mW (per core, VGG-16)</td>
<td align="center">VGG-16, ResNet-34<break/>GoogLeNet, Mobilenet (ImageNet)</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B84">Moons et al. (2017)</xref>
</td>
<td align="left">Envision: variable precision<break/>CNN processor</td>
<td align="center">4b,8b,16b-[U,-]</td>
<td align="center">SIMD, 28nm, 200&#xa0;MHz<break/>144KB SRAM, 1.87&#xa0;mm<sup>2</sup>
</td>
<td align="center">0.41 TOPS (4b/4b), 4.3 TOPS/s/W (4b/4b)<break/>0.43 TOPs/s/mm<sup>2</sup> (4b/4b)</td>
<td align="center">AlexNet<break/>VGG-16 (conv. only)</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B67">Lee et al. (2018),</xref> <xref ref-type="bibr" rid="B108">Shin et al. (2017)</xref>
</td>
<td align="left">UNPU: Variable bit precision accelerator for CNN and RNN and FC layers</td>
<td align="center">Variable<break/>1&#x2013;16&#xa0;b-[U,-]</td>
<td align="center">SIMD, 65&#xa0;nm CMOS<break/>16 <inline-formula id="inf151">
<mml:math id="m172">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> die, 256&#xa0;KB SRAM<break/>0.63&#x2013;1.1V, 200&#xa0;MHz, 13.18&#xa0;mm<sup>2</sup>
</td>
<td align="center">345.6 GOPS (16b), 7372 GOPS (1b)<break/>297&#xa0;mW @1.1V<break/>3.08 TOPS/s/W (16b), 50.6 TOPS/s/W (1b)</td>
<td align="center">AlexNet<break/>VGG-16 (ImageNet)</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B74">Lin et al. (2020a)</xref>
</td>
<td align="left">Dual-core deep-learning accelerator in 5G Smartphone SoC</td>
<td align="center">8b, 16b-[U,-]</td>
<td align="center">SIMD, 7nm, 290&#x2013;880&#xa0;MHz<break/>2176&#xa0;kB SRAM, 3.04&#xa0;mm<sup>2</sup>
</td>
<td align="center">3.6 TOPS (8b), 6.83 TOPS/s/W (8b)<break/>1.19 TOPs/s/mm<sup>2</sup> (8b)</td>
<td align="center">Inception-v3<break/>MobileNet-v1</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B51">Jiao et al. (2020)</xref>
</td>
<td align="left">Programmable Convolution-Efficient Neural-Processing-Unit chip</td>
<td align="center">8b, 16b-[U,-]</td>
<td align="center">SIMD, 12&#xa0;nm, 290&#x2013;880&#xa0;MHz<break/>196,608&#xa0;kB SRAM, 709 <inline-formula id="inf152">
<mml:math id="m173">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">825 TOPS (8b/8b), 2.95 TOPS/s/W (8b/8b)<break/>1.17 TOPs/s/mm<sup>2</sup> (8b/8b)</td>
<td align="center">ResNet50-v1</td>
<td align="center">no, PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B99">Ryu et al. (2022),</xref> <xref ref-type="bibr" rid="B100">Ryu et al. (2019)</xref>
</td>
<td align="left">Bitblade: Variable bit-precision accelerator</td>
<td align="center">2b,4b,8b-[U,-] (W/I)</td>
<td align="center">SIMD, 28nm, 44&#x2013;195&#xa0;MHz<break/>144KB SRAM, 0.71&#xa0;mm<sup>2</sup>
</td>
<td align="center">1.42 TOPS (2b/2b), 44.1 TOPS/s/W (2b/2b)<break/>3.3 TOPs/s/mm<sup>2</sup> (2b/2b)</td>
<td align="center">AlexNet, VGG-16<break/>ResNet-18, MobileNet</td>
<td align="center">no</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B77">Liu et al. (2021)</xref>
</td>
<td align="left">Multi-precision RRAM CNN architecture for layerwise quantization</td>
<td align="center">6&#x2013;10&#xa0;b-[U,A] (W)</td>
<td align="center">Crossbar array IMC with RRAM<break/>45nm, 100&#xa0;MHz</td>
<td align="center">3.44 TOPS/s/W (only crossbars)</td>
<td align="center">Lenet-5, VGG-16<break/>ResNet-18</td>
<td align="center">
<inline-formula id="inf153">
<mml:math id="m174">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>part.</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B123">Ueyoshi et al. (2018)</xref>
</td>
<td align="left">QUEST: Inference Engine with 3D-stacked SRAMs<break/>for CNN and RNN and FC layers</td>
<td align="center">Variable<break/>1-4b-[L,-] (W/A)</td>
<td align="center">MIMD, 40nm, 113 <inline-formula id="inf154">
<mml:math id="m175">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
<break/>7680&#xa0;KB SRAM<break/>1.1 V, 330&#xa0;MHz (max)</td>
<td align="center">1.98 TOPS (4b), 7.49 TOPS (1b), 3.3W<break/>2.27 TOPS/s/W (1b)<break/>0.59 TOPS/s/W (4b)</td>
<td align="center">AlexNet (ImageNet)<break/>LeNet (MNIST)<break/>VGG-11 (CIFAR-10)</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B101">Ryu et al. (2020)</xref>
</td>
<td align="left">Deep QNN accelerator with precision-scalable PEs</td>
<td align="center">Scalable precision<break/>4-16b-[U,-] (W/A)</td>
<td align="center">SIMD, 28nm, 0.71&#xa0;mm<sup>2</sup>
<break/>144&#xa0;KB SRAM<break/>44&#xa0;MHz @0.6V, 195&#xa0;MHz@1V</td>
<td align="center">1.42 TOPS for 2b/2b<break/>7.8&#xa0;mW@0.6V and 74&#xa0;mW@1V, 44.1 TOPS/s/W</td>
<td align="center">VGG-16, ResNet<break/>AlexNet (ImageNet)</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B104">Shafiee et al. (2016)</xref>
</td>
<td align="left">ISAAC RRAM-based accelerator</td>
<td align="center">16b-[N,-]</td>
<td align="center">Crossbar array IMC with RRAM<break/>32nm, 128 &#xd7; 128 tiles</td>
<td align="center">446 GOPs/s/mm<sup>2</sup>, 380 GOPs/s/W</td>
<td align="center">VGG</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B113">Song et al. (2017)</xref>
</td>
<td align="left">Pipelayer RRAM-based accelerator</td>
<td align="center">16b-[U,-]</td>
<td align="center">Crossbar array IMC with RRAM<break/>128 &#xd7; 128 tiles</td>
<td align="center">1485 GOPs/s/mm<sup>2</sup>, 142 GOPs/s/W</td>
<td align="center">AlexNet, VGG</td>
<td align="center">yes<break/>on-chip</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B119">Sun et al. (2018b)</xref>
</td>
<td align="left">XNOR-BNN with SAs</td>
<td align="center">1b-[U,-]</td>
<td align="center">Crossbar array IMC with RRAM<break/>45nm, 128 &#xd7; 128 tiles</td>
<td align="center">141 TOPs/s/W</td>
<td align="center">6 conv, 3FC</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B117">Sun et al. (2018a)</xref>
</td>
<td align="left">Binary CNN</td>
<td align="center">1b-[U,-]</td>
<td align="center">Crossbar array IMC with RRAM, 65&#xa0;nm</td>
<td align="center">137 TOPs/s/W</td>
<td align="center">4 conv, 3FC</td>
<td align="center">QAT (6b)</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B146">Zhu et al. (2019)</xref>
</td>
<td align="left">CNN with layer-wise quantization</td>
<td align="center">8b/6b-[U,-] (W/O)</td>
<td align="center">Crossbar array IMC with RRAM<break/>45nm, 256 &#xd7; 256 tiles</td>
<td align="center">3440 GOPs/s/W</td>
<td align="center">Lenet, VGG-16, ResNet</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B1">Ankit et al. (2019)</xref>
</td>
<td align="left">PUMA RRAM-based accelerator</td>
<td align="center">16b-[N,-]</td>
<td align="center">Crossbar array IMC with RRAM<break/>32nm, 128 &#xd7; 128 tiles</td>
<td align="center">577 GOPs/s/mm<sup>2</sup>, 837 GOPs/s/W</td>
<td align="center">VGG, LSTM</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B130">Wang et al. (2019b)</xref>
</td>
<td align="left">RRAM-based QNN inference architecture</td>
<td align="center">8b-[U,-]</td>
<td align="center">Crossbar array IMC with RRAM<break/>65nm, 256 &#xd7; 64 tiles</td>
<td align="center">5.9 TOPs/s/W</td>
<td align="center">VGG-16, MobileNet</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B138">Yao et al. (2020)</xref>
</td>
<td align="left">Fabricated RRAM-based CNN implementation</td>
<td align="center">3b-[U,A] (W)<break/>8b-[U,A] (I/O)</td>
<td align="center">Crossbar array IMC with RRAM<break/>130nm, 126 &#xd7; 16 tiles</td>
<td align="center">1164 GOPs/s/mm<sup>2</sup>, 11 TOPs/s/W</td>
<td align="center">2 conv, 1 FC</td>
<td align="center">hybrid<break/>PTQ</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B28">Gi et al. (2022)</xref>
</td>
<td align="left">RRAM-based CNN accelerator with analog layer normalization</td>
<td align="center">1b-[U,-] (W)<break/>12b (O)</td>
<td align="center">Crossbar array IMC with RRAM<break/>180nm, 25 &#xd7; 25 tiles (PCB)</td>
<td align="center">10 TOPS/W</td>
<td align="center">4-layer CNN (MNIST)</td>
<td align="center">no<break/>PTQ</td>
</tr>
<tr>
<td align="left"><xref ref-type="bibr" rid="B10">Chen et al. (2022)</xref></td>
<td align="left">RRAM-based CNN accelerator with capacitive coupling</td>
<td align="center">1b-[U,-] (W)<break/>1&#x2013;8b (O)</td>
<td align="center">Crossbar array IMC with RRAM<break/>28&#xa0;nm</td>
<td align="center">400 TOPS/W</td>
<td align="center">Customized CNN (MNIST)</td>
<td align="center">no<break/>PTQ</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x2a;W/A/I/O: weights/activations/inputs/outputs, &#x2a;2 PS: partial sums, &#x2a;3 binary/ternary inputs, &#x2a;4 based on reconstructed gradient, &#x2a;5 partially, retraining CNN after quantizing, &#x2a;6 U/L/N: uniform/logarithmic/non-uniform quantization, &#x2a;7 A/E/-: approximated gradients/exact gradients/not mentioned.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<sec id="s5-1">
<title>5.1 FPGA-based implementations of QNNs</title>
<p>Field Programmable Gate Arrays (FPGAs) are designed for fixed point computations implemented using lookup tables (LUTs). Even though floating-point computation is possible to implement on FPGA using Digital Signal Processing (DSP) blocks, this is expensive and inefficient. To convert a QNN design into FPGA-based hardware implementation, different frameworks can be created to automate such conversion (<xref ref-type="bibr" rid="B125">Umuroglu et al., 2017</xref>; <xref ref-type="bibr" rid="B124">2020</xref>). For example, LogicNets framework converts trained QNNs into equivalent netlists of truth tables for FPGA including network sparsity exploration to reduce neuron fan-in (<xref ref-type="bibr" rid="B124">Umuroglu et al., 2020</xref>). Fan-in reduction contributes to efficient LUT-based QNN implementations on FPGA.</p>
<sec id="s5-1-1">
<title>5.1.1 Binarized and multi-bit precision neural networks on FPGA</title>
<p>Binarized Neural Networks (BNNs) are the most resource-efficient QNN designs on FPGA (<xref ref-type="fig" rid="F10">Figure 10</xref>) (<xref ref-type="bibr" rid="B125">Umuroglu et al., 2017</xref>; <xref ref-type="bibr" rid="B32">Guo et al., 2018</xref>; <xref ref-type="bibr" rid="B142">Zhang et al., 2021</xref>; <xref ref-type="bibr" rid="B93">Qin et al., 2020</xref>). One of the most well-known BNN accelerators on FPGA is FINN (<xref ref-type="bibr" rid="B125">Umuroglu et al., 2017</xref>), which automatically converts Theano-trained BNN to synthesizable C&#x2b;&#x2b; description with optimized hardware blocks to synthesize a bitfile through High-Level Synthesis (HLS) software to deploy to FPGA. The binarization of neural network weights reduces memory consumption and improves computation speed. Typically, the input layer of BNN is not binarized to preserve input features and accuracy. To binarize the input layer in BNN, binary padding can be used to provide resource parallelism and scalability for FPGA-based implementations, as in (<xref ref-type="bibr" rid="B32">Guo et al., 2018</xref>; <xref ref-type="bibr" rid="B142">Zhang et al., 2021</xref>).</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Power <bold>(a)</bold> and area <bold>(b)</bold> efficiency of QNN hardware implementations with respect to weight precision.</p>
</caption>
<graphic xlink:href="felec-06-1469802-g010.tif">
<alt-text content-type="machine-generated">Two scatter plots labeled (a) and (b) compare power efficiency and area efficiency against bits in weights. Each plot features various technologies like FPGAs, ASICs, and RRAM, represented by different shapes and colors. An arrow indicates increasing efficiency. Plot (a) shows power efficiency in GOPS per watt, while plot (b) displays area efficiency in GOPS per square millimeter. Key technologies include XNOR-SRAM, Bitblade, Lin et al., Envision, and ISAAC. Legends specify the symbols used.</alt-text>
</graphic>
</fig>
<p>Maintaining high accuracy after binarization is one of the main challenges of BNN, which requires specific training methods. Real-to-Binary Net framework proposed in (<xref ref-type="bibr" rid="B80">Martinez et al., 2020</xref>) performs progressive teacher-student training. Starting with a full-precision teacher model and a student model with soft-binarized activations (using <inline-formula id="inf155">
<mml:math id="m176">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> function), the student model is trained with additional guidance from the teacher model. In the following steps, the student model from the previous step becomes the teacher model in the current step, and the activations and weights of the new student model are progressively quantized in each step. By performing progressive teacher-student training, the resulting BNN experiences less accuracy degradation. The other method to preserve BNN accuracy is precision gating, where the important features are computed using higher precision. In FracBNN (<xref ref-type="bibr" rid="B142">Zhang et al., 2021</xref>), a dual-precision activation quantization is implemented where activations are quantized with either 1-bit or 2-bit based on their contribution to network accuracy (determined by a trainable parameter). An additional sparse binary convolution for the additional bit is performed for those critical activations that need to be quantized with 2 bits.</p>
<p>There have been several multi-bit FPGA-based implementations of QNN proposed recently (<xref ref-type="bibr" rid="B20">Ding et al., 2019</xref>; <xref ref-type="bibr" rid="B44">Hu et al., 2022</xref>; <xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>). One of the most common quantized weight representations used for FPGA-based QNNs is the power-of-two quantization method, as in FlightNN (<xref ref-type="bibr" rid="B20">Ding et al., 2019</xref>). To improve hardware efficiency further, the multiplication operations can be replaced by a lightweight shift operation (<xref ref-type="bibr" rid="B20">Ding et al., 2019</xref>) or be approximated by a different number of shift-and-add operations (<xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>). To improve QNN efficiency further, the design of DSP blocks for quantized MAC operations can be optimized (<xref ref-type="bibr" rid="B44">Hu et al., 2022</xref>).</p>
</sec>
<sec id="s5-1-2">
<title>5.1.2 Mixed-precision and hybrid neural networks on FPGA</title>
<p>Mixed-precision and hybrid QNN designs require additional design considerations for efficient implementation. Inconsistent precision throughout the neural network layers can affect the utilization of heterogeneous FPGA hardware resources (<xref ref-type="bibr" rid="B9">Chang et al., 2021</xref>). In the Mix-and-Match FPGA-based QNN optimization framework (<xref ref-type="bibr" rid="B9">Chang et al., 2021</xref>), this problem is avoided using mixed quantization, combining sum-of-power-of-2 (SP2) and fixed-point quantization schemes for different rows of weight matrix due to different distribution of weights in different rows. The quantization scheme can also be adjusted for the distribution of the weights. For example, the Mix-and-Match framework uses the quantization scheme suitable for Gaussian-like weight distribution, where multiplication arithmetic is replaced with logic shifters and adders that can be implemented on FPGA using LUTs.</p>
<p>Hybrid quantization can also be used to improve QNN accuracy and efficiency in FPGA-based implementations. For example, in hybrid-type inference in (<xref ref-type="bibr" rid="B132">Wei et al., 2019</xref>), both convolution kernels (feature maps) and parameters are quantized to a signed integer, while integer/floating mixed calculations are used for the outputs. In the inference phase, the weights and activations in convolution layers are quantized, while the dot product output is de-quantized and represented as a 32-bit floating-point number before batch-normalization operation. The floating-point batch normalization output is fetched to the activation function, and the activation function output is quantized to integer representation. This helps to reduce the number of LUTs, flip-flops, DSP blocks, and BRAM blocks in the design.</p>
<p>Mixed-precision can also be used for intra-layer quantization. In (<xref ref-type="bibr" rid="B116">Sun et al., 2022</xref>), a mixed-precision algorithm combines a majority of low-precision weights, e.g., 4 bits, with a minority of high-precision weights, e.g., 8 bits, within a layer. The weights leading to high quantization errors are assigned to be of high precision. Moreover, in (<xref ref-type="bibr" rid="B116">Sun et al., 2022</xref>), quantization optimization techniques, including DSP packing, weight reordering, and data packing, are used.</p>
</sec>
</sec>
<sec id="s5-2">
<title>5.2 ASIC implementations of QNNs</title>
<p>ASIC implementations of QNNs can be broadly categorized into conventional digital and mixed-signal designs, such as systolic arrays (<xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>; <xref ref-type="bibr" rid="B77">Liu et al., 2021</xref>) or single/multiple instruction multiple data (S/MIMD)-based architectures with multiple cores (<xref ref-type="bibr" rid="B67">Lee et al., 2018</xref>; <xref ref-type="bibr" rid="B108">Shin et al., 2017</xref>), as well as designs leveraging emerging technologies like in-memory computing (IMC) (<xref ref-type="bibr" rid="B59">Krestinskaya et al., 2022</xref>; <xref ref-type="bibr" rid="B54">2024a</xref>) and neuromorphic computing (<xref ref-type="bibr" rid="B107">Shen G. et al., 2024</xref>; <xref ref-type="bibr" rid="B81">Matinizadeh et al., 2024</xref>). Among these emerging technologies, SRAM- and RRAM-based IMC implementations have advanced the most; therefore, this work primarily focuses on them. The key distinction between IMC-based designs and traditional von Neumann architectures, where memory and processing units are separate, is that computation occurs directly within the memory. IMC designs can be based on either volatile (SRAMs and DRAMs) or non-volatile memory devices, e.g., resistive random-access memory devices (RRAMs), phase-change memory devices (PCM or PCRAM), etc. (<xref ref-type="bibr" rid="B60">Krestinskaya et al., 2023</xref>).</p>
<sec id="s5-2-1">
<title>5.2.1 Fixed-precision ASIC implementations of QNN</title>
<p>Based on <xref ref-type="fig" rid="F9">Figure 9</xref>, ASIC implementations of QNNs are more efficient than FPGA-based implementations, as they are usually hardwired in an optimum way and cannot be reconfigured. As with FPGA-based designs, ASIC-based implementations also use a shift operator instead of the multipliers via power-of-two quantization to improve energy efficiency. The multiplication can also be converted to two shift operations and one addition, as in LightNN (<xref ref-type="bibr" rid="B21">Ding et al., 2017</xref>; <xref ref-type="bibr" rid="B19">Ding et al., 2018</xref>). Also, approximate multiplication can be used, which drops the least significant powers of two, limiting the number of shifts and adds (<xref ref-type="bibr" rid="B19">Ding et al., 2018</xref>). Some ASIC accelerator designs retain a certain level of flexibility (<xref ref-type="bibr" rid="B83">Moon et al., 2022</xref>; <xref ref-type="bibr" rid="B69">Lee S. K. et al., 2021</xref>). In (<xref ref-type="bibr" rid="B83">Moon et al., 2022</xref>), a framework supporting 1- to 4-bit arbitrary base quantization (<xref ref-type="bibr" rid="B91">Park et al., 2017</xref>) is proposed. For arbitrary base quantization, hardware blocks performing sorting, grouping, and population counting are adopted. In (<xref ref-type="bibr" rid="B69">Lee S. K. et al., 2021</xref>), an accelerator supporting both 8/16-bit floating point format and 2/4-bit integer format, where data pipelines for these formats are separated and implemented in dedicated hardware, is proposed. This accelerator supports both training and inference using floating point and integer data, respectively.</p>
<p>The hardware efficiency of QNN accelerators is affected by architecture hierarchy, organization of processing elements (PEs), network-on-chip (NoC) structure, and the type of NoC. For example, Eyeriss is another accelerator using 16-bit fixed-point computation, where data movement and DRAM access are reduced by reusing data locally (<xref ref-type="bibr" rid="B13">Chen et al., 2016</xref>). The improved version of Eyeriss, Eyeriss v2 (<xref ref-type="bibr" rid="B14">Chen et al., 2019</xref>), has a hierarchical mesh NoC with sparse PE architecture adaptable to the different amounts of data reuse and bandwidth requirements, aiming to improve resource utilization.</p>
<p>CONV-SRAM (<xref ref-type="bibr" rid="B7">Biswas and Chandrakasan, 2018</xref>) and XNOR-SRAM (<xref ref-type="bibr" rid="B140">Yin et al., 2020</xref>) architectures are other ASIC QNN accelerators designed to improve energy efficiency and reduce the number of computations. In (<xref ref-type="bibr" rid="B140">Yin et al., 2020</xref>), binary weights and ternary data representation [-1,0,1] for XNOR-and-accumulate operation are used. To improve computation speed and reduce memory access, systolic array-based CNN implementation can be used (<xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>; <xref ref-type="bibr" rid="B77">Liu et al., 2021</xref>). In (<xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>), a systolic array-based CNN, VWA, aiming for high hardware utilization with a low area overhead and suitable for different sizes of convolution kernels with 8-bit fixed point computation is shown. In (<xref ref-type="bibr" rid="B77">Liu et al., 2021</xref>), the systolic array-based accelerator with 8/16-bit integer linear symmetric quantization of both activations and weights in convolution and fully connected layers is illustrated.</p>
<p>In IMC-based implementations, SRAM-based QNN architectures, such as CONV-SRAM (<xref ref-type="bibr" rid="B7">Biswas and Chandrakasan, 2018</xref>) and XNOR-SRAM (<xref ref-type="bibr" rid="B140">Yin et al., 2020</xref>), offer greater energy efficiency than traditional designs. Meanwhile, RRAM-based QNNs provide high computational density, energy efficiency, non-volatility, and scalability (<xref ref-type="bibr" rid="B59">Krestinskaya et al., 2022</xref>; <xref ref-type="bibr" rid="B112">Smagulova et al., 2023</xref>). With non-volatile multi-level memories, MAC operations occur in the analog domain, enabling higher storage density and faster computations (<xref ref-type="bibr" rid="B57">Krestinskaya and James, 2020</xref>). In IMC architectures, memory devices in a crossbar structure multiply row voltages by device conductances (weights), with accumulated column current as the MAC output. Quantization in multi-level IMC arises from the limited conductance levels per device (<xref ref-type="bibr" rid="B146">Zhu et al., 2019</xref>). Activation quantization is managed by peripheral DACs and ADCs. However, non-volatile IMC devices face variations, non-linear switching, and conductance drift, which require mitigation techniques.</p>
<p>In IMC-based binarized neural networks (BNNs), weights are represented using 1-bit or multi-level devices, utilizing only a high-resistive state (HRS) and a low-resistive state (LRS). Low-bit IMC designs are simpler, more robust, and less susceptible to device variations than higher-bit IMC architectures. Several RRAM-based BNN implementations have been proposed, including those in (<xref ref-type="bibr" rid="B119">Sun et al., 2018b</xref>; <xref ref-type="bibr" rid="B117">a</xref>). In (<xref ref-type="bibr" rid="B119">Sun et al., 2018b</xref>), MAC operations are performed using XNOR logic, enabling the replacement of complex, power-hungry ADCs with 1-bit sense amplifiers (SAs) (<xref ref-type="bibr" rid="B117">Sun et al., 2018a</xref>). To enhance area efficiency, <xref ref-type="bibr" rid="B10">Chen et al. (2022)</xref> introduce an RRAM-based accelerator using capacitive coupling (1T1R1C) cells with binary weights and multi-bit output. In (<xref ref-type="bibr" rid="B28">Gi et al., 2022</xref>), an RRAM-based accelerator with analog layer normalization is proposed, eliminating the need to store intermediate layer outputs in external memory. Meanwhile, <xref ref-type="bibr" rid="B52">Kim et al. (2022)</xref> present an ADC-free RRAM-based BNN, reducing hardware overhead compared to conventional RRAM-based IMC architectures with ADCs (<xref ref-type="bibr" rid="B119">Sun et al., 2018b</xref>).</p>
<p>Multi-bit IMC-based QNN implementations have the advantage of higher computation density but may suffer from ADC complexity (<xref ref-type="bibr" rid="B59">Krestinskaya et al., 2022</xref>). In IMC architectures, high-precision neural network weights are often formed by combining several low-bit IMC devices in a crossbar (<xref ref-type="bibr" rid="B60">Krestinskaya et al., 2023</xref>). Designs combining several low-bit RRAM cells for higher-precision weights are shown in (<xref ref-type="bibr" rid="B104">Shafiee et al., 2016</xref>; <xref ref-type="bibr" rid="B113">Song et al., 2017</xref>; <xref ref-type="bibr" rid="B1">Ankit et al., 2019</xref>; <xref ref-type="bibr" rid="B130">Wang Q. et al., 2019</xref>), where higher-bit weights, e.g., 8 or 16 bits, are represented by 2-bit and 4-bit devices. Most IMC-based QNN accelerators process high-precision inputs using low-precision DACs and serial encoding, as seen in ISAAC (<xref ref-type="bibr" rid="B104">Shafiee et al., 2016</xref>), Pipelayer (<xref ref-type="bibr" rid="B113">Song et al., 2017</xref>), and PUMA (<xref ref-type="bibr" rid="B1">Ankit et al., 2019</xref>). ISAAC reduces ADC precision requirements by storing weights in both original and flipped forms to maximize zero-sums (<xref ref-type="bibr" rid="B104">Shafiee et al., 2016</xref>). Pipelayer enhances efficiency by leveraging intra-layer parallelism for training and inference (<xref ref-type="bibr" rid="B113">Song et al., 2017</xref>). PUMA employs a Network-on-Chip (NoC) architecture, where multiple cores, each integrating an RRAM crossbar and CMOS peripherals, facilitate scalable computation. Additionally, its specialized instruction set architecture (ISA) and spatial architecture explicitly capture various access and reuse patterns, reducing the energy cost of moving data (<xref ref-type="bibr" rid="B1">Ankit et al., 2019</xref>). 
A fabricated CNN architecture presented in (<xref ref-type="bibr" rid="B138">Yao et al., 2020</xref>) adopts hybrid training to mitigate device variations and uses multiple copies of identical kernels in different parts of the memristor array so that the same weight data can be applied in parallel to different inputs. The network is first trained off-chip and then fine-tuned on-chip to improve robustness against hardware non-idealities.</p>
</sec>
<sec id="s5-2-2">
<title>5.2.2 Variable-precision and layer-wise quantization in ASIC implementations of QNN</title>
<p>Variable precision in ASIC QNN implementations aims to optimize the energy efficiency and the number of memory accesses without reducing the performance accuracy (<xref ref-type="bibr" rid="B51">Jiao et al., 2020</xref>; <xref ref-type="bibr" rid="B123">Ueyoshi et al., 2018</xref>). Fully fabricated CNN accelerators with variable precision are demonstrated in (<xref ref-type="bibr" rid="B74">Lin C.-H. et al., 2020</xref>; <xref ref-type="bibr" rid="B51">Jiao et al., 2020</xref>). State-of-the-art variable-precision QNN designs support flexibility and can vary the precision of neural network weights, as in a unified neural processing unit (UNPU) (<xref ref-type="bibr" rid="B67">Lee et al., 2018</xref>; <xref ref-type="bibr" rid="B108">Shin et al., 2017</xref>) supporting convolution, fully connected, and recurrent network layers. UNPU also explores the full architecture hierarchy of QNN accelerator, including 2-D mesh type NoC with the unified DNN cores including weights memory and PE performing MAC operation, 1-D SIMD core, RISC controller for instructions execution, aggregation core, and two external gateways connected to this NoC. The main aim of UNPU is to achieve the trade-off between accuracy and energy consumption.</p>
<p>Variable precision configuration can be controlled by additional circuit blocks supporting the variable quantization and additional hardware modifications. For example, in Envision (<xref ref-type="bibr" rid="B84">Moons et al., 2017</xref>), a dynamic-voltage-accuracy-frequency-scalable (DVAFS) multiplier switching on and off sub-multipliers to control the precision is used. The main drawback of such an approach is inefficient hardware utilization for low-precision operation, e.g., for 4-bit precision configuration, only 25% of sub-multipliers are utilized. In another variable-precision accelerator, Bit Fusion (<xref ref-type="bibr" rid="B105">Sharma et al., 2018</xref>), bit-level processing elements dynamically fuse to match the bit-width of individual DNN layers, aiming to reduce computation and communication costs. It divides the MAC operations into multiple operations to support variable precision, reducing the number of required resources. In BitBlade (<xref ref-type="bibr" rid="B99">Ryu et al., 2022</xref>; <xref ref-type="bibr" rid="B100">2019</xref>), a bit-wise summation method based on <inline-formula id="inf156">
<mml:math id="m177">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>-bit multiplications followed by shift-addition operations supporting variable bit-widths of input activations and weights is used, aiming to reduce the number of memory accesses. In addition, some QNN accelerators can read only required data bits in the memory datawords depending on the precision, as in Quant-PIM (<xref ref-type="bibr" rid="B70">Lee Y. S. et al., 2021</xref>), which also reduces the number of memory accesses.</p>
<p>IMC-based QNN designs with layer-wise quantization and variable precision are demonstrated in (<xref ref-type="bibr" rid="B146">Zhu et al., 2019</xref>; <xref ref-type="bibr" rid="B77">Liu et al., 2021</xref>; <xref ref-type="bibr" rid="B124">Umuroglu et al., 2020</xref>).</p>
</sec>
</sec>
<sec id="s5-3">
<title>5.3 QNN hardware challenges and open problems</title>
<sec id="s5-3-1">
<title>5.3.1 Memory access issues</title>
<p>The complexity of state-of-the-art network models and the number of weights stored in the memory grow exponentially with the network size. Therefore, memory access and communication between memory and processor become the main bottleneck for speed and energy consumption rather than computation. According to (<xref ref-type="bibr" rid="B127">Wang J. et al., 2019</xref>), the bus bandwidth between the memory and processing unit is around 167&#xa0;GB/s, while the reading operation bandwidth in traditional SRAM memories is 328&#xa0;TB/s. The trend is the same for the energy spent on data transmission between the memory and processing unit. If the readout operation requires an energy of 1.6&#xa0;pJ, the data transmission may take up to 42&#xa0;pJ in the same system (<xref ref-type="bibr" rid="B127">Wang J. et al., 2019</xref>). Overall, the problem with memory access is common for all types of neural network hardware implementations. Even though QNN designs target the reduction of memory accesses by lowering the computation precision, thus reducing the number of stored bits, the memory access problem is still relevant.</p>
<p>The problem of memory access is addressed by IMC-based designs keeping the processing of MAC operations close to the memory. However, the local or external memory is required to store the outputs of intermediate layers in the inference and preserve the gradients during the training. Even though the memory access challenge is reduced in IMC-based QNN implementations, IMC-based architectures can experience other problems related to the immaturity of non-volatile memory, which is the cause of device non-idealities. In addition, thorough design considerations are still required to create efficient QNN architectures, especially for on-chip QNN training.</p>
</sec>
<sec id="s5-3-2">
<title>5.3.2 Hardware overhead and hardware utilization in variable and reconfigurable precision designs</title>
<p>Flexibility and reconfigurability of the architecture are key for moving from task-specific to general-purpose neural network architectures. However, this reconfigurability leads to area overhead and hardware underutilization (<xref ref-type="bibr" rid="B101">Ryu et al., 2020</xref>). In QNN designs with variable bit precision and layer-wise quantization, the implementation of bit-reconfigurable designs and circuits is necessary to ensure minimum hardware overhead and the efficient utilization of hardware resources. Several mixed-precision quantization frameworks mentioned in previous sections focus on improving energy efficiency; however, they do not consider the control circuit overhead to implement mixed-precision models. For example, the FPGA-based QNN architecture FlightNN is based on mixed-precision convolution filters on FPGA, but it does not discuss the challenges of a full architecture implementation and scheduling (<xref ref-type="bibr" rid="B20">Ding et al., 2019</xref>).</p>
<p>Variable precision within and between QNN layers and adaptive quantization based on the distribution of weights and activations (<xref ref-type="bibr" rid="B19">Ding et al., 2018</xref>) may also lead to inefficient resource utilization. In many cases of variable bit precision in QNNs, the extra weights are simply switched off, causing hardware utilization inefficiency. This problem is also valid for QNN on-chip training, where full precision computation is often required for weight update while the inference is typically quantized. To implement this, a QNN accelerator should support both full-precision and fixed-precision quantization. However, full-precision computations are not used during inference, leading to inefficient hardware utilization.</p>
</sec>
<sec id="s5-3-3">
<title>5.3.3 Lack of efficient on-chip training on quantized hardware</title>
<p>Training complexity and duration are the other QNN challenges. The lack of differentiable gradients in QNN training leads to more training iterations compared to full-precision networks. Moreover, QNN training algorithms use full-precision computations for weight updates (<xref ref-type="bibr" rid="B19">Ding et al., 2018</xref>). Therefore, transferring such an algorithm to low-power hardware for on-chip training is complicated leading to the lack of QNN on-chip training architectures. In addition, such architectures may require variable precision support, and additional hardware overhead for routing, computation, and additional memory to store intermediate outputs during the training.</p>
<p>Several QNN frameworks make attempts to simplify the on-chip training of QNNs (<xref ref-type="bibr" rid="B132">Wei et al., 2019</xref>). For example, the reconstructed gradients in backpropagation can be used to solve the vanishing gradient problem instead of STE (<xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>). Merging quantization and de-quantization operations can be used to perform &#x201c;fake quantization&#x201d; to improve QNN accuracy with low bit-precision (<xref ref-type="bibr" rid="B77">Liu et al., 2021</xref>). However, some functions still require full-precision computation. Implementation of QNN training algorithms with low-precision weight updates is also possible. For example, in LNS-Madam, training precision is reduced to 4 bits combining a logarithmic number system (LNS) and a multiplicative weight update (<xref ref-type="bibr" rid="B143">Zhao et al., 2022</xref>). However, developing such algorithms and the related hardware implementations for low-precision QNN training is still an open challenge.</p>
</sec>
<sec id="s5-3-4">
<title>5.3.4 Automated mixed-precision quantization</title>
<p>In some cases, it may be difficult to find the optimum quantization precision within or between the layers manually. Therefore, the automated mixed-precision quantization techniques are used to convert a software-based QNN to a hardware implementation (<xref ref-type="bibr" rid="B5">Benmeziane et al., 2021</xref>). Automated mixed precision quantization is a part of hardware-aware neural network search (HW-NAS). Various optimization techniques, from constrained problem optimization to reinforcement learning and evolutionary algorithm-based methods, which automatically assign multiple bits to the layer, can be applied for automated mixed-precision quantization. The main problems in this domain include a large search space and the high computational cost required for such a search. Also, many approaches do not consider hardware-related metrics in such optimization.</p>
</sec>
</sec>
<sec id="s5-4">
<title>5.4 General considerations for hardware-software co-design in QNN</title>
<p>Hardware-software co-design implies efficient mapping and optimization of a software-based neural network to hardware (<xref ref-type="bibr" rid="B54">Krestinskaya et al., 2024a</xref>; <xref ref-type="bibr" rid="B55">2024b</xref>). For full-precision networks, this can be accomplished by compilers and software development kits (SDK), e.g., GLOW (<xref ref-type="bibr" rid="B98">Rotem et al., 2018</xref>), ONNX (<xref ref-type="bibr" rid="B90">ONNX, 2024</xref>), and TensorRT (for Nvidia GPUs) (<xref ref-type="bibr" rid="B89">Nvidia, 2024</xref>), focusing on the optimization of instruction scheduling and memory allocation based on the target platform specifications. Similarly, this can be done for QNNs with moderate bit-widths <inline-formula id="inf157">
<mml:math id="m178">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. AIMET (<xref ref-type="bibr" rid="B109">Siddegowda et al., 2022</xref>) is one of the toolkits supporting different model compression techniques (pruning, quantization) and corresponding optimization and evaluation with target hardware runtime configuration provided. However, highly specialized QNN hardware accelerators require dedicated software-hardware co-design tailored to the specific accelerator.</p>
<p>A QNN accelerator can be divided into two parts: the off-chip host computer and the on-chip acceleration hardware. The off-chip host computer runs the software application, transmits the data between off-chip storage (e.g., DRAM) and on-chip data buffers, and reconfigures the on-chip hardware by sending control signals. The on-chip hardware executes neural network operations in parallel with arrays of processing engines. The interaction between these two parts should be optimized. By the level of operation executed on-chip each time, the accelerator designs can be divided into three categories: network-level acceleration, layer-level acceleration, and tensor-level acceleration. In network-level acceleration designs, a complete neural network is implemented on-chip, achieving the best throughput and efficiency while being capable of processing only simple QNN models due to the limited on-chip storage capacity. With increased complexity and quantization bits, the accelerator design switches to layer-level or even tensor-level acceleration with lower throughput and efficiency due to the frequent loading of data for different layers/tensors. Different accelerators require specific hardware-software co-design and optimization techniques to reach the optimum efficiency.</p>
<sec id="s5-4-1">
<title>5.4.1 Processing element (PE) optimization</title>
<p>According to <xref ref-type="table" rid="T3">Table 3</xref>, FPGA-based accelerator designs favor low bit-width quantization (<xref ref-type="bibr" rid="B125">Umuroglu et al., 2017</xref>; <xref ref-type="bibr" rid="B32">Guo et al., 2018</xref>; <xref ref-type="bibr" rid="B142">Zhang et al., 2021</xref>). With binary quantization, the MAC operations can be replaced by XNOR and bit count operations, which can be efficiently implemented using LUTs. With higher bit-width uniform quantization, by contrast, the MAC operation is more efficient on DSP blocks (<xref ref-type="bibr" rid="B9">Chang et al., 2021</xref>). Meanwhile, some designs adopt logarithmic quantization on weights, simplifying the multiply operation to a shift operation carried out by LUTs (<xref ref-type="bibr" rid="B11">Chen et al., 2020</xref>).</p>
<p>The PE implementation of an ASIC QNN accelerator can be divided into two categories based on the domain where the computation is performed: (1) analog domain-based IMC with SRAM and RRAM (<xref ref-type="bibr" rid="B7">Biswas and Chandrakasan, 2018</xref>; <xref ref-type="bibr" rid="B140">Yin et al., 2020</xref>), and (2) digital domain with classical digital adders and multipliers (<xref ref-type="bibr" rid="B13">Chen et al., 2016</xref>; <xref ref-type="bibr" rid="B14">2019</xref>; <xref ref-type="bibr" rid="B8">Chang and Chang, 2019</xref>). In the first category, an SRAM or RRAM cell stores one or more bits of data, requiring analog or mixed-signal computation level optimizations (e.g., crossbar and peripheral circuits). SRAMs have fast and efficient writing capabilities; in turn, the design can be easily reconfigured to different weight values. Therefore, the SRAM-based accelerators perform layer-level or tensor-level acceleration. Different from SRAMs, the non-volatile memory elements do not support runtime write operation; however, such cells are more area-efficient and dense. Hence, network-level acceleration with higher bit-width is more suitable for non-volatile memory-based crossbar designs.</p>
<p>In the second category, the ASIC implementation of adders and multipliers can benefit from explicit optimization. Therefore, compared with FPGA-based accelerators, the ASIC implementation favors a data format with a higher bit-width <inline-formula id="inf158">
<mml:math id="m179">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> for highly accurate QNNs, leading to larger storage requirements. Consequently, such accelerators are better suited to layer-level or tensor-level acceleration than to network-level acceleration. In ASIC accelerators, approximate arithmetic logic can be adopted to reduce computation complexity and power consumption (<xref ref-type="bibr" rid="B38">Hanif and Shafique, 2022</xref>; <xref ref-type="bibr" rid="B85">Mrazek et al., 2019</xref>; <xref ref-type="bibr" rid="B126">Venkataramani et al., 2014</xref>). In addition to the adder and multiplier, local memory/buffers can be assigned to the PEs, as well as an optional accumulator, especially when the PEs are arranged to form a systolic array (<xref ref-type="bibr" rid="B77">Liu et al., 2021</xref>). Like FPGA implementations, some ASIC-based accelerators adopt logarithmic quantization (power-of-2) to achieve more efficient computation.</p>
</sec>
<sec id="s5-4-2">
<title>5.4.2 Auxiliary operations optimization</title>
<p>Apart from the major matrix-vector multiplication operations in neural network inference, other operations like batch normalization, activation, pooling, etc., are referred to as auxiliary operations in this section. In ASIC- and FPGA-based designs, one of the optimizations of auxiliary operations in QNNs is operation fusion. For example, the batch normalization layer first normalizes the tensor based on historical statistical data and then applies an affine transform to it. During inference, these two operations can be fused into one linear transform of the tensor, with both the normalization and affine parameters being constant:<disp-formula id="e22">
<mml:math id="m180">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>In <xref ref-type="disp-formula" rid="e22">Equation 22</xref>, <inline-formula id="inf159">
<mml:math id="m181">
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the statistical mean and standard deviation, respectively, and <inline-formula id="inf160">
<mml:math id="m182">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are affine parameters. Such a linear operation can be further fused with the prior linear or convolution layers. The fusion of batch normalization layers is called <italic>batch normalization folding</italic>. Unlike in inference, the statistical data <inline-formula id="inf161">
<mml:math id="m183">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and affine parameters <inline-formula id="inf162">
<mml:math id="m184">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are updated with different mechanisms during training, making it difficult to simulate batch normalization fusion during training. Hence, various batch normalization folding strategies have been developed considering the trade-off between training quality and training cost (<xref ref-type="bibr" rid="B72">Li et al., 2021</xref>; <xref ref-type="bibr" rid="B61">Krishnamoorthi, 2018</xref>; <xref ref-type="bibr" rid="B47">Jacob et al., 2018</xref>).</p>
<p>Another possible fusion is that of binary quantization and ReLU, where the scale term in the fused operation can be omitted if the output is directly quantized (e.g., not a bypass in a ResNet). The ReLU activation function can be implemented as compare-with-zero logic. It should be noted that such compare-with-zero logic is not equivalent to the sign function (which returns zero when the input is zero) used to perform binary quantization in the software training phase. Hence, it is important to explicitly output either 1 or <inline-formula id="inf163">
<mml:math id="m185">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, especially when binarizing activations with values being 0.</p>
<p>Compared to FPGA, crossbar-based accelerators can efficiently execute most operations, e.g., convolution, linear operations, etc. However, operations like pooling, activation, and batch normalization need to be performed in the peripheral auxiliary blocks, commonly in the digital domain (rarely in analog (<xref ref-type="bibr" rid="B58">Krestinskaya et al., 2018</xref>)). Hence, crossbar array-based designs typically do not involve batch normalization fusion.</p>
</sec>
<sec id="s5-4-3">
<title>5.4.3 Data routing and reconfigurability</title>
<p>As the ASIC-based accelerators focus mostly on layer-level or tensor-level acceleration, to maximize the throughput and hardware utilization, the data flow should be routed efficiently and the PE arrangement should be reconfigured flexibly. To reduce unnecessary data movement, different levels of data reuse are implemented by broadcasting and multi-casting the common input to multiple PEs. Additionally, the inputs to the PEs are multiplexed increasing the reconfigurability of the PE array. Furthermore, the PEs can be organized into a network-on-chip which substantially increases both data routing efficiency and the reconfigurability of the PE array (<xref ref-type="bibr" rid="B14">Chen et al., 2019</xref>). In general, a data reuse strategy should be designed according to the accelerated operation. For instance, traditional convolution should have a different data reuse strategy from depth-wise separable convolutions.</p>
</sec>
<sec id="s5-4-4">
<title>5.4.4 Data value mismatch</title>
<p>Unlike FPGA or ASIC implementations, the crossbar array-based accelerators perform computation in the analog domain. Therefore, there is a performance gap when pre-trained QNNs are directly deployed on the crossbars, due to device and circuit non-idealities of the crossbar and IMC cells. These non-idealities cause data mismatches between software and hardware. They include device variation, non-linear switching, conductance drift, ADC/DAC non-linearity, mismatch, etc. (<xref ref-type="bibr" rid="B56">Krestinskaya et al., 2019</xref>). To reduce the performance gap, these non-idealities should be modeled and explicitly considered during neural network training (<xref ref-type="bibr" rid="B134">Xiao et al., 2022</xref>).</p>
</sec>
</sec>
</sec>
<sec id="s6">
<title>6 Discussion and future directions</title>
<p>In this paper, we discuss different types of quantization and QNN training methods. These methods can generate well-trained QNNs featuring accuracy comparable to that of full-precision models. However, high accuracy is achieved at the cost of involving full-precision parameters (like scale factors). Even though these full-precision parameters do not cause an obvious model size increase (shown in <xref ref-type="fig" rid="F7">Figure 7</xref>), they introduce computation overhead, especially when there is no dedicated floating-point unit in the hardware accelerators. Meanwhile, during backward propagation, all the aforementioned methods rely on full-precision weights or fixed-point weights with large bit-width to accumulate the gradient.</p>
<p>Various hardware accelerator designs (introduced in <xref ref-type="sec" rid="s5">Section 5</xref>) can achieve higher computation efficiencies compared with traditional general-purpose computation units (GPU/CPU). However, most of the accelerator designs only support efficient neural network inference rather than training. Additionally, higher reconfigurability is expected from the accelerator designs, which is a key component for edge online learning or federated learning.</p>
<p>From algorithm and hardware co-design perspectives, we propose future directions for both the QNN algorithms and accelerator designs as summarized in <xref ref-type="fig" rid="F11">Figure 11</xref>.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Technology advancements towards efficient edge machine learning (algorithm, hardware co-design).</p>
</caption>
<graphic xlink:href="felec-06-1469802-g011.tif">
<alt-text content-type="machine-generated">Chart presenting open problems in three areas: quantization challenges, lack of efficient QNN training algorithms, and accelerator design issues. Corresponding solutions are extreme low bit-width quantization, hardware-friendly QNN training algorithm, and efficient inference and training accelerator. It highlights strategies like mixed-precision quantization and dynamic reconfigurability for efficient edge-machine learning. Icons accompany each solution.</alt-text>
</graphic>
</fig>
<sec id="s6-1">
<title>6.1 Extreme low bit-width quantization</title>
<p>
<xref ref-type="fig" rid="F12">Figure 12</xref> shows the influence of quantization precision on power consumption and latency with different hardware platforms. For MCU-based platforms, the latency and power consumption scale down with the quantization precision due to the fixed length of the arithmetic units. For ASIC-based accelerators, by contrast, the latency and power consumption scale down drastically with the quantization precision. With a simple dataset (CIFAR-10), the model accuracy experiences less degradation than with a complex dataset (ImageNet) as the quantization precision decreases. <xref ref-type="fig" rid="F12">Figure 12</xref> shows that a close-to-FP accuracy can be obtained when the quantization bit-width is larger than 4 bits. Consequently, the majority of the QNN hardware designs shown in <xref ref-type="table" rid="T3">Table 3</xref> adopt a quantization precision higher than 4 bits. Hence, there is a demand to improve model accuracy under sub-4-bit quantization scenarios. With low-bit quantization, the hardware platforms can be faster and more energy-efficient, especially in the case of ASIC-based platforms.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Qualitative study on relative power consumption per operation and latency under different precision quantization with MCU-oriented acceleration kernel (<xref ref-type="bibr" rid="B26">Garofalo et al., 2021</xref>) and ASIC-based NN accelerator (<xref ref-type="bibr" rid="B99">Ryu et al., 2022</xref>). Accuracy relative to full precision models is shown with the corresponding bit-width tested with CIFAR-10 (<xref ref-type="bibr" rid="B137">Yang et al., 2020</xref>) and ImageNet dataset (<xref ref-type="bibr" rid="B141">Zhang et al., 2018</xref>).</p>
</caption>
<graphic xlink:href="felec-06-1469802-g012.tif">
<alt-text content-type="machine-generated">Bar chart comparing relative accuracy of ImageNet and CIFAR-10 datasets at 2-bit, 4-bit, and 8-bit precision. Colors indicate relative power per operation and latency for MCU and ASIC. Higher precision correlates with increased accuracy and power consumption.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s6-2">
<title>6.2 Study of the quantization of biases and batch normalization parameters during training</title>
<p>Unlike in training, biases and batch normalization parameters can be fused into the following layer during inference. There are no studies offering a systematic discussion or implementation of the quantization of biases or batch normalization parameters during training. Meanwhile, a comparison of the accuracy resulting from cases with and without biases or affine operations (<xref ref-type="fig" rid="F7">Figure 7</xref>) shows that these operations play an important role in guaranteeing high accuracy. Hence, there is a strong demand for a systematic study of quantization algorithms for biases and batch normalization parameters during training. Only with quantized biases and batch normalization parameters can expensive floating-point operations be completely removed from the data path, which is a key point for efficient hardware accelerator design that supports training.</p>
</sec>
<sec id="s6-3">
<title>6.3 QNN training methods relying only on hardware-friendly operations (integer arithmetic operation, shift, bit-wise operation)</title>
<p>All QNN training methods depend on floating-point or long bit-width fixed-point parameters to accumulate the gradient, preventing them from being deployed on low-end edge or IoT devices. At the same time, out of privacy concerns, machine learning methods, e.g., federated learning, require local training on low-end devices. Since there are some existing works (<xref ref-type="bibr" rid="B119">Sun et al., 2018b</xref>; <xref ref-type="bibr" rid="B145">Zhou et al., 2016</xref>; <xref ref-type="bibr" rid="B118">Sun et al., 2020</xref>) supporting the quantization of gradients during backpropagation, the critical part of developing QNN training methods relying only on hardware-friendly operations is finding a substitute for the floating-point format in accumulating gradients. Therefore, it is worth exploring the fusion of new gradient accumulating methods and existing gradient quantization methods.</p>
</sec>
<sec id="s6-4">
<title>6.4 Hardware accelerator design supporting both efficient inference and training</title>
<p>The existing hardware accelerator designs focus more on inference than on training, assuming that costly training can be performed on powerful servers or clusters. However, as IoT technology, edge computing, and corresponding privacy concerns arise, it is required to switch neural network training from a centralized manner to a more distributed one. This trend puts a requirement on the hardware design to support not only inference but also training. As analyzed in the previous sections, different data in the neural network, like weights, activations, and gradients, possess different ranges and distributions. This results in different types of quantization methods being applied to different data. To support both inference and training, the hardware architecture should be based on a heterogeneous design, be compatible with various quantization methods, and support arithmetic operations and corresponding data formats while maintaining high efficiency.</p>
</sec>
<sec id="s6-5">
<title>6.5 Dynamically reconfigurable accelerator designs</title>
<p>In addition to inference and training support, hardware reconfigurability is also critical. The edge device (e.g., mobile phone) may be required to run different applications and tasks. Neural networks running on the hardware accelerator could have different network structures, quantization methods, precision, and execution time requirements. All these factors bring challenges to the hardware design to be dynamically reconfigurable, especially in ASIC and non-volatile memory-based hardware architectures.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s7">
<title>7 Conclusion</title>
<p>In light of the swift advancement of edge computing, this paper undertakes a comprehensive, integrative survey on CNN architectures, quantization algorithms, and QCNN accelerators with a focus on energy-efficient on-edge applications. Various existing QNN accelerator designs based on ASIC, FPGA, and non-volatile memory together with commonly adopted CNN models and quantization algorithms are introduced and analyzed. On top of that, we highlight general guidelines regarding QNN software-hardware co-designs and give future research directions considering both algorithm and hardware perspectives. Concurrently, notable advancements in CNN architectures and quantization algorithms, which have yet to find common application in QCNN accelerators and thus fall outside the ambit of this review, have been made. It is anticipated that these developments will significantly influence the future evolution of QCNN accelerator designs.</p>
</sec>
</body>
<back>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>LZ: Conceptualization, Data curation, Formal Analysis, Investigation, Methodology, Software, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review and editing. OK: Data curation, Formal Analysis, Investigation, Methodology, Software, Validation, Writing &#x2013; original draft, Writing &#x2013; review and editing. MF: Writing &#x2013; review and editing. AE: Funding acquisition, Project administration, Resources, Supervision, Writing &#x2013; review and editing. KS: Funding acquisition, Project administration, Resources, Supervision, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This work is supported by the King Abdullah University of Science and Technology (KAUST) through the Competitive Research Grant program under grant URF/1/4704-01-01.</p>
</sec>
<ack>
<p>The ChatGPT 4o and o4-mini-high models by OpenAI were used for grammar checking and text elaboration.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>Author MF was employed by San Francisco Inc. CA.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The author(s) declared that they were an editorial board member of Frontiers, at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ankit</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hajj</surname>
<given-names>I. E.</given-names>
</name>
<name>
<surname>Chalamalasetti</surname>
<given-names>S. R.</given-names>
</name>
<name>
<surname>Ndu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Foltin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>R. S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>PUMA: a programmable ultra-efficient memristor-based accelerator for machine learning inference</article-title>,&#x201d; in <source>Proceedings of the twenty-fourth international conference on architectural support for programming languages and operating systems</source>, <fpage>715</fpage>&#x2013;<lpage>731</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.-X.</given-names>
</name>
<name>
<surname>Liberty</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>ProxQuant: quantized neural networks via proximal operators</article-title>. <source>arXiv Prepr. arXiv:1810.00861</source>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Banner</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hubara</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hoffer</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Soudry</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Scalable methods for 8-bit training of neural networks</article-title>. <source>Adv. neural Inf. Process. Syst.</source> <volume>31</volume>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Estimating or propagating gradients through stochastic neurons</article-title>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Benmeziane</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Maghraoui</surname>
<given-names>K. E.</given-names>
</name>
<name>
<surname>Ouarnoughi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Niar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wistuba</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A comprehensive survey on hardware-aware neural architecture search</article-title>. <comment>
<italic>arXiv preprint arXiv:2101.09336</italic>
</comment>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bernstein</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Meister</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>M.-Y.</given-names>
</name>
<name>
<surname>Anandkumar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yue</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Learning compositional functions via multiplicative weight updates</article-title>. <source>Adv. neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>13319</fpage>&#x2013;<lpage>13330</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Biswas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chandrakasan</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>CONV-SRAM: an energy-efficient SRAM with in-memory dot-product computation for low-power convolutional neural networks</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>54</volume>, <fpage>217</fpage>&#x2013;<lpage>230</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2018.2880918</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chang</surname>
<given-names>K.-W.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>T.-S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>VWA: hardware efficient vectorwise accelerator for convolutional neural network</article-title>. <source>IEEE Trans. Circuits Syst. I Regul. Pap.</source> <volume>67</volume>, <fpage>145</fpage>&#x2013;<lpage>154</lpage>. <pub-id pub-id-type="doi">10.1109/tcsi.2019.2942529</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chang</surname>
<given-names>S.-E.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>So</surname>
<given-names>H. K.-H.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>Mix and match: a novel FPGA-centric deep neural network quantization framework</article-title>,&#x201d; in <source>
<italic>2021 IEEE international Symposium on high-performance computer architecture (HPCA)</italic> (IEEE)</source>, <fpage>208</fpage>&#x2013;<lpage>220</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>A 1T2R1C ReRAM CIM accelerator with energy-efficient voltage division and capacitive coupling for CNN acceleration in AI edge applications</article-title>. <source>IEEE Trans. Circuits Syst. II Express Briefs</source> <volume>70</volume>, <fpage>276</fpage>&#x2013;<lpage>280</lpage>. <pub-id pub-id-type="doi">10.1109/tcsii.2022.3201367</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A learning framework for n-bit quantized neural networks toward fpgas</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>32</volume>, <fpage>1067</fpage>&#x2013;<lpage>1081</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2020.2980041</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Dual path networks</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>30</volume>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.-H.</given-names>
</name>
<name>
<surname>Krishna</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Emer</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Sze</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Eyeriss: an energy-efficient reconfigurable accelerator for deep convolutional neural networks</article-title>. <source>IEEE J. solid-state circuits</source> <volume>52</volume>, <fpage>127</fpage>&#x2013;<lpage>138</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2016.2616357</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y.-H.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>T.-J.</given-names>
</name>
<name>
<surname>Emer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sze</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Eyeriss V2: a flexible accelerator for emerging deep neural networks on mobile devices</article-title>. <source>IEEE J. Emerg. Sel. Top. Circuits Syst.</source> <volume>9</volume>, <fpage>292</fpage>&#x2013;<lpage>308</lpage>. <pub-id pub-id-type="doi">10.1109/jetcas.2019.2910232</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Choi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Venkataramani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chuang</surname>
<given-names>P. I.-J.</given-names>
</name>
<name>
<surname>Srinivasan</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gopalakrishnan</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>PACT: parameterized clipping activation for quantized neural networks</article-title>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Mixed-precision quantized neural networks with progressively decreasing bitwidth</article-title>. <source>Pattern Recognit.</source> <volume>111</volume>, <fpage>107647</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2020.107647</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Courbariaux</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>David</surname>
<given-names>J.-P.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Binaryconnect: training deep neural networks with binary weights during propagations</article-title>. <source>Adv. neural Inf. Process. Syst.</source> <volume>28</volume>.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Denton</surname>
<given-names>E. L.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bruna</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>LeCun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fergus</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Exploiting linear structure within convolutional networks for efficient evaluation</article-title>. <source>Adv. neural Inf. Process. Syst.</source> <volume>27</volume>.</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Blanton</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Marculescu</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Quantized deep neural networks for energy efficient hardware-based inference</article-title>,&#x201d; in <source>2018 23rd asia and south pacific design automation conference</source> (<publisher-name>ASP-DAC IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chin</surname>
<given-names>T.-W.</given-names>
</name>
<name>
<surname>Marculescu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Blanton</surname>
<given-names>R. D.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>FLightNNs: lightweight quantized deep neural networks for fast and accurate inference</article-title>,&#x201d; in <source>Proceedings of the 56th annual design automation conference 2019</source>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Marculescu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Blanton</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>LightNN: filling the gap between conventional deep neural networks and binarized networks</article-title>,&#x201d; in <source>Proceedings of the on great lakes symposium on VLSI 2017</source>, <fpage>35</fpage>&#x2013;<lpage>40</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Elthakeb</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pilligundla</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mireshghallah</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Yazdanbakhsh</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Esmaeilzadeh</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>ReLeQ: an automatic reinforcement learning approach for deep quantization of neural networks</article-title>,&#x201d; in <source>NeurIPS ML for systems workshop</source>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fiesler</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Choudry</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Caulfield</surname>
<given-names>H. J.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>Weight discretization paradigm for optical neural networks</article-title>. <source>Opt. Interconnections Netw.</source> (<publisher-name>SPIE</publisher-name>) <volume>1281</volume>, <fpage>164</fpage>&#x2013;<lpage>173</lpage>. <pub-id pub-id-type="doi">10.1117/12.20700</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fukushima</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>1980</year>). <article-title>Neocognitron: a self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position</article-title>. <source>Biol. Cybern.</source> <volume>36</volume>, <fpage>193</fpage>&#x2013;<lpage>202</lpage>. <pub-id pub-id-type="doi">10.1007/BF00344251</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>S.-H.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>M.-M.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.-Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M.-H.</given-names>
</name>
<name>
<surname>Torr</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Res2Net: a new multi-scale backbone architecture</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>43</volume>, <fpage>652</fpage>&#x2013;<lpage>662</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2019.2938758</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Garofalo</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tagliavini</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Conti</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Benini</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Rossi</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>XpulpNN: enabling energy efficient and flexible inference of quantized neural networks on RISC-V based IoT end nodes</article-title>. <source>IEEE Trans. Emerg. Top. Comput.</source> <volume>9</volume>, <fpage>1489</fpage>&#x2013;<lpage>1505</lpage>. <pub-id pub-id-type="doi">10.1109/tetc.2021.3072337</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gholami</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Mahoney</surname>
<given-names>M. W.</given-names>
</name>
<name>
<surname>Keutzer</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A survey of quantization methods for efficient neural network inference</article-title>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gi</surname>
<given-names>S.-G.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>B.-G.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A ReRAM-based convolutional neural network accelerator using the analog layer normalization technique</article-title>. <source>IEEE Trans. Industrial Electron.</source> <volume>70</volume>, <fpage>6442</fpage>&#x2013;<lpage>6451</lpage>. <pub-id pub-id-type="doi">10.1109/tie.2022.3190876</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Differentiable soft quantization: bridging full-precision and low-bit neural networks</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision</source>, <fpage>4852</fpage>&#x2013;<lpage>4861</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bourdev</surname>
<given-names>L. D.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Compressing deep convolutional networks using vector quantization</article-title>. <source>CoRR</source> abs/1412.6115.</citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>CMT: convolutional neural networks meet vision transformers</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>12175</fpage>&#x2013;<lpage>12185</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>FBNA: a fully binarized neural network accelerator</article-title>,&#x201d; in <source>2018 28th international conference on field programmable logic and applications (FPL)</source> <publisher-name>IEEE</publisher-name>, <fpage>51</fpage>&#x2013;<lpage>513</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Fouda</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Yantir</surname>
<given-names>H. E.</given-names>
</name>
<name>
<surname>Eltawil</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Salama</surname>
<given-names>K. N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Unsupervised adaptive weight pruning for energy-efficient neuromorphic systems</article-title>. <source>Front. Neurosci.</source> <volume>14</volume>, <fpage>598876</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2020.598876</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A survey on methods and theories of quantized neural networks</article-title>.</citation>
</ref>
<ref id="B35">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gopalakrishnan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Narayanan</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Deep learning with limited numerical precision</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>) (Association for Computing Machinery), <fpage>1737</fpage>&#x2013;<lpage>1746</lpage>.</citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Habi</surname>
<given-names>H. V.</given-names>
</name>
<name>
<surname>Jennings</surname>
<given-names>R. H.</given-names>
</name>
<name>
<surname>Netzer</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>HMQ: hardware friendly mixed precision quantization block for CNNs</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>448</fpage>&#x2013;<lpage>463</lpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Dally</surname>
<given-names>W. J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Deep compression: compressing deep neural networks with pruning, trained quantization and Huffman coding</article-title>.</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hanif</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Shafique</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A cross-layer approach towards developing efficient embedded deep learning systems</article-title>. <source>Microprocess. Microsystems</source> <volume>88</volume>, <fpage>103609</fpage>. <pub-id pub-id-type="doi">10.1016/j.micpro.2020.103609</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hashemi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Anthony</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Tann</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Bahar</surname>
<given-names>R. I.</given-names>
</name>
<name>
<surname>Reda</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Understanding the impact of precision quantization on the accuracy and energy of neural networks</article-title>,&#x201d; in <source>Design, automation and test in europe conference and exhibition (DATE)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1474</fpage>&#x2013;<lpage>1479</lpage>.</citation>
</ref>
<ref id="B40">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heo</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>J. Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Knowledge transfer via distillation of activation boundaries formed by hidden neurons</article-title>. <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>33</volume>, <fpage>3779</fpage>&#x2013;<lpage>3787</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v33i01.33013779</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Distilling the knowledge in a neural network</article-title>.</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Howard</surname>
<given-names>A. G.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kalenichenko</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Weyand</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>MobileNets: efficient convolutional neural networks for mobile vision applications</article-title>.</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>TiNNA: a tiny accelerator for neural networks with efficient DSP optimization</article-title>. <source>IEEE Trans. Circuits Syst. II Express Briefs</source> <volume>69</volume>, <fpage>2301</fpage>&#x2013;<lpage>2305</lpage>. <pub-id pub-id-type="doi">10.1109/tcsii.2022.3150980</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hubara</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Courbariaux</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Soudry</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>El-Yaniv</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Quantized neural networks: training neural networks with low precision weights and activations</article-title>. <source>J. Mach. Learn. Res.</source> <volume>18</volume>, <fpage>6869</fpage>&#x2013;<lpage>6898</lpage>.</citation>
</ref>
<ref id="B46">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Batch normalization: accelerating deep network training by reducing internal covariate shift</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>) (Association for Computing Machinery), <fpage>448</fpage>&#x2013;<lpage>456</lpage>.</citation>
</ref>
<ref id="B47">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jacob</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kligys</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Quantization and training of neural networks for efficient integer-arithmetic-only inference</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>2704</fpage>&#x2013;<lpage>2713</lpage>.</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jaderberg</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vedaldi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Speeding up convolutional neural networks with low rank expansions</article-title>. <source>Proc. Br. Mach. Vis. Conf.</source>, <fpage>88.1</fpage>&#x2013;<lpage>88.13</lpage>. <pub-id pub-id-type="doi">10.5244/c.28.88</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Janowsky</surname>
<given-names>S. A.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>Pruning versus clipping in neural networks</article-title>. <source>Phys. Rev. A</source> <volume>39</volume>, <fpage>6600</fpage>&#x2013;<lpage>6603</lpage>. <pub-id pub-id-type="doi">10.1103/PhysRevA.39.6600</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Post training quantization after neural network</article-title>,&#x201d; in <source>2022 14th international conference on computer research and development (ICCRD)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B51">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jiao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>Y.-J.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>7.2 a 12nm programmable convolution-efficient neural-processing-unit chip achieving 825TOPS</article-title>,&#x201d; in <source>2020 IEEE international solid-state circuits conference-(ISSCC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>136</fpage>&#x2013;<lpage>140</lpage>.</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>L.-S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>ADC-Free ReRAM-based <italic>in-situ</italic> accelerator for energy-efficient binary neural networks</article-title>. <source>IEEE Trans. Comput.</source> <volume>73</volume>, <fpage>353</fpage>&#x2013;<lpage>365</lpage>. <pub-id pub-id-type="doi">10.1109/tc.2022.3224800</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Exploiting retraining-based mixed-precision quantization for low-cost DNN accelerator design</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>32</volume>, <fpage>2925</fpage>&#x2013;<lpage>2938</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2020.3008996</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fouda</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Benmeziane</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>El Maghraoui</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Sebastian</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>W. D.</given-names>
</name>
<etal/>
</person-group> (<year>2024a</year>). <article-title>Neural architecture search for in-memory computing-based deep learning accelerators</article-title>. <source>Nat. Rev. Electr. Eng.</source> <volume>1</volume>, <fpage>374</fpage>&#x2013;<lpage>390</lpage>. <pub-id pub-id-type="doi">10.1038/s44287-024-00052-7</pub-id>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fouda</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Eltawil</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Salama</surname>
<given-names>K. N.</given-names>
</name>
</person-group> (<year>2024b</year>). <article-title>Towards efficient IMC accelerator design through joint hardware-workload co-optimization</article-title>.</citation>
</ref>
<ref id="B56">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Irmanova</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>James</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Memristive non-idealities: is there any practical implications for designing neural network chips?</article-title>,&#x201d; in <source>2019 IEEE international symposium on circuits and systems (ISCAS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>5</lpage>.</citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>James</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Analogue neuro-memristive convolutional dropout nets</article-title>. <source>Proc. R. Soc. A</source> <volume>476</volume>, <fpage>20200210</fpage>. <pub-id pub-id-type="doi">10.1098/rspa.2020.0210</pub-id>
</citation>
</ref>
<ref id="B58">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Salama</surname>
<given-names>K. N.</given-names>
</name>
<name>
<surname>James</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Analog backpropagation learning circuits for memristive crossbar neural networks</article-title>,&#x201d; in <source>2018 IEEE international symposium on circuits and systems (ISCAS)</source>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.1109/ISCAS.2018.8351344</pub-id>
</citation>
</ref>
<ref id="B59">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Salama</surname>
<given-names>K. N.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Towards efficient RRAM-based quantized neural networks hardware: state-of-the-art and open issues</article-title>,&#x201d; in <source>2022 IEEE 22nd international conference on nanotechnology (NANO)</source> (<publisher-name>IEEE</publisher-name>), <fpage>465</fpage>&#x2013;<lpage>468</lpage>.</citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krestinskaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Salama</surname>
<given-names>K. N.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Towards efficient in-memory computing hardware for quantized neural networks: state-of-the-art, open challenges and perspectives</article-title>. <source>IEEE Trans. Nanotechnol.</source> <volume>22</volume>, <fpage>377</fpage>&#x2013;<lpage>386</lpage>. <pub-id pub-id-type="doi">10.1109/TNANO.2023.3293026</pub-id>
</citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krishnamoorthi</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Quantizing deep convolutional networks for efficient inference: a whitepaper</article-title>.</citation>
</ref>
<ref id="B62">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krizhevsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G. E.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>ImageNet classification with deep convolutional neural networks</article-title>. <source>Commun. ACM</source> <volume>60</volume>, <fpage>84</fpage>&#x2013;<lpage>90</lpage>. <pub-id pub-id-type="doi">10.1145/3065386</pub-id>
</citation>
</ref>
<ref id="B63">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kulkarni</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Hosamani</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Masur</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Hegde</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vernekar</surname>
<given-names>G. R.</given-names>
</name>
<name>
<surname>Chandana</surname>
<given-names>K. S.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>A survey on quantization methods for optimization of deep neural networks</article-title>,&#x201d; in <source>2022 international conference on automation, computing and renewable systems (ICACRS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>827</fpage>&#x2013;<lpage>834</lpage>.</citation>
</ref>
<ref id="B64">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>LeCun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Boser</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Denker</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Henderson</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Hubbard</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>1989</year>). <article-title>Backpropagation applied to handwritten zip code recognition</article-title>. <source>Neural Comput.</source> <volume>1</volume>, <fpage>541</fpage>&#x2013;<lpage>551</lpage>. <pub-id pub-id-type="doi">10.1162/neco.1989.1.4.541</pub-id>
</citation>
</ref>
<ref id="B65">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>LeCun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bottou</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Haffner</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Gradient-based learning applied to document recognition</article-title>. <source>Proc. IEEE</source> <volume>86</volume>, <fpage>2278</fpage>&#x2013;<lpage>2324</lpage>. <pub-id pub-id-type="doi">10.1109/5.726791</pub-id>
</citation>
</ref>
<ref id="B66">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Gallagher</surname>
<given-names>P. W.</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Generalizing pooling functions in convolutional neural networks: mixed, gated, and tree</article-title>,&#x201d; in <source>Artificial intelligence and statistics</source> (<publisher-name>PMLR</publisher-name>), <fpage>464</fpage>&#x2013;<lpage>472</lpage>.</citation>
</ref>
<ref id="B67">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>H.-J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>UNPU: an energy-efficient deep neural network accelerator with fully variable weight bit precision</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>54</volume>, <fpage>173</fpage>&#x2013;<lpage>185</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2018.2865489</pub-id>
</citation>
</ref>
<ref id="B68">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>H.-J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>A 21mW low-power recurrent neural network accelerator with quantization tables for embedded deep learning applications</article-title>,&#x201d; in <source>2017 IEEE Asian solid-state circuits conference (A-SSCC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>237</fpage>&#x2013;<lpage>240</lpage>.</citation>
</ref>
<ref id="B69">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Silberman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ziegler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Venkataramani</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021a</year>). <article-title>A 7-nm four-core mixed-precision AI chip with 26.2-TFLOPS hybrid-FP8 training, 104.9-TOPS INT4 inference, and workload-aware throttling</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>57</volume>, <fpage>182</fpage>&#x2013;<lpage>197</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2021.3120113</pub-id>
</citation>
</ref>
<ref id="B70">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>Y. S.</given-names>
</name>
<name>
<surname>Chung</surname>
<given-names>E.-Y.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>Y.-H.</given-names>
</name>
<name>
<surname>Chung</surname>
<given-names>S. W.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Quant-PIM: an energy-efficient processing-in-memory accelerator for layerwise quantized neural networks</article-title>. <source>IEEE Embed. Syst. Lett.</source> <volume>13</volume>, <fpage>162</fpage>&#x2013;<lpage>165</lpage>. <pub-id pub-id-type="doi">10.1109/les.2021.3050253</pub-id>
</citation>
</ref>
<ref id="B71">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leroux</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vankeirsbilck</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Verbelen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Simoens</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dhoedt</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Training binary neural networks with knowledge transfer</article-title>. <source>Neurocomputing</source> <volume>396</volume>, <fpage>534</fpage>&#x2013;<lpage>541</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2018.09.103</pub-id>
</citation>
</ref>
<ref id="B72">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>MQBench: towards reproducible and deployable model quantization benchmark</article-title>.</citation>
</ref>
<ref id="B73">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Efficient bitwidth search for practical mixed precision neural network</article-title>.</citation>
</ref>
<ref id="B74">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>C.-H.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>C.-C.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>Y.-M.</given-names>
</name>
<name>
<surname>Hung</surname>
<given-names>S.-J.</given-names>
</name>
<name>
<surname>Kuo</surname>
<given-names>Y.-T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>P. H.</given-names>
</name>
<etal/>
</person-group> (<year>2020a</year>). &#x201c;<article-title>7.1 A 3.4-to-13.3 TOPS/W 3.6 TOPS dual-core deep-learning accelerator for versatile AI applications in 7nm 5G smartphone SoC</article-title>,&#x201d; in <source>2020 IEEE international solid-state circuits conference (ISSCC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>134</fpage>&#x2013;<lpage>136</lpage>.</citation>
</ref>
<ref id="B75">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.-M.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>MCUNetV2: memory-efficient patch-based inference for tiny deep learning</article-title>. <comment>
<italic>arXiv preprint arXiv:2110.15352</italic>
</comment>
</citation>
</ref>
<ref id="B76">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.-M.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020b</year>). <article-title>MCUNet: tiny deep learning on IoT devices</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>11711</fpage>&#x2013;<lpage>11722</lpage>.</citation>
</ref>
<ref id="B77">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>C.-N.</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>Y.-A.</given-names>
</name>
<name>
<surname>Kuo</surname>
<given-names>C.-H.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>S.-A.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Design of 2D systolic array accelerator for quantized convolutional neural networks</article-title>,&#x201d; in <source>2021 international symposium on VLSI design, automation and test (VLSI-DAT)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>4</lpage>.</citation>
</ref>
<ref id="B78">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>K.-T.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Bi-Real Net: enhancing the performance of 1-bit CNNs with improved representational capability and advanced training algorithm</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV)</source> (<publisher-name>Springer</publisher-name>), <fpage>722</fpage>&#x2013;<lpage>737</lpage>.</citation>
</ref>
<ref id="B79">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>OMPQ: orthogonal mixed precision quantization</article-title>.</citation>
</ref>
<ref id="B80">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Martinez</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bulat</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tzimiropoulos</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Training binary neural networks with real-to-binary convolutions</article-title>
</citation>
</ref>
<ref id="B81">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Matinizadeh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mohammadhassani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pacik-Nelson</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Polykretis</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Mishra</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shackleford</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). &#x201c;<article-title>A fully-configurable digital spiking neuromorphic hardware design with variable quantization and mixed precision</article-title>,&#x201d; in <source>2024 IEEE 67th international midwest symposium on circuits and systems (MWSCAS)</source> (<publisher-name>IEEE</publisher-name>), <fpage>937</fpage>&#x2013;<lpage>941</lpage>.</citation>
</ref>
<ref id="B82">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Menghani</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Efficient deep learning: a survey on making deep learning models smaller, faster, and better</article-title>. <source>ACM Comput. Surv.</source> <volume>55</volume>, <fpage>1</fpage>&#x2013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1145/3578938</pub-id>
</citation>
</ref>
<ref id="B83">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moon</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>K.-J.</given-names>
</name>
<name>
<surname>Mun</surname>
<given-names>H.-G.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Sim</surname>
<given-names>J.-Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>An 8.9&#x2013;71.3 TOPS/W deep learning accelerator for arbitrarily quantized neural networks</article-title>. <source>IEEE Trans. Circuits Syst. II Express Briefs</source> <volume>69</volume>, <fpage>4148</fpage>&#x2013;<lpage>4152</lpage>. <pub-id pub-id-type="doi">10.1109/tcsii.2022.3185184</pub-id>
</citation>
</ref>
<ref id="B84">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Moons</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Uytterhoeven</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dehaene</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Verhelst</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>14.5 Envision: a 0.26-to-10TOPS/W subword-parallel dynamic-voltage-accuracy-frequency-scalable convolutional neural network processor in 28nm FDSOI</article-title>,&#x201d; in <source>2017 IEEE international solid-state circuits conference (ISSCC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>246</fpage>&#x2013;<lpage>247</lpage>.</citation>
</ref>
<ref id="B85">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Mrazek</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Va&#x161;&#xed;&#x10d;ek</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sekanina</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hanif</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Shafique</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>ALWANN: automatic layer-wise approximation of deep neural network accelerators without retraining</article-title>,&#x201d; in <source>2019 IEEE/ACM international conference on computer-aided design (ICCAD)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B86">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Neill</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>An overview of neural network compression</article-title>
</citation>
</ref>
<ref id="B87">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Netzer</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Coates</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bissacco</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>A. Y.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Reading digits in natural images with unsupervised feature learning</article-title>
</citation>
</ref>
<ref id="B88">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>D. T.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.-J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Layer-specific optimization for mixed data flow with mixed precision in FPGA design for CNN-based object detectors</article-title>. <source>IEEE Trans. Circuits Syst. Video Technol.</source> <volume>31</volume>, <fpage>2450</fpage>&#x2013;<lpage>2464</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2020.3020569</pub-id>
</citation>
</ref>
<ref id="B89">
<citation citation-type="web">
<collab>Nvidia</collab> (<year>2024</year>). <article-title>TensorRT homepage</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://developer.nvidia.com/tensorrt">https://developer.nvidia.com/tensorrt</ext-link>.</comment>
</citation>
</ref>
<ref id="B90">
<citation citation-type="web">
<collab>ONNX</collab> (<year>2024</year>). <article-title>ONNX homepage</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://onnx.ai/index.html">https://onnx.ai/index.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B91">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Park</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ahn</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Weighted-entropy-based quantization for deep neural networks</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>5456</fpage>&#x2013;<lpage>5464</lpage>.</citation>
</ref>
<ref id="B92">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Pham</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q. V.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Meta pseudo labels</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>11557</fpage>&#x2013;<lpage>11568</lpage>.</citation>
</ref>
<ref id="B93">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sebe</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Binary neural networks: a survey</article-title>. <source>Pattern Recognit.</source> <volume>105</volume>, <fpage>107281</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2020.107281</pub-id>
</citation>
</ref>
<ref id="B94">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Qiu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). &#x201c;<article-title>Going deeper with embedded FPGA platform for convolutional neural network</article-title>,&#x201d; in <source>Proceedings of the 2016 ACM/SIGDA international symposium on field-programmable gate arrays</source>, <fpage>26</fpage>&#x2013;<lpage>35</lpage>.</citation>
</ref>
<ref id="B95">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rakka</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fouda</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Khargonekar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kurdahi</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Mixed-precision neural networks: a survey</article-title>
</citation>
</ref>
<ref id="B96">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rastegari</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ordonez</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Redmon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>XNOR-Net: ImageNet classification using binary convolutional neural networks</article-title>,&#x201d; in <source>European conference on computer vision</source> (<publisher-name>Springer</publisher-name>), <fpage>525</fpage>&#x2013;<lpage>542</lpage>.</citation>
</ref>
<ref id="B97">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rokh</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Azarpeyvand</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Khanteymoori</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A comprehensive survey on model quantization for deep neural networks</article-title>
</citation>
</ref>
<ref id="B98">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rotem</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Fix</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Abdulrasool</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Catron</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dzhabarov</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Glow: graph lowering compiler techniques for neural networks</article-title>. <comment>
<italic>arXiv preprint arXiv:1805.00907</italic>
</comment>
</citation>
</ref>
<ref id="B99">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ryu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>BitBlade: energy-efficient variable bit-precision hardware accelerator for quantized neural networks</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>57</volume>, <fpage>1924</fpage>&#x2013;<lpage>1935</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2022.3141050</pub-id>
</citation>
</ref>
<ref id="B100">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ryu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J.-J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>BitBlade: area and energy-efficient precision-scalable neural network accelerator with bitwise summation</article-title>,&#x201d; in <source>Proceedings of the 56th annual design automation conference 2019</source>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B101">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ryu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Koo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>A 44.1 TOPS/W precision-scalable accelerator for quantized neural networks in 28nm CMOS</article-title>,&#x201d; in <source>2020 IEEE custom integrated circuits conference (CICC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>4</lpage>.</citation>
</ref>
<ref id="B102">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sakr</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Venkatesan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zimmer</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dally</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Khailany</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Optimal clipping and magnitude-aware differentiation for improved quantization-aware training</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>), <publisher-loc>(Association for Computing Machinery)</publisher-loc>, <fpage>19123</fpage>&#x2013;<lpage>19138</lpage>.</citation>
</ref>
<ref id="B103">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.-C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>MobileNetV2: inverted residuals and linear bottlenecks</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>.</citation>
</ref>
<ref id="B104">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shafiee</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nag</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Muralimanohar</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Balasubramonian</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Strachan</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>ISAAC: a convolutional neural network accelerator with <italic>in-situ</italic> analog arithmetic in crossbars</article-title>. <source>ACM SIGARCH Comput. Archit. News</source> <volume>44</volume>, <fpage>14</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1145/3007787.3001139</pub-id>
</citation>
</ref>
<ref id="B105">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Suda</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chau</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chandra</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Bit Fusion: bit-level dynamically composable architecture for accelerating deep neural network</article-title>,&#x201d; in <source>2018 ACM/IEEE 45th annual international symposium on computer architecture (ISCA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>764</fpage>&#x2013;<lpage>775</lpage>.</citation>
</ref>
<ref id="B106">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024a</year>). &#x201c;<article-title>Exploring quantization techniques for large-scale language models: methods, challenges and future directions</article-title>,&#x201d; in <source>Proceedings of the 2024 9th international conference on cyber security and information engineering</source>, <fpage>783</fpage>&#x2013;<lpage>790</lpage>.</citation>
</ref>
<ref id="B107">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024b</year>). &#x201c;<article-title>Are conventional SNNs really efficient? A perspective from network quantization</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>27538</fpage>&#x2013;<lpage>27547</lpage>.</citation>
</ref>
<ref id="B108">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>H.-J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>14.2 DNPU: an 8.1 TOPS/W reconfigurable CNN-RNN processor for general-purpose deep neural networks</article-title>,&#x201d; in <source>2017 IEEE international solid-state circuits conference (ISSCC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>240</fpage>&#x2013;<lpage>241</lpage>.</citation>
</ref>
<ref id="B109">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Siddegowda</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fournarakis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nagel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Blankevoort</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Khobare</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Neural network quantization with AI model efficiency toolkit (AIMET)</article-title>
</citation>
</ref>
<ref id="B110">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sifre</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Mallat</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Rigid-motion scattering for texture classification</article-title>.</citation>
</ref>
<ref id="B111">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Simonyan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <comment>
<italic>arXiv preprint arXiv:1409.1556</italic>
</comment>
</citation>
</ref>
<ref id="B112">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smagulova</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Fouda</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Kurdahi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Salama</surname>
<given-names>K. N.</given-names>
</name>
<name>
<surname>Eltawil</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Resistive neural hardware accelerators</article-title>. <source>Proc. IEEE</source> <volume>111</volume>, <fpage>500</fpage>&#x2013;<lpage>527</lpage>. <pub-id pub-id-type="doi">10.1109/jproc.2023.3268092</pub-id>
</citation>
</ref>
<ref id="B113">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>PipeLayer: a pipelined ReRAM-based accelerator for deep learning</article-title>,&#x201d; in <source>2017 IEEE international symposium on high performance computer architecture (HPCA)</source> (<publisher-name>IEEE</publisher-name>), <fpage>541</fpage>&#x2013;<lpage>552</lpage>.</citation>
</ref>
<ref id="B114">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Spallanzani</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cavigelli</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Leonardi</surname>
<given-names>G. P.</given-names>
</name>
<name>
<surname>Bertogna</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Benini</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Additive noise annealing and approximation properties of quantized neural networks</article-title>.
</citation>
</ref>
<ref id="B115">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Srivastava</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Krizhevsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Salakhutdinov</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Dropout: a simple way to prevent neural networks from overfitting</article-title>. <source>J. Mach. Learn. Res.</source> <volume>15</volume>, <fpage>1929</fpage>&#x2013;<lpage>1958</lpage>.</citation>
</ref>
<ref id="B116">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>S.-E.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>FILM-QNN: efficient FPGA acceleration of deep neural networks with intra-layer, mixed-precision quantization</article-title>. <source>Proceedings of the 2022 ACM/SIGDA international symposium on field-programmable gate arrays</source>, <fpage>134</fpage>&#x2013;<lpage>145</lpage>. <pub-id pub-id-type="doi">10.1145/3490422.3502364</pub-id>
</citation>
</ref>
<ref id="B117">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P.-Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Seo</surname>
<given-names>J.-s.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018a</year>). &#x201c;<article-title>Fully parallel RRAM synaptic array for implementing binary neural network with (&#x2b;1, &#x2212;1) weights and (&#x2b;1, 0) neurons</article-title>,&#x201d; in <source>2018 23rd asia and south pacific design automation conference (ASP-DAC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>574</fpage>&#x2013;<lpage>579</lpage>.</citation>
</ref>
<ref id="B118">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Ni</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Ultra-low precision 4-bit training of deep neural networks</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>1796</fpage>&#x2013;<lpage>1807</lpage>.</citation>
</ref>
<ref id="B119">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Seo</surname>
<given-names>J.-s.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018b</year>). &#x201c;<article-title>XNOR-RRAM: a scalable and parallel resistive synaptic architecture for binary neural networks</article-title>,&#x201d; in <source>2018 design, automation and test in europe conference and exhibition (DATE)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1423</fpage>&#x2013;<lpage>1428</lpage>.</citation>
</ref>
<ref id="B120">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>EfficientNet: rethinking model scaling for convolutional neural networks</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>), <publisher-loc>(Association for Computing Machinery)</publisher-loc>, <fpage>6105</fpage>&#x2013;<lpage>6114</lpage>.</citation>
</ref>
<ref id="B121">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>How to train a compact binary neural network with high accuracy?</article-title>,&#x201d; in <source>Thirty-First AAAI conference on artificial intelligence</source>.</citation>
</ref>
<ref id="B122">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Teng</surname>
<given-names>C.-F.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.-H. D.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>A. K.-S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>A.-Y. A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Low-complexity recurrent neural network-based polar decoder with weight quantization mechanism</article-title>,&#x201d; in <source>ICASSP 2019-2019 IEEE international conference on acoustics, speech and signal processing (ICASSP)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1413</fpage>&#x2013;<lpage>1417</lpage>.</citation>
</ref>
<ref id="B123">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ueyoshi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Ando</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hirose</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Takamaeda-Yamazaki</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kadomoto</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Miyata</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>QUEST: a 7.49 TOPS multi-purpose log-quantized DNN inference engine stacked on 96MB 3D SRAM using inductive-coupling technology in 40nm CMOS</article-title>,&#x201d; in <source>2018 IEEE international solid-state circuits conference-(ISSCC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>216</fpage>&#x2013;<lpage>218</lpage>.</citation>
</ref>
<ref id="B124">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Umuroglu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Akhauri</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fraser</surname>
<given-names>N. J.</given-names>
</name>
<name>
<surname>Blott</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>LogicNets: Co-designed neural networks and circuits for extreme-throughput applications</article-title>,&#x201d; in <source>2020 30th international conference on field-programmable logic and applications (FPL)</source> (<publisher-name>IEEE</publisher-name>), <fpage>291</fpage>&#x2013;<lpage>297</lpage>.</citation>
</ref>
<ref id="B125">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Umuroglu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fraser</surname>
<given-names>N. J.</given-names>
</name>
<name>
<surname>Gambardella</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Blott</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Leong</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jahre</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>FINN: a framework for fast, scalable binarized neural network inference</article-title>,&#x201d; in <source>Proceedings of the 2017 ACM/SIGDA international symposium on field-programmable gate arrays</source>, <fpage>65</fpage>&#x2013;<lpage>74</lpage>.</citation>
</ref>
<ref id="B126">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Venkataramani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ranjan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Raghunathan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>AxNN: energy-efficient neuromorphic systems using approximate computing</article-title>,&#x201d; in <source>Proceedings of the 2014 international symposium on Low power electronics and design</source>, <fpage>27</fpage>&#x2013;<lpage>32</lpage>.</citation>
</ref>
<ref id="B127">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Eckert</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Subramaniyan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Das</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Blaauw</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2019a</year>). <article-title>A 28-nm compute SRAM with bit-serial logic/arithmetic operations for programmable in-memory vector computing</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>55</volume>, <fpage>76</fpage>&#x2013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2019.2939682</pub-id>
</citation>
</ref>
<ref id="B128">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Hardware-centric AutoML for mixed-precision quantization</article-title>. <source>Int. J. Comput. Vis.</source> <volume>128</volume>, <fpage>2035</fpage>&#x2013;<lpage>2048</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-020-01339-6</pub-id>
</citation>
</ref>
<ref id="B129">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Brand</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Gopalakrishnan</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Training deep neural networks with 8-bit floating point numbers</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>31</volume>.</citation>
</ref>
<ref id="B130">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>F.-H.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>W. D.</given-names>
</name>
</person-group> (<year>2019b</year>). &#x201c;<article-title>A deep neural network accelerator based on tiled RRAM architecture</article-title>,&#x201d; in <source>2019 IEEE international electron devices meeting (IEDM)</source> (<publisher-name>IEEE</publisher-name>), <fpage>14.4.1</fpage>&#x2013;<lpage>14.4.4</lpage>.</citation>
</ref>
<ref id="B131">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Warden</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Situnayake</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <source>TinyML</source>. <publisher-name>O&#x2019;Reilly Media, Inc</publisher-name>.</citation>
</ref>
<ref id="B132">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhuang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>FPGA-based hybrid-type implementation of quantized neural networks for remote sensing applications</article-title>. <source>Sensors</source> <volume>19</volume>, <fpage>924</fpage>. <pub-id pub-id-type="doi">10.3390/s19040924</pub-id>
</citation>
</ref>
<ref id="B133">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhuang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>MP-OPU: a mixed precision FPGA-based overlay processor for convolutional neural networks</article-title>,&#x201d; in <source>2021 31st international conference on field-programmable logic and applications (FPL)</source> (<publisher-name>IEEE</publisher-name>), <fpage>33</fpage>&#x2013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B134">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>T. P.</given-names>
</name>
<name>
<surname>Feinberg</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Bennett</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Prabhakar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Saxena</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>On the accuracy of analog neural network inference accelerators</article-title>. <source>IEEE Circuits Syst. Mag.</source> <volume>22</volume>, <fpage>26</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1109/mcas.2022.3214409</pub-id>
</citation>
</ref>
<ref id="B135">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hoi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>RegNet: self-regulated network for image classification</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>34</volume>, <fpage>9562</fpage>&#x2013;<lpage>9567</lpage>. <pub-id pub-id-type="doi">10.1109/tnnls.2022.3158966</pub-id>
</citation>
</ref>
<ref id="B136">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Quantization networks</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>7308</fpage>&#x2013;<lpage>7316</lpage>.</citation>
</ref>
<ref id="B137">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Searching for low-bit weights in quantized neural networks</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>4091</fpage>&#x2013;<lpage>4102</lpage>.</citation>
</ref>
<ref id="B138">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Fully hardware-implemented memristor convolutional neural network</article-title>. <source>Nature</source> <volume>577</volume>, <fpage>641</fpage>&#x2013;<lpage>646</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-1942-4</pub-id>
</citation>
</ref>
<ref id="B139">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Gholami</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>HAWQ-V3: dyadic neural network quantization</article-title>,&#x201d; in <source>International conference on machine learning</source> (<publisher-name>PMLR</publisher-name>), <publisher-loc>(Association for Computing Machinery)</publisher-loc>, <fpage>11875</fpage>&#x2013;<lpage>11886</lpage>.</citation>
</ref>
<ref id="B140">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yin</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Seo</surname>
<given-names>J.-S.</given-names>
</name>
<name>
<surname>Seok</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>XNOR-SRAM: in-memory computing SRAM macro for binary/ternary deep neural networks</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>55</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1109/jssc.2019.2963616</pub-id>
</citation>
</ref>
<ref id="B141">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>LQ-Nets: learned quantization for highly accurate and compact deep neural networks</article-title>,&#x201d; in <source>Proceedings of the European conference on computer vision (ECCV)</source>, <fpage>365</fpage>&#x2013;<lpage>382</lpage>.</citation>
</ref>
<ref id="B142">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>FracBNN: accurate and FPGA-efficient binary neural networks with fractional activations</article-title>,&#x201d; in <source>The 2021 ACM/SIGDA international symposium on field-programmable gate arrays</source>, <fpage>171</fpage>&#x2013;<lpage>182</lpage>.</citation>
</ref>
<ref id="B143">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Venkatesan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zimmer</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>M.-Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>LNS-Madam: low-precision training in logarithmic number system using multiplicative weight update</article-title>. <source>IEEE Trans. Comput.</source> <volume>71</volume>, <fpage>3179</fpage>&#x2013;<lpage>3190</lpage>. <pub-id pub-id-type="doi">10.1109/tc.2022.3202747</pub-id>
</citation>
</ref>
<ref id="B144">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhijie</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shiming</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shasha</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shuquan</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Bactran: a hardware batch normalization implementation for CNN training engine</article-title>. <source>IEEE Embed. Syst. Lett.</source> <volume>13</volume>, <fpage>29</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1109/les.2020.2975055</pub-id>
</citation>
</ref>
<ref id="B145">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ni</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>DoReFa-Net: training low bitwidth convolutional neural networks with low bitwidth gradients</article-title>.
</citation>
</ref>
<ref id="B146">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>A configurable multi-precision CNN computing framework based on single bit RRAM</article-title>,&#x201d; in <source>2019 56th ACM/IEEE design automation conference (DAC)</source> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B147">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhuang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Reid</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Towards effective low-bitwidth convolutional neural networks</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>, <fpage>7920</fpage>&#x2013;<lpage>7928</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>