<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Adv. Opt. Technol.</journal-id>
<journal-title>Advanced Optical Technologies</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Adv. Opt. Technol.</abbrev-journal-title>
<issn pub-type="epub">2192-8584</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1501208</article-id>
<article-id pub-id-type="doi">10.3389/aot.2024.1501208</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Advanced Optical Technologies</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Pruning and optimization of optical neural network as a binary optical trigger</article-title>
<alt-title alt-title-type="left-running-head">Zhao et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/aot.2024.1501208">10.3389/aot.2024.1501208</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhao</surname>
<given-names>Bokun</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2791528/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Dong</surname>
<given-names>Xuening</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2929145/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Rahbardar Mojaver</surname>
<given-names>Kaveh</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2423988/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Meyer</surname>
<given-names>Brett H.</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liboiron-Ladouceur</surname>
<given-names>Odile</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2930313/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff>
<institution>Department of Electrical and Computer Engineering</institution>, <institution>McGill University</institution>, <addr-line>Montreal</addr-line>, <addr-line>QC</addr-line>, <country>Canada</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2329707/overview">Roberto Morandotti</ext-link>, Universit&#xe9; du Qu&#xe9;bec, Canada</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2578849/overview">Lianghua Wen</ext-link>, Yibin University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2763925/overview">Shuang Chang</ext-link>, Vanderbilt University, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Bokun Zhao, <email>bokun.zhao@mail.mcgill.ca</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>13</volume>
<elocation-id>1501208</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>09</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Zhao, Dong, Rahbardar Mojaver, Meyer and Liboiron-Ladouceur.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Zhao, Dong, Rahbardar Mojaver, Meyer and Liboiron-Ladouceur</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Optical neural networks implemented with Mach-Zehnder Interferometer (MZI) arrays are a promising solution to enable fast and energy-efficient machine learning inference, yet finding a practical application has proven challenging due to sensitivity to thermal noise and loss. To leverage the distinct advantages of integrated optical processors while avoiding their shortcomings given the current state of optical computing, we propose the binary optical trigger as a promising field of application. Implementable as small-scale application-specific circuitry on edge devices, the binary trigger runs binary classification tasks and outputs binary signals to decide if a subsequent energy-intensive system should activate. Motivated by the limited task complexity, constrained area and power budgets of binary triggers, we perform 1) systematic, application-specific hardware pruning by physically removing specific MZIs, and 2) application-specific optimizations in the form of false negative reduction and weight quantization, as well as 3) sensitivity studies capturing the effect of imperfections in real optical components. The result is a customized MZI-mesh topology, MiniBokun Mesh, whose structure provides adequate performance and robustness for a targeted task complexity. We demonstrate in simulation that the pruning methodology achieves at least 50% less MZI usage compared to Clements and Reck meshes with the same input size, translating to at least between 4.6% and 24.2% savings in power consumption and a 40% reduction in physical circuitry footprint compared to other proposed unitary MZI topologies, sacrificing only a 1%&#x2013;2% drop in inference accuracy.</p>
</abstract>
<kwd-group>
<kwd>optical neural network</kwd>
<kwd>Mach-Zehnder interferometer</kwd>
<kwd>pruning</kwd>
<kwd>edge computing</kwd>
<kwd>event-based trigger</kwd>
</kwd-group>
<contract-num rid="cn001">RGPIN-2018-05668</contract-num>
<contract-num rid="cn001">RGPIN-2021-03480</contract-num>
<contract-sponsor id="cn001">Natural Sciences and Engineering Research Council of Canada<named-content content-type="fundref-id">10.13039/501100000038</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Optical processors are known for their fast, efficient computation-by-propagation and high energy efficiency. Applying optical processing to machine learning is particularly promising: while optical processors are sensitive to noise, crosstalk, and optical signal attenuation, machine learning is error-tolerant by definition, and benefits substantially from the low-power matrix-vector multiplication (MVM) made possible by optical neural networks (ONNs) (<xref ref-type="bibr" rid="B18">McMahon, 2023</xref>).</p>
<p>Previous studies on ONN focused on implementing arbitrary weight matrices (<xref ref-type="bibr" rid="B19">Miller, 2013</xref>; <xref ref-type="bibr" rid="B27">Shen et al., 2017</xref>; <xref ref-type="bibr" rid="B34">Zhang et al., 2021</xref>; <xref ref-type="bibr" rid="B3">Banerjee et al., 2023</xref>) similar to the multi-layer perceptron (MLP) (<xref ref-type="bibr" rid="B7">Delashmit et al., 2005</xref>) implemented on a digital computer through singular value decomposition (SVD). This is achieved by inserting a diagonal matrix [<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, implemented by one column of Mach-Zehnder Interferometers (MZIs)] between two unitary rotation matrices (<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="bold">U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, each implemented by a unitary MZI-mesh). <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
<mml:mi mathvariant="bold">U</mml:mi>
<mml:mi mathvariant="bold">&#x3a3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, a similar decomposition process proposed in <xref ref-type="bibr" rid="B35">Zhao et al. (2019)</xref> where <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi mathvariant="bold">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a sparse matrix implemented by a tree-like mesh, reduces component usage by 15%&#x2013;38%. However, challenges associated with scaling up the optical processors are significant (<xref ref-type="bibr" rid="B1">Al-Qadasi et al., 2022</xref>). A deep ONN, such as a three-layer decomposition-based structure, suffers from signal attenuation through the optical path and layer interfaces. This can be mitigated by using only one physical unitary mesh where one reprograms the network weight and loops back the signals to reuse the mesh during each inference. This approach, however, significantly increases the inference latency and compromises the main advantage of computation-by-propagation of an optical processor. On the other hand, a wide ONN with large matrix dimensions poses challenges in the calibration process, with error compounding along optical paths (<xref ref-type="bibr" rid="B21">Mojaver et al., 2023</xref>). A small ONN, on the other end, limits the dimension of the input vector and subsequently the potential field of application. For this reason, a tiled-based multiplier (<xref ref-type="bibr" rid="B13">Gu et al., 2020a</xref>; <xref ref-type="bibr" rid="B11">Feng et al., 2022</xref>) was proposed, where each multiplier with limited expressiveness implements one of the sub-matrices of the desired weight matrix. However, such approaches either require multiple copies of the multiplier or require iterations of weight reprogramming during inference when used for deep neural networks on complex multi-class classification tasks, suffering from similar hardware-reuse latency penalties.</p>
<p>To make the most of ONN&#x2019;s low power operation while avoiding the above-mentioned caveats, using the ONN in some efficiency-demanding edge computing tasks is a promising application. Ideally, the task should have low complexity so that a small-scale ONN can be employed. For this reason, we propose using the ONN as an activation trigger for any subsequent energy-intensive system in an edge environment. Similar to the multi-stage architecture for facial recognition proposed in (<xref ref-type="bibr" rid="B5">Bong et al., 2018</xref>) but implemented using optical components, the optical processor will act as an ultra-lightweight neural network that responds to a particular input event (i.e., specific objects appearing in the input image), while being more sophisticated than a conventional motion- or proximity-based detector (<xref ref-type="bibr" rid="B12">Gazivoda and Bilas, 2022</xref>) to avoid unnecessary activation caused by any input fluctuation, such as newly present objects of noninterest in the monitoring area. For example, consider a smart door lock facing a busy pedestrian street: the system ignores passersby, and only activates an energy-intensive system (e.g., face recognition for authentication) when someone directly faces the sensor. By triggering the subsequent complex system only when it is needed, energy consumption can be dramatically reduced.</p>
<p>With the target application in mind, further efforts can be explored to construct a tailored ONN for the task. First, the edge execution environment would benefit tremendously from reduced active component usage and reduced control circuitry bit precision that is constantly drawing power. Second, trade-offs can be made between the rate of false activation and the rate of trigger miss, depending on the specific application. To reduce the number of active components, we explored a pruning approach inspired by machine learning, where low-saliency components with minimal impact on overall system performance are removed from an initially over-parametrized optical neural network while maintaining prediction accuracy. Regarding application-specific trade-offs, we examined methods for reducing false negatives and quantization for the proposed binary optical trigger.</p>
<p>In this paper, given the ONN&#x2019;s fast and efficient computation ability yet with low scalability, we propose the binary optical trigger, a lightweight optical neural network designed for binary classification. The binary optical trigger has a structure similar to a traditional fully connected neural network but is composed of a mesh of MZIs, where the weight matrix is controlled by phase values programmed into the phase shifters. Our proposed ONN application and its associated optimizations diverge from previously reported efforts aimed at moderate classification tasks beyond MNIST, which often result in impractically large photonic circuits or extensive component reuse. Instead, our work focuses on binary classification tasks to trigger subsequent energy-intensive systems. Given the promising energy efficiency of ONNs, despite their early stage of development, this niche and innovative application effectively leverages their advantages in a practical, targeted manner. We then systematically explore the pruning of well-established unitary MZI topologies to optimize them as a trigger, leading to a new, application-specific topology named MiniBokun. Through simulation, we demonstrate that MiniBokun, when used as the binary optical trigger, prunes away at least 50% of the MZIs from a standard unitary at the cost of 1%&#x2013;2% accuracy impairment&#x2014;leading to a conservatively estimated power saving of 24% and an area reduction of 40%. The paper is structured as follows: we first cover the ONN background in <xref ref-type="sec" rid="s2">Section 2</xref>. Our experimental setup, application-specific optimization, pruning approach, phase noise considerations, and estimations regarding a physical system (power, latency and area) are described in <xref ref-type="sec" rid="s3">Section 3</xref>. 
The results, obtained through the methodology described in <xref ref-type="sec" rid="s3">Section 3</xref>, are presented and analyzed in <xref ref-type="sec" rid="s4">Section 4</xref>, followed by the conclusion in <xref ref-type="sec" rid="s5">Section 5</xref>.</p>
</sec>
<sec id="s2">
<title>2 Background</title>
<sec id="s2-1">
<title>2.1 MZI basics</title>
<p>Our optical processor adheres to an MZI-based architecture, taking advantage of its capability of realizing signed, complex-valued weights (<xref ref-type="bibr" rid="B23">Mourgias-Alexandris et al., 2022</xref>). The MZI-based neural network accelerator consists of a mesh of <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> reconfigurable MZI building blocks topologically arranged to form an optical processor unit, as shown at the top of <xref ref-type="fig" rid="F1">Figure 1</xref>. Each building block splits the optical signal and adjusts the relative phase difference through the internal phase shifter (<inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, colored in orange). Next, the phase of the recombined optical signal is programmed through the external phase shifter (<inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, colored in blue). The transformation matrix of a single MZI building block, <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, mapped to the optical processor can be expressed as<disp-formula id="e1">
<mml:math id="m10">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>j</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>sin</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi>cos</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>cos</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x2212;</mml:mo>
<mml:mi>sin</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>with <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. The output field is expressed as the multiplication of the transformation matrix and the input field, i.e., a matrix-vector multiplication. Each processor behaves as one fully connected layer with an equal number of inputs and outputs, though only covering unitary or sub-unitary space as opposed to arbitrary linear space.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Structure of an MZI and MZI-based ONN. <bold>(A)</bold> A <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Clements Topology. <bold>(B)</bold> A random subset of the MZIs, shown in transparency, is pruned away from <bold>(A)</bold>, forming a customized topology. <bold>(C)</bold> Specific Pruning of Clements to obtain MiniBokun Topology, for usage as a binary trigger. <bold>(D)</bold> MiniBokun Topology on its own with updated MZI numbering.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g001.tif"/>
</fig>
</sec>
<sec id="s2-2">
<title>2.2 MZI-based optical processor topology and mathematical model</title>
<p>Larger transformation matrices can be realized by organizing these <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> units in different topologies (one example is shown in <xref ref-type="fig" rid="F1">Figure 1A</xref>) such that each <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e1">Equation 1</xref> is multiplied and concatenated (<xref ref-type="bibr" rid="B28">Shokraneh et al., 2020</xref>). Well-established unitary topologies such as Reck (<xref ref-type="bibr" rid="B25">Reck et al., 1994</xref>), Clements (<xref ref-type="bibr" rid="B6">Clements et al., 2016</xref>), Diamond (<xref ref-type="bibr" rid="B28">Shokraneh et al., 2020</xref>) and Bokun (<xref ref-type="bibr" rid="B21">Mojaver et al., 2023</xref>) each have their advantage and drawback on physical footprint, calibration difficulty, and loss-balance properties (<xref ref-type="bibr" rid="B21">Mojaver et al., 2023</xref>).</p>
<p>Considering the specific topology, the transformation matrix, <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, represented by an ONN mesh of size <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, given the placement of the MZIs and the phase shifter value pair <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of each MZI in the mesh, can be defined by:<disp-formula id="e2">
<mml:math id="m18">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where,<disp-formula id="e3">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ef;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22f1;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ef;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22f0;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mo>&#x22ee;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ee;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ee;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ee;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22f0;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ef;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22f1;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mo>&#x22ef;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="center">
<mml:mn>1</mml:mn>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>In <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the total number of MZIs in the mesh (e.g., <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>45</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>21</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="fig" rid="F1">Figures 1A, D</xref>, respectively). The subscript <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes that the MZI&#x2019;s transformation matrix (i.e., <xref ref-type="disp-formula" rid="e1">Equation 1</xref>) occupies a 2-dimensional subspace within a <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> Hilbert space, as shown in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>. As an example, MZI 11 in <xref ref-type="fig" rid="F1">Figure 1A</xref> has its upper and lower branch aligned with waveguides 3 and 4, respectively, therefore, <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a matrix whose four entries located on row and column 3-4 are replaced by <inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, with 1s on the remaining diagonal entries and 0s elsewhere. Note that the waveguide number and matrix entries are zero-indexed.</p>
<p>At most <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> tunable parameters (i.e., phase shifters) are used in representing <inline-formula id="inf24">
<mml:math id="m27">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. However, an arbitrary <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> complex matrix requires <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> parameters. Consequently, as indicated by the SVD process and mentioned in <xref ref-type="sec" rid="s2-1">Section 2.1</xref>, <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is a unitary or sub-unitary transformation matrix as opposed to a linear one. This implies that the entries in the matrix are not completely independent, and thus the weight matrix has a smaller learnable space than a conventional fully-connected layer (<xref ref-type="bibr" rid="B19">Miller, 2013</xref>).</p>
<p>Given an arbitrary mesh formed by the removal of a subset of MZIs from a full <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> mesh (i.e., Clements topology shown in <xref ref-type="fig" rid="F1">Figure 1A</xref>), a more intuitive way of interpreting <xref ref-type="disp-formula" rid="e2">Equation 2</xref> is to group MZIs into vertical columns, as shown in <xref ref-type="fig" rid="F1">Figure 1B</xref>. The layer-wise transformation performed by MZI column <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is denoted by <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> will be a matrix similar to <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, but with potentially more than one <inline-formula id="inf33">
<mml:math id="m36">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> block on the diagonal being replaced by transformation matrices of the MZIs existing on that column. <inline-formula id="inf34">
<mml:math id="m37">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> can thus alternatively be written as:<disp-formula id="e4">
<mml:math id="m38">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x220f;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>The interpretation in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> gives insight into the transformation performed by each MZI column while providing a clearer picture of how each MZI is ordered in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>.</p>
</sec>
<sec id="s2-3">
<title>2.3 Signal basics</title>
<sec id="s2-3-1">
<title>2.3.1 Phasor term</title>
<p>A phasor is a scalar, complex value sufficient to describe the steady state of a mono-frequency sinusoidal waveform. In the context of an optical signal, this represents the electric field component of the monochromatic laser. A phasor term takes the form of:<disp-formula id="e5">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>E</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>In <xref ref-type="disp-formula" rid="e5">Equation 5</xref>, <inline-formula id="inf35">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the phasor term for signal <inline-formula id="inf36">
<mml:math id="m41">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf37">
<mml:math id="m42">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is its real amplitude, and <inline-formula id="inf38">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the incoming phase seen by the subsequent optical component on that signal&#x2019;s path. The transformation matrices of size <inline-formula id="inf39">
<mml:math id="m44">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, representing the effect of any combination of optical components, apply directly to the vector composed of <inline-formula id="inf40">
<mml:math id="m45">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> incoming signals&#x2019; phasors. As an example, the incoming signals <inline-formula id="inf41">
<mml:math id="m46">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf42">
<mml:math id="m47">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to an MZI&#x2019;s input ports will become <inline-formula id="inf43">
<mml:math id="m48">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>u</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf44">
<mml:math id="m49">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>u</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> at the output ports, related by <xref ref-type="disp-formula" rid="e6">Equation 6</xref>:<disp-formula id="e6">
<mml:math id="m50">
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>O</mml:mi>
<mml:mi>u</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>O</mml:mi>
<mml:mi>u</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>I</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mi>I</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Value representation and importance of coherency</title>
<p>Due to difficulties of controlling the absolute phase of optical signal (<xref ref-type="bibr" rid="B15">Ip et al., 2008</xref>), the incoming data (i.e., feature vectors of each data sample) will be solely represented by the intensity <inline-formula id="inf45">
<mml:math id="m51">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of the signal, given by <inline-formula id="inf46">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x221d;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. This implies that, in an array of incoming signals represented as:<disp-formula id="e7">
<mml:math id="m53">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mo>&#x20d7;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mtable class="matrix">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mo>&#x2026;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>In <xref ref-type="disp-formula" rid="e7">Equation 7</xref>, only the <inline-formula id="inf47">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>in</mml:mtext>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> terms vary across different data, and each <inline-formula id="inf48">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> takes the same value in the range <inline-formula id="inf49">
<mml:math id="m56">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> for all <inline-formula id="inf50">
<mml:math id="m57">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mn>1,2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. In theory, the zero phase difference ensures each MZI can split power to an arbitrary ratio between its two outputs <inline-formula id="inf51">
<mml:math id="m58">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out1</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out2</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. To achieve this, a layer of phase shifters is assumed to be present before the MZI mesh, although this is omitted in the diagram.</p>
<p>For the correct functioning of any trained network, not only the zero phase difference across input channels is required, but the absolute phase of each input signal should also remain constant throughout the network&#x2019;s training and operation. As the MZI mesh works by the principle of interference, an incoherent or varying initial phase difference between signals will affect the intended splitting ratio learned from network training, making the resulting output signal array drastically different from the expectation.</p>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 Imperfect operation</title>
<p>The actual implementation of optical processors faces various aspects of imperfections, and the presence of imperfections significantly degrades the computation accuracy of ONNs (<xref ref-type="bibr" rid="B26">Shafiee et al., 2024</xref>; <xref ref-type="bibr" rid="B14">Gu et al., 2020b</xref>). In this work, our investigation of the impacts of imperfections focuses on two main sources, optical loss and phase value programming deviations.</p>
<sec id="s2-4-1">
<title>2.4.1 Optical loss</title>
<p>During ONN inference, when light couples through the waveguide, the processor suffers from inherent propagation loss. The propagation loss eventually leads to a challenging optical power budget, limiting the signal-to-noise ratio at the photodetectors and reducing the classification accuracy of ONN. The linear loss values <inline-formula id="inf52">
<mml:math id="m59">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>linear</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, transformed from loss in dB-scale <inline-formula id="inf53">
<mml:math id="m60">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dB</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> by <inline-formula id="inf54">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>linear</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dB</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, are applied to the ONNs and remain constant on a per-MZI basis. The lossy transformation matrix can be expressed as<disp-formula id="e8">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>linear</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-4-2">
<title>2.4.2 Phase shifter programming deviation</title>
<p>The programmed phase shift can deviate from its intended value due to thermal crosstalk (<xref ref-type="bibr" rid="B26">Shafiee et al., 2024</xref>). When programming a targeted waveguide, the heat from resistive heaters can propagate to other waveguides, creating unintended phase changes. To capture these imperfections, we model the programmed phases with a Gaussian distribution <inline-formula id="inf55">
<mml:math id="m63">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> where <inline-formula id="inf56">
<mml:math id="m64">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> [rad] are phases obtained after training and quantization and <inline-formula id="inf57">
<mml:math id="m65">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are the phase variations due to thermal crosstalk.</p>
</sec>
</sec>
<sec id="s2-5">
<title>2.5 Neural network pruning</title>
<p>In practice, pruning often implies the removal of neurons and weighted connections in a structured or unstructured fashion (<xref ref-type="bibr" rid="B24">Nagel et al., 2021</xref>). For neural networks (NNs) implemented by digital processors, network pruning has been known for its benefits of simplifying NN&#x2019;s architecture, reducing computation workload and memory footprint, and subsequently improving inference speed and efficiency. ONNs, on the other hand, though composed of physically integrated photonics components (e.g., MZIs), benefit from an analogous set of advantages (<xref ref-type="bibr" rid="B3">Banerjee et al., 2023</xref>). First, pruning in the hardware context means the direct removal of photonic integrated circuit (PIC) components. The feasibility of the layout is not only subject to the number of on-chip components but also complicated by the requirement of a voltage supply line to each active component (e.g., for the thermo-optic phase shifter). As the number of components grows, this poses a significant challenge for the layout routability and manufacturability in a two-dimensional circuit board. The removal of PIC components immediately reduces layout complexity and manufacturing costs. Second, each component introduces loss to the propagating optical signal to various extents. Reducing the number of components on one optical path reduces the total amount of accumulated loss experienced by that signal, improving signal-to-noise ratio (SNR) at detection. Third, reducing the number of active components naturally leads to less power consumption during operation.</p>
<p>The pruning of MZI-based ONNs was explored in previous works. <xref ref-type="bibr" rid="B3">Banerjee et al. (2023)</xref> introduced a pruning algorithm and its variants targeting large-scale SVD-based ONNs for multi-class classification. The algorithm is demonstrated via simulation on networks comprising at least four unitary meshes connected by a non-linear activation function, with 64 as the minimum network width. In particular, their pruning is realized via power-gating or removal of phase shifters, not the entire MZI. This implies that imprecise beam splitters are still present in the actual physical system. Training-time structured pruning was also conducted in the tile-based ONN, such as the block-circulant unit in <xref ref-type="bibr" rid="B13">Gu et al. (2020a)</xref>. However, to the best of our knowledge, no direct MZI-level pruning on well-established unitary meshes was explored. Specifically, we focused on removing entire MZIs from a unitary structure rather than power-gating active components in SVD-based setups or setups involving component reuse. Our pruning strategy enables a reduction in optical depth and insertion loss compared to these previous configurations. Though unitary meshes already have limited expressivity compared to an arbitrary linear weight matrix, our study showed that the application as a binary optical trigger allowed for an ultra-lightweight ONN that is pruned into deep sub-unitary space without significantly affecting classification accuracy.</p>
</sec>
</sec>
<sec sec-type="materials|methods" id="s3">
<title>3 Materials and methods</title>
<sec id="s3-1">
<title>3.1 Neuroptica</title>
<p>We use Neuroptica (<xref ref-type="bibr" rid="B4">Bartlett et al., 2019</xref>) to evaluate the simulated performance of our ONN architectures. Neuroptica is a Python simulation platform for coherent optical neural networks built with integrated components, such as MZI. The platform allows one to explore ONN architecture design, <italic>ex-situ</italic> ONN training (<xref ref-type="bibr" rid="B21">Mojaver et al., 2023</xref>), and noise/loss robustness simulation of trained ONNs. In addition to the components simulated within the ONN mesh area, we assume the presence of a laser source and variable optical attenuators at the ONN&#x2019;s input side to produce feature values for each data sample, though these components are not explicitly simulated.</p>
<sec id="s3-1-1">
<title>3.1.1 Hyperparameter selection and training</title>
<p>We first evaluate networks with input sizes <inline-formula id="inf58">
<mml:math id="m66">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of 8, 16, 32, and 64 to understand the required network complexity given the two datasets under consideration (to be introduced in <xref ref-type="sec" rid="s3-2">Section 3.2</xref>). We perform each simulation (training followed by evaluation with the test set) five times with five different random seeds. Each model with a random seed is trained for 50 epochs, and the phase values obtained from the epoch that gives the highest validation accuracy are kept for final tests. Except for special pruned cases, the first two output ports (<inline-formula id="inf59">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf60">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="fig" rid="F1">Figure 1A</xref>) are used for final decision calculations. We use the test set to calculate each model&#x2019;s test accuracy and F1 score. While test accuracy provides an overall measure of the model&#x2019;s ability to classify test samples, the F1 score offers a balanced assessment of the model&#x2019;s performance in correctly predicting both positive and negative classes in a binary classification task.</p>
<p>The limited size of the ONN we are evaluating and the resolution of the images in the selected datasets mean that data must first be compressed in some way prior to inference. Therefore, we use Principal Components Analysis (PCA) to perform dimensionality reduction. Mathematically, PCA maps <inline-formula id="inf61">
<mml:math id="m69">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional data to a <inline-formula id="inf62">
<mml:math id="m70">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional subspace <inline-formula id="inf63">
<mml:math id="m71">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x226a;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> by finding the eigenvectors that best represent the feature distributions in the data. These eigenvectors are decided by sorting their corresponding singular values obtained from SVD. The higher the singular value, the more variance in data points in the direction of the eigenvector, making the eigenvector more representative. The top-<inline-formula id="inf64">
<mml:math id="m72">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> eigenvectors are combined and multiplied with the original data matrix to complete the transformation, resulting in <inline-formula id="inf65">
<mml:math id="m73">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> features that are used as the input signal to the <inline-formula id="inf66">
<mml:math id="m74">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> ONN.</p>
<p>We make two assumptions about the input range of ONN based on laser power consumption. The first assumption assumes a fixed per-channel laser input range (<inline-formula id="inf67">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>i</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> mW per channel). This ensures the same input range to all channels regardless of the input feature size <inline-formula id="inf68">
<mml:math id="m76">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and the resultant overall laser power increases with <inline-formula id="inf69">
<mml:math id="m77">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. On the other hand, the second assumption fixes the total laser power to 10&#xa0;mW. For each input channel, <inline-formula id="inf70">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>i</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> mW. As a result, the per-channel input range will decrease as <inline-formula id="inf71">
<mml:math id="m79">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> increases.</p>
<p>With these training setups, training time ranged from two to three minutes for <inline-formula id="inf72">
<mml:math id="m80">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> meshes to nearly half an hour for <inline-formula id="inf73">
<mml:math id="m81">
<mml:mrow>
<mml:mn>64</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> meshes on an Apple M2 Pro Processor (10 cores, 16&#xa0;GB memory and 200&#xa0;GB/s memory bandwidth).</p>
</sec>
</sec>
<sec id="s3-2">
<title>3.2 Datasets</title>
<sec id="s3-2-1">
<title>3.2.1 MNIST</title>
<p>The MNIST dataset (<xref ref-type="bibr" rid="B9">Deng, 2012</xref>) consists of <inline-formula id="inf74">
<mml:math id="m82">
<mml:mrow>
<mml:mn>70,000</mml:mn>
<mml:mspace width="0.3333em"/>
<mml:mn>28</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>28</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> grayscale images of handwritten digits 0-9. Given that our focus is binary classification, we modify the 10-class MNIST dataset by aggregating samples with labels <inline-formula id="inf75">
<mml:math id="m83">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and labels <inline-formula id="inf76">
<mml:math id="m84">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>9</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> into samples with labels <inline-formula id="inf77">
<mml:math id="m85">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf78">
<mml:math id="m86">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. Compared to using any two out of the 10 classes, this approach allows us to maximally utilize the available dataset and test the networks&#x2019; generalizability across diverse samples while avoiding biased task complexity caused by choosing two specific classes out of ten. The dataset is split <inline-formula id="inf79">
<mml:math id="m87">
<mml:mrow>
<mml:mn>50,000</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>10,000</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>10,000</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to form the training, validation, and test sets.</p>
</sec>
<sec id="s3-2-2">
<title>3.2.2 CIFAR-10</title>
<p>The CIFAR-10 dataset (<xref ref-type="bibr" rid="B16">Krizhevsky and Hinton, 2009</xref>) contains <inline-formula id="inf80">
<mml:math id="m88">
<mml:mrow>
<mml:mn>32</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> images in 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck), each with three colour channels (red, green, and blue). Similar to how we process the MNIST dataset, we rearrange the CIFAR-10 labels to make the classification binary by aggregating the original label of &#x201c;airplanes&#x201d;, &#x201c;cars&#x201d;, &#x201c;ships&#x201d;, and &#x201c;trucks&#x201d; into a new group called &#x201c;vehicles&#x201d;; &#x201c;birds&#x201d;, &#x201c;cats&#x201d;, &#x201c;deer&#x201d;, and &#x201c;dogs&#x201d; into a new group called &#x201c;animals&#x201d;. The images originally labelled as &#x201c;frog&#x201d; and &#x201c;horse&#x201d; are removed from the dataset to ensure the balance between data samples in the two classes. This reduces the total image number to 48,000 with 24,000 images in each category. The dataset is split <inline-formula id="inf81">
<mml:math id="m89">
<mml:mrow>
<mml:mn>32,000</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>8,000</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>8,000</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to form the training, validation, and test sets.</p>
</sec>
<sec id="s3-2-3">
<title>3.2.3 Task complexity and pruning efficacy</title>
<p>In both datasets, the original images have sufficient pixels to clearly depict the represented objects. This ensures that the complexity of any formulated task comes from the intrinsic difficulty of distinguishing objects across different classes rather than from low image resolution. Depending on the specific task complexity and the degree of over-parametrization in the network model, varying levels of pruning can be carried out. As a result, our aggregated classification tasks of both datasets provide meaningful task complexity and serve as effective benchmarks for evaluating the ONN capability and the efficacy of the pruning process.</p>
</sec>
</sec>
<sec id="s3-3">
<title>3.3 Application-specific optimization</title>
<p>Apart from a grid search of hyperparameters, we consider the following application-specific optimization methods to further enhance the performance of the models. These optimization methods focus on actual implementation challenges and adapt the trained models to real-world conditions.</p>
<sec id="s3-3-1">
<title>3.3.1 False negative reduction</title>
<p>The binary optical trigger structure is anticipated to be used in event-triggered structures, where the ONN activates the rest of a system when a pre-defined event takes place (e.g., a vehicle is detected by ONN after training on the CIFAR-10 dataset). A key challenge in the implementation of such a system is the optical trigger false negatives: the pre-defined event happens, but the ONN does not send a trigger signal, and the rest of the system fails by default. Therefore, one goal of our work is to minimize the number of False Negatives (FN) while maintaining the classification accuracy of ONN.</p>
<p>The FN reduction method considered in this work changes the weight assigned to each label class in the loss function. We penalize FN more severely, and the binary cross-entropy loss becomes<disp-formula id="e9">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>BCE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mtext>log</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mtext>log</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf82">
<mml:math id="m91">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a constant greater than 1, and <inline-formula id="inf83">
<mml:math id="m92">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is the output from classifier <inline-formula id="inf84">
<mml:math id="m93">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> after Sigmoid activation. Consequently, the gradient of the loss function with respect to the network weight <inline-formula id="inf85">
<mml:math id="m94">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> becomes<disp-formula id="e10">
<mml:math id="m95">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>BCE</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x2202;</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf86">
<mml:math id="m96">
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the input to the ONN layer and the complex conjugate is taken for complex-valued neural networks.</p>
<p>During implementations in this work, in order to strike a trade-off between FN reduction and the classification performance, we train ONN models with different weights <inline-formula id="inf87">
<mml:math id="m97">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> assigned to the positive class and record their effects on the FN numbers and the test accuracy.</p>
</sec>
<sec id="s3-3-2">
<title>3.3.2 Post-training quantization (PTQ)</title>
<p>Programming the MZI-based building block of ONNs involves configuring their phase shifters to form a desired transfer matrix. In this work, we consider MZIs with phase shifters controlled by thermally changing the phase using resistive heaters tuned by a voltage supply (<xref ref-type="bibr" rid="B17">Masood et al., 2013</xref>). The relationship between the heater control voltage <inline-formula id="inf88">
<mml:math id="m98">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">bias</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and the intended phase shift <inline-formula id="inf89">
<mml:math id="m99">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> can be formulated as<disp-formula id="e11">
<mml:math id="m100">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">bias</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where <inline-formula id="inf90">
<mml:math id="m101">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c0;</mml:mi>
<mml:mo>/</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B14">Gu et al., 2020b</xref>) and <inline-formula id="inf91">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> refers to the required bias voltage for programming <inline-formula id="inf92">
<mml:math id="m103">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c0;</mml:mi>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B28">Shokraneh et al., 2020</xref>).</p>
<p>Practical voltage sources have limited resolution, meaning they can be adjusted only to a finite number of discrete voltage levels equally spaced between the maximum and minimum values. A <inline-formula id="inf93">
<mml:math id="m104">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-bit voltage supply has <inline-formula id="inf94">
<mml:math id="m105">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> achievable voltage levels spaced apart by <inline-formula id="inf95">
<mml:math id="m106">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>res</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with the <inline-formula id="inf96">
<mml:math id="m107">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th voltage level being <inline-formula id="inf97">
<mml:math id="m108">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>res</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> volts. <inline-formula id="inf98">
<mml:math id="m109">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the maximum supply voltage. As we require a phase setting range over <inline-formula id="inf99">
<mml:math id="m110">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (<inline-formula id="inf100">
<mml:math id="m111">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>), all the voltage levels beyond <inline-formula id="inf101">
<mml:math id="m112">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are not used and the effective bit-resolution of the voltage supply further drops by <inline-formula id="inf102">
<mml:math id="m113">
<mml:mrow>
<mml:mo>&#x230a;</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>log</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x230b;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. The resultant quantized phases are obtained by mapping the sampled voltage levels <inline-formula id="inf103">
<mml:math id="m114">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> back to <inline-formula id="inf104">
<mml:math id="m115">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">bias</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e11">Equation 11</xref>.</p>
<p>Models with selected hyperparameters from the previous steps are quantized by rounding the trained phase shifts to their nearest quantized phase values. We choose the lowest voltage resolution (in <inline-formula id="inf105">
<mml:math id="m116">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>4,16</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> bits) that enables the closest ONN test accuracy to those obtained from full resolution (32-bit) training settings.</p>
</sec>
</sec>
<sec id="s3-4">
<title>3.4 Hardware pruning</title>
<p>In this section, we will use the <inline-formula id="inf106">
<mml:math id="m117">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Clements topology in <xref ref-type="fig" rid="F1">Figure 1A</xref> as a running example to demonstrate a systematic way of performing pre-training pruning of MZI-based ONN. We begin with the Clements mesh, proven to be optimally unitary for its short optical depth and balanced path length (<xref ref-type="bibr" rid="B6">Clements et al., 2016</xref>). First, an important observation is that two output ports are sufficient for carrying out binary training and inferences. As indicated in <xref ref-type="fig" rid="F1">Figure 1A</xref>, regardless of which two ports are used, certain MZIs will remain redundant, as they never receive backward propagating optical gradient signals at either output port, regardless of the states (cross or bar) of other MZIs in the mesh. We refer to these as redundant MZIs. The phase shifters in these MZIs remain at their initialization state and do not contribute to the classification process at all. This observation motivates two aspects for potential improvement: 1) the port choices, and 2) the removal of corresponding redundant MZIs. Given the importance of optical path balance, signals contributing to decision-making should propagate along paths with the same or similar number of MZIs. Typically, the longest optical path in an MZI mesh equals the number of MZI columns (i.e., signals go through one MZI in the current vertical column to arrive at the next column), and conversely, the shortest path &#x201c;falls through&#x201d; as many MZI layers without actually going through any MZIs as possible. In the case of the <inline-formula id="inf107">
<mml:math id="m118">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Clements, the shortest path lies on waveguide 0 and waveguide 9 <inline-formula id="inf108">
<mml:math id="m119">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>shortest</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
<mml:mo>,</mml:mo>
<mml:mtext>longest</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, using any of the output ports on these waveguides will subject the signal to the maximally imbalanced condition. Furthermore, waveguides located at the edge of the network mesh have only one side for redirecting optical power (towards the center waveguides). In light of the above two considerations, using the central two output ports and pruning away corresponding redundant MZIs (MZI 1, 4, 5 and 9 in <xref ref-type="fig" rid="F1">Figure 1C</xref>) achieves minimal path imbalance <inline-formula id="inf109">
<mml:math id="m120">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mtext>shortest</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>longest</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>7,10</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and unbiased utilization of the whole expressible space provided by the available optical components.</p>
<p>Upon deciding on port selection and pruning of redundant MZIs, we then perform <bold>layer-wise pruning</bold>. At each pruning step, we monitor the network performance by performing the same training and testing process and record the testing accuracy. The layer-wise pruning stops when we obtain a minimal network topology that still ensures all input signals are able to reach the center two waveguides. This topology consists of MZIs colored in blue in <xref ref-type="fig" rid="F1">Figure 1C</xref>, which is a triangle mesh that marginally allows the diversion of optical power from top/bottom waveguides to the center waveguides; any further removal of MZIs on this topology will either result in wasted waveguide channels, or isolation between two halves of the input vector causing unwanted dependence in the network&#x2019;s decision making.</p>
<sec id="s3-4-1">
<title>3.4.1 Expressivity study (fidelity analysis)</title>
<p>To gain insight into the trade-off between reducing component usage and disruption in network expressivity, we look for a suitable metric to evaluate the pruned mesh&#x2019;s expressivity. In previous works, the concept of fidelity was employed (<xref ref-type="bibr" rid="B11">Feng et al., 2022</xref>; <xref ref-type="bibr" rid="B34">Zhang et al., 2021</xref>) for evaluating the similarity between two complex density matrices. Similar metrics include the Frobenius norm, cosine similarity, and correlation coefficients. However, we note the unsuitability of a simple similarity metric in our particular case, as the goal of pruning is not to produce an optical mesh capable of approximating the original unitary matrix. Rather, given the relative simplicity of the binary trigger task and monitoring of only two entries in the output vector, fully unitary ONNs are likely over-parameterized, and completely different sets of optimal weights may exist in the sub-unitary space that have little to no relation to the unitary weight matrices while producing similar classification accuracy.</p>
<p>For this reason, we employ a sampling-based benchmark to evaluate the signal routing ability of <inline-formula id="inf110">
<mml:math id="m121">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> sub-unitary meshes: 10,000 random sub-unitary weight matrices implemented by MiniBokun mesh are generated, each is provided with three sets of random input vectors (10 per set), plus an input vector whose power is equally distributed to all ports. Although random, the vectors in each of the three sets have distinct optical power concentrations at specific input ports (with the highest power at ports 0, 3, and 5, respectively). The input vectors are assumed to be coherent with an absolute phase of 0&#xa0;rad, and the total input power is fixed at 10&#xa0;mW. For each topology under test, the power distribution at each output port when subjecting the ONN to the aforementioned artificial input vectors is recorded. The expressivity of each topology can then be inferred based on the attained distribution.</p>
</sec>
</sec>
<sec id="s3-5">
<title>3.5 Imperfection study (sensitivity analysis)</title>
<p>All the previous training and tuning of ONNs are conducted with the assumption of perfect operating conditions. However, in reality, ONNs suffer from various aspects of imperfections. To test the resilience of ONNs to imperfections, we inject and vary the magnitude of the optical loss and phase programming deviations to trained ONN models and check their response. The optical loss, defined at the dB-scale <inline-formula id="inf111">
<mml:math id="m122">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dB</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, varies from 0 to 1&#xa0;dB per MZI. It is converted to the linear scale and applied to the transfer matrices of the MZI using <xref ref-type="disp-formula" rid="e8">Equation 8</xref>. The phase programming deviations are defined as the phase deviations <inline-formula id="inf112">
<mml:math id="m123">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="sec" rid="s2-4-2">Section 2.4.2</xref>, varying from 0 to 1 radian. We sample the deviated phase values from the normal distribution and recalculate transfer matrices with them. Next, the imperfect transfer matrices are applied to ONNs, and we obtain the ONNs&#x2019; test accuracy under imperfect conditions by re-performing the inference. To fully capture the models&#x2019; response to stochastic phase programming deviations, we sample 20 different <inline-formula id="inf113">
<mml:math id="m124">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf114">
<mml:math id="m125">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> values per phase deviation and take the average of the accuracy.</p>
<p>To quantify the tolerance of ONNs to imperfections, we define two Figures of Merits (FoMs) on two sets of imperfect scenarios. The first imperfect scenario assumes only phase programming deviations (<bold>P</bold>hi-<bold>T</bold>heta case), <inline-formula id="inf115">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf116">
<mml:math id="m127">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> vary separately and <inline-formula id="inf117">
<mml:math id="m128">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dB</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is kept at 0&#xa0;dB. The first FoM is defined as the number of <inline-formula id="inf118">
<mml:math id="m129">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf119">
<mml:math id="m130">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> combinations that lead to a test accuracy greater than 60%, times the surface area covered by each <inline-formula id="inf120">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf121">
<mml:math id="m132">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> combination in <inline-formula id="inf122">
<mml:math id="m133">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mtext>rad</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>]. This hard limit (60% test accuracy) is defined as our boundary of random guesses.</p>
<p>The second imperfect scenario (<bold>L</bold>oss-<bold>P</bold>hase <bold>U</bold>ncertainty case) considers both optical loss and phase deviations. We assume <inline-formula id="inf123">
<mml:math id="m134">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and compute a second FoM by multiplying the number of (<inline-formula id="inf124">
<mml:math id="m135">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf125">
<mml:math id="m136">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>dB</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) combinations that lead to a test accuracy greater than 60% with the surface area covered by each combination in [rad<inline-formula id="inf126">
<mml:math id="m137">
<mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>dB].</p>
</sec>
<sec id="s3-6">
<title>3.6 Power, latency, and area estimations</title>
<p>The power consumption estimation of ONN takes into account power consumed by the laser, the memory for storing phase shifter values and input, the digital-to-analog and analog-to-digital conversions, optical input modulation, phase programming, and output optical-electrical signal conversion, as expressed in <xref ref-type="disp-formula" rid="e12">Equation 12</xref>.<disp-formula id="e12">
<mml:math id="m138">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>total</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>laser</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>mem</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>DAC</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>input_mod</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>phase_prog</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>O</mml:mtext>
<mml:mo>-</mml:mo>
<mml:mtext>E</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>comp</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>ADC</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>Similarly, the latency of one inference on ONN considers the time spent when the electrical and optical signal propagates through the system during one pass of calculation (or one inference). Assuming an always-on laser, the latency is expressed as:<disp-formula id="e13">
<mml:math id="m139">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>total</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>input_mem</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>DAC</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>input_mod</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>phase_mem</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>DAC</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>phase_prog</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>O</mml:mtext>
<mml:mo>-</mml:mo>
<mml:mtext>E</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>comp</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>ADC</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>In <xref ref-type="disp-formula" rid="e13">Equation 13</xref>, <inline-formula id="inf127">
<mml:math id="m140">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MZI</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the time light travels through the MZIs in an optical mesh. The input modulation and phase programming steps can be parallelized as the data are fetched from different memory locations and there is no sharing of components along the data path. We keep the greater time spent by the two processes for latency calculation.</p>
<p>The area estimations focus on the layout area of the optical meshes, containing only the MZIs and their connecting waveguides. The length of the mesh is determined by the maximum number of MZIs and waveguide sections connected in series, and the width is the separation distance between waveguides times the number of gaps between input/output ports.</p>
<p>We account for a 10 dBm C-band laser with a wall plug efficiency of 10% (<xref ref-type="bibr" rid="B1">Al-Qadasi et al., 2022</xref>). This single laser source provides sufficient optical power to all input ports of ONNs while meeting the minimum required optical power sensitivity of the photodetector. The modulation of laser input (or the electrical-to-optical, EO conversion) is assumed to operate at approximately 20 fJ/bit with a rate of 2.5&#xa0;Gb/s (<xref ref-type="bibr" rid="B8">Demirkiran et al., 2023</xref>).</p>
<p>The phase programming power estimation is divided into two scenarios: a conservative estimation using doped-Si heaters without insulation and an aggressive estimation considering heaters with thermal insulation trenches (formed by deep etching) (<xref ref-type="bibr" rid="B17">Masood et al., 2013</xref>). Without the insulation, the heaters consume <inline-formula id="inf128">
<mml:math id="m141">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>21</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> mW per <inline-formula id="inf129">
<mml:math id="m142">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> phase shift with a stabilization time of less than 30&#xa0;&#x3bc;s (<xref ref-type="bibr" rid="B28">Shokraneh et al., 2020</xref>); meanwhile, with the insulation, the heater responsivity improves to <inline-formula id="inf130">
<mml:math id="m143">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>1.42</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> mW per <inline-formula id="inf131">
<mml:math id="m144">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> phase shift yet the settling time extends to more than 150&#xa0;&#x3bc;s (<xref ref-type="bibr" rid="B17">Masood et al., 2013</xref>). We assume each <inline-formula id="inf132">
<mml:math id="m145">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> MZI has a length of <inline-formula id="inf133">
<mml:math id="m146">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>300</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> &#x3bc;m with phase shifters of 135&#xa0;&#x3bc;m (<xref ref-type="bibr" rid="B28">Shokraneh et al., 2020</xref>), and the waveguides are separated by 60&#xa0;&#x3bc;m (<xref ref-type="bibr" rid="B32">Williamson et al., 2020</xref>). The effective index <inline-formula id="inf134">
<mml:math id="m147">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>eff</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of MZIs is 2.8.</p>
<p>The input to ONN is fixed at 8-bit resolution while the phase value resolution will be determined by the post-training quantization. The estimations of digital-to-analog converters (DACs) power consumption patterns are also done in two ways: 1) a conservative FoM-based performance approximation which allows us to consider high-speed DACs (data rate &#x3d; 10&#xa0;GSamples/s) (<xref ref-type="bibr" rid="B8">Demirkiran et al., 2023</xref>), and 2) an aggressive performance estimation using established commercial products with low power consumption.</p>
<p>The optical-to-electrical (OE) circuit at each output port contains a photodetector with a responsivity of 1 A/W and a trans-impedance amplifier. Each channel of OE conversion consumes 100&#xa0;mW of power with a group delay of 100&#xa0;ps (<xref ref-type="bibr" rid="B32">Williamson et al., 2020</xref>). The subsequent comparator and analog-to-digital converter (ADC) circuit requires only binary resolution and consumes only 325 <inline-formula id="inf135">
<mml:math id="m148">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3bc;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>W of power with a propagation delay of 75&#xa0;ns (<xref ref-type="bibr" rid="B30">Texas Instruments, 2018</xref>).</p>
<p>According to <xref ref-type="bibr" rid="B1">Al-Qadasi et al. (2022)</xref>, the input to the ONN is stored in DRAMs while the phase values to be programmed are stored in SRAMs. The ONNs considered in this work are small, with each of them containing less than 256 bytes in total for both input and phase values. Despite this, we set the SRAM size to 16&#xa0;KB and the DRAM size to 64&#xa0;KB to sufficiently hold more than 100 copies of ONNs and a few thousand input samples after dimensionality reduction. The power and latency numbers are calculated based on modeling data from Cacti 7.0 (<xref ref-type="bibr" rid="B31">Thoziyoor et al., 2008</xref>).</p>
</sec>
</sec>
<sec sec-type="results|discussion" id="s4">
<title>4 Results and discussion</title>
<sec id="s4-1">
<title>4.1 Architectural analysis of optical meshes</title>
<p>The architectural parameters of the three topologies are summarized in <xref ref-type="table" rid="T1">Table 1</xref>. Among the three topologies, the MiniBokun shown in <xref ref-type="fig" rid="F1">Figure 1D</xref>, resulting from the pruning process, achieved minimum component usage while demonstrating a size-invariant path length difference of only two MZIs.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Architectural Parameters of Different Topologies of size <inline-formula id="inf136">
<mml:math id="m149">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Topology</th>
<th align="center">Number of MZIs</th>
<th align="center">Optical path length [Min, Max]</th>
<th align="center">Number of redundant MZIs</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Clements</td>
<td align="center">
<inline-formula id="inf137">
<mml:math id="m150">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf138">
<mml:math id="m151">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf139">
<mml:math id="m152">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> <inline-formula id="inf140">
<mml:math id="m153">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Reck</td>
<td align="center">
<inline-formula id="inf141">
<mml:math id="m154">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf142">
<mml:math id="m155">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1,2</mml:mn>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf143">
<mml:math id="m156">
<mml:mrow>
<mml:mn mathvariant="bold">0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">MiniBokun</td>
<td align="center">
<inline-formula id="inf144">
<mml:math id="m157">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">8</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">10</mml:mn>
<mml:mi mathvariant="bold">N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn mathvariant="bold">32</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf145">
<mml:math id="m158">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="bold">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="bold">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
<td align="center">
<inline-formula id="inf146">
<mml:math id="m159">
<mml:mrow>
<mml:mn mathvariant="bold">0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s4-1-1">
<title>4.1.1 Pruning with accuracy monitoring</title>
<p>Following the method discussed in <xref ref-type="sec" rid="s3-4">Section 3.4</xref>, the monitored average binary MNIST accuracy per <inline-formula id="inf147">
<mml:math id="m160">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> network is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. As indicated by the plateauing part of the curve, up to four MZI columns can be pruned with less than <inline-formula id="inf148">
<mml:math id="m161">
<mml:mrow>
<mml:mn>0.5</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> accuracy drop; pruning six layers leads to the minimal triangle topology discussed in <xref ref-type="sec" rid="s3-4">Section 3.4</xref>, in which significant accuracy degradation is observed. We thus restore two pruned columns, and remove the top and bottom MZIs in the left-most restored column (MZI <inline-formula id="inf149">
<mml:math id="m162">
<mml:mrow>
<mml:mn>23,27</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="fig" rid="F1">Figure 1C</xref>, or equivalently, MZI <inline-formula id="inf150">
<mml:math id="m163">
<mml:mrow>
<mml:mn>19,23</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="fig" rid="F2">Figure 2</xref>) to create diagonal access concerning paths <inline-formula id="inf151">
<mml:math id="m164">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf152">
<mml:math id="m165">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>9</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in practical chip-calibration (<xref ref-type="bibr" rid="B21">Mojaver et al., 2023</xref>). With the above steps, we obtain the MiniBokun Topology.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Accuracy variation as the Clements topology is pruned column by column, starting with the column closest to the input.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g002.tif"/>
</fig>
</sec>
<sec id="s4-1-2">
<title>4.1.2 The MiniBokun topology</title>
<p>Similar to the full-size Bokun Mesh proposed in (<xref ref-type="bibr" rid="B21">Mojaver et al., 2023</xref>), MiniBokun provides diagonal access for all MZIs in the mesh for practical calibration consideration, yet with no MZI wasted due to being used solely for calibration purposes. Two simple observations can be made for a sufficient formal definition of MiniBokun topology, regardless of network size <inline-formula id="inf153">
<mml:math id="m166">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>:<list list-type="simple">
<list-item>
<p>&#x2022; There are always two MZI columns before the widest column, each containing <inline-formula id="inf154">
<mml:math id="m167">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf155">
<mml:math id="m168">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> MZIs.</p>
</list-item>
<list-item>
<p>&#x2022; The last MZI column always contains two MZIs.</p>
</list-item>
</list>
</p>
<p>The placement of each MZI is thus well-defined, and the number of MZIs in an <inline-formula id="inf156">
<mml:math id="m169">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-input MiniBokun mesh is <inline-formula id="inf157">
<mml:math id="m170">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>10</mml:mn>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, as shown in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
</sec>
<sec id="s4-1-3">
<title>4.1.3 Expressivity analysis</title>
<p>Using the benchmark method presented in <xref ref-type="sec" rid="s3-4-1">Section 3.4.1</xref>, we perform statistical analysis on the collected power distribution at each output port for <inline-formula id="inf158">
<mml:math id="m171">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> 1) Clements (Unitary), 2) Triangle (Over-Pruned) and 3) MiniBokun topologies; the Reck topology is omitted as it covers the same unitary space as Clements. The box plot of the result is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Expressivity of three different meshes. Blue, red and green box plots denote the registered power distribution at each of the eight output ports for Clements, Triangle and MiniBokun mesh. The port number in red indicates the corresponding input port had the highest optical power among all input ports, while the last row of figures denotes the case of equally distributed power among the input (at 1.25&#xa0;mW per port). The whiskers on each box instance, from top to bottom, are: {max, third quartile, median, first quartile, min}.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g003.tif"/>
</fig>
<p>As we are not assuming any loss, the average total output power, as expected, sums to 10&#xa0;mW, which matches the total input power assumption. All tested topology-input combinations managed to achieve near 0&#xa0;mW output in all output ports. The maximum registered maximum power difference among tested samples were 1.330, 6.205 and 4.080&#xa0;mW for Clements (port 0), triangle (port 3) and MiniBokun (port 5), respectively. Unitary structures such as the Clements mesh provide full signal routing between any input-output waveguide pairs, thus giving a relatively uniform power distribution profile across each port, even when facing input with power concentrated on particular ports. On the other hand, sub-unitary topologies provide limited signal routing paths; in the triangle topology, given the imbalanced number of MZIs across different paths and the unbiased random phase setting, the biased weight space manifests as mismatching of maximum detected output power across different outputs, as well as the varying interquartile ranges. In particular, the low maximum power on edge ports (0, 7) indicates an impaired ability to discard unwanted power as part of the inference process. MiniBokun topology also shows such bias in its power distribution, but to a lighter extent, attributable to the two additional columns providing extra paths to disregard power from inputs 1 through 6, leading to a larger expressible space. This can be validated by the classification accuracy difference between a triangle mesh and a MiniBokun mesh in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
</sec>
</sec>
<sec id="s4-2">
<title>4.2 Performance of optical meshes in ONN</title>
<sec id="s4-2-1">
<title>4.2.1 Hyperparameter selection</title>
<p>We observe that topology sizes of <inline-formula id="inf159">
<mml:math id="m172">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf160">
<mml:math id="m173">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> work best for the binary optical trigger, especially under the assumption of fixed input laser power.</p>
<p>Under the assumption of constant per-channel input power, all three metrics, the accuracy of each model on both validation and test set and the F1 score, increase as <inline-formula id="inf161">
<mml:math id="m174">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> increases. As shown in <xref ref-type="fig" rid="F4">Figure 4A</xref>, on average, the accuracy of the model prediction on the MNIST dataset improves by an absolute 6.9% as <inline-formula id="inf162">
<mml:math id="m175">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> reaches 64. The models&#x2019; accuracy on the CIFAR-10 dataset, which is more complex than MNIST, only increases by an absolute 2.2%, as shown in <xref ref-type="fig" rid="F4">Figure 4B</xref>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>The variation in test accuracy of ONN models with different topology and input power assumptions trained on <bold>(A)</bold> MNIST dataset, and <bold>(B)</bold> CIFAR-10 dataset. The same set of legends is used in both figures.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g004.tif"/>
</fig>
<p>When the assumption changes to fixed laser power, the actual per-channel input power range decreases as the optical mesh scales up. As <inline-formula id="inf163">
<mml:math id="m176">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> grows from 8 to 64, the maximum input optical power <inline-formula id="inf164">
<mml:math id="m177">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to a channel drops from 1.25&#xa0;mW to 0.16&#xa0;mW. This input range reduction significantly undermines the ability of larger ONNs to learn. As seen in <xref ref-type="fig" rid="F4">Figure 4</xref>, although the MNIST test accuracy increases with <inline-formula id="inf165">
<mml:math id="m178">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the magnitude of the growth in all models drops to an average maximum of 4.5%. On the CIFAR-10 dataset, model accuracy starts to drop after <inline-formula id="inf166">
<mml:math id="m179">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and eventually falls below the accuracy of <inline-formula id="inf167">
<mml:math id="m180">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>The increase in ONN classification accuracy with <inline-formula id="inf168">
<mml:math id="m181">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is subject to the perfect operating conditions assumed in the simulations. In reality, the optical loss of an ONN increases linearly with its size and becomes especially significant when <inline-formula id="inf169">
<mml:math id="m182">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>32</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B26">Shafiee et al., 2024</xref>). Moreover, the power consumption of configuring the phase values increases quadratically with the optical mesh sizes (<xref ref-type="bibr" rid="B1">Al-Qadasi et al., 2022</xref>). Finally, under the fixed channel input power assumption, the laser input power obviously increases linearly with the number of input channels. Therefore, given our goal of finding a robust model that balances the overall accuracy and power efficiency, only ONNs of <inline-formula id="inf170">
<mml:math id="m183">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf171">
<mml:math id="m184">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> trained with the fixed laser power assumption are considered.</p>
</sec>
<sec id="s4-2-2">
<title>4.2.2 Application-specific optimization</title>
<p>The weighted class method effectively reduces the number of FNs made by ONNs after training. As shown in <xref ref-type="fig" rid="F5">Figures 5A, C</xref>, the FN count decreases from more than 1,000 to fewer than 10 as <inline-formula id="inf172">
<mml:math id="m185">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e9">Equations 9</xref>, <xref ref-type="disp-formula" rid="e10">10</xref> increases from 1 to 2. Increasing <inline-formula id="inf173">
<mml:math id="m186">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> forces the model to make more positive predictions. Subsequently, there are more false positives, and the overall test accuracy drops, as shown in <xref ref-type="fig" rid="F5">Figures 5B, D</xref>. In this case, sharp declines in overall test accuracy are observed for MNIST <inline-formula id="inf174">
<mml:math id="m187">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>1.4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and CIFAR-10 <inline-formula id="inf175">
<mml:math id="m188">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>1.2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. To ensure a balance between the decrease in FN and the drop in accuracy, we finally selected <inline-formula id="inf176">
<mml:math id="m189">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1,1.4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for the rest of our discussions. Models trained with <inline-formula id="inf177">
<mml:math id="m190">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in this range achieve at most a 75% reduction in FN but less than a 5% drop in accuracy.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>
<bold>(A)</bold> The variation in false negative numbers and <bold>(B)</bold> the test accuracy of ONN models with different topology and input sizes on the MNIST dataset as a result of the weighted class method. <bold>(C)</bold> The variation in false negative numbers and <bold>(D)</bold> the test accuracy of ONN models with different topology and input sizes on the CIFAR-10 dataset as a result of the weighted class method.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g005.tif"/>
</fig>
<p>We also find that an 8-bit voltage supply resolution is sufficient for models with the selected hyperparameters to achieve similar accuracy to those trained with full precision (32-bit), using a voltage supply setting of <inline-formula id="inf178">
<mml:math id="m191">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>4</mml:mn>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.92</mml:mn>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B28">Shokraneh et al., 2020</xref>). According to <xref ref-type="fig" rid="F6">Figure 6</xref>, ONN accuracy increases significantly as the voltage supply resolution increases from 4 to 8 bits and gradually converges to the full-resolution test accuracy. At the 8-bit resolution point, most <inline-formula id="inf179">
<mml:math id="m192">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> models show <inline-formula id="inf180">
<mml:math id="m193">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.5</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> deviation from the full-resolution accuracy while most <inline-formula id="inf181">
<mml:math id="m194">
<mml:mrow>
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> models show <inline-formula id="inf182">
<mml:math id="m195">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.8</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> deviation. Therefore, we assume an 8-bit voltage supply resolution for the rest of our discussions.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>The change in test accuracy of ONN models as a result of post-training quantization <bold>(A)</bold> on the MNIST dataset with N &#x3d; 8, <bold>(B)</bold> on the MNIST dataset with N &#x3d; 16, and <bold>(C)</bold> on the CIFAR-10 dataset with N &#x3d; 8, <bold>(D)</bold> on the CIFAR-10 dataset with N &#x3d; 16. The same legend is kept across all four figures.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g006.tif"/>
</fig>
</sec>
<sec id="s4-2-3">
<title>4.2.3 Impact of pruning on classification performance</title>
<p>Based on the selected hyperparameters and optimization parameters, we summarized the accuracy and F1 score of all the models with different topologies in <xref ref-type="table" rid="T2">Table 2</xref>. The numbers labeled in bold are the best-performing topology in each category.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Test accuracy and F1 score of different topologies with different hyperparameters.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="3" align="center">Topology</th>
<th rowspan="3" align="center">
<inline-formula id="inf183">
<mml:math id="m196">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="4" align="center">
<inline-formula id="inf184">
<mml:math id="m197">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="4" align="center">
<inline-formula id="inf185">
<mml:math id="m198">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<th colspan="2" align="center">MNIST</th>
<th colspan="2" align="center">CIFAR-10</th>
<th colspan="2" align="center">MNIST</th>
<th colspan="2" align="center">CIFAR-10</th>
</tr>
<tr>
<th align="center">Accuracy [%]</th>
<th align="center">F1 score</th>
<th align="center">Accuracy [%]</th>
<th align="center">F1 score</th>
<th align="center">Accuracy [%]</th>
<th align="center">F1 score</th>
<th align="center">Accuracy [%]</th>
<th align="center">F1 score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Clements</td>
<td rowspan="3" align="center">1</td>
<td align="center">71.58</td>
<td align="center">70.62</td>
<td align="center">
<bold>72.53</bold>
</td>
<td align="center">71.67</td>
<td align="center">74.46</td>
<td align="center">73.90</td>
<td align="center">
<bold>72.87</bold>
</td>
<td align="center">
<bold>71.34</bold>
</td>
</tr>
<tr>
<td align="center">Reck</td>
<td align="center">
<bold>72.16</bold>
</td>
<td align="center">
<bold>71.92</bold>
</td>
<td align="center">71.46</td>
<td align="center">
<bold>71.72</bold>
</td>
<td align="center">73.86</td>
<td align="center">
<bold>75.25</bold>
</td>
<td align="center">71.76</td>
<td align="center">68.68</td>
</tr>
<tr>
<td align="center">MiniBokun</td>
<td align="center">70.53</td>
<td align="center">71.70</td>
<td align="center">72.12</td>
<td align="center">70.18</td>
<td align="center">
<bold>74.77</bold>
</td>
<td align="center">74.62</td>
<td align="center">71.65</td>
<td align="center">69.84</td>
</tr>
<tr>
<td align="center">Clements</td>
<td rowspan="3" align="center">1.2</td>
<td align="center">71.16</td>
<td align="center">
<bold>72.19</bold>
</td>
<td align="center">
<bold>72.40</bold>
</td>
<td align="center">
<bold>72.41</bold>
</td>
<td align="center">73.98</td>
<td align="center">74.66</td>
<td align="center">71.83</td>
<td align="center">71.97</td>
</tr>
<tr>
<td align="center">Reck</td>
<td align="center">
<bold>71.17</bold>
</td>
<td align="center">71.93</td>
<td align="center">72.13</td>
<td align="center">70.22</td>
<td align="center">74.74</td>
<td align="center">74.46</td>
<td align="center">
<bold>72.49</bold>
</td>
<td align="center">70.36</td>
</tr>
<tr>
<td align="center">MiniBokun</td>
<td align="center">70.37</td>
<td align="center">72.12</td>
<td align="center">71.62</td>
<td align="center">71.84</td>
<td align="center">
<bold>74.87</bold>
</td>
<td align="center">
<bold>74.86</bold>
</td>
<td align="center">71.26</td>
<td align="center">
<bold>72.07</bold>
</td>
</tr>
<tr>
<td align="center">Clements</td>
<td rowspan="3" align="center">1.4</td>
<td align="center">68.47</td>
<td align="center">73.37</td>
<td align="center">67.14</td>
<td align="center">72.32</td>
<td align="center">68.20</td>
<td align="center">74.30</td>
<td align="center">64.99</td>
<td align="center">71.84</td>
</tr>
<tr>
<td align="center">Reck</td>
<td align="center">
<bold>69.62</bold>
</td>
<td align="center">
<bold>73.48</bold>
</td>
<td align="center">
<bold>69.12</bold>
</td>
<td align="center">
<bold>72.92</bold>
</td>
<td align="center">
<bold>72.23</bold>
</td>
<td align="center">
<bold>75.71</bold>
</td>
<td align="center">
<bold>67.09</bold>
</td>
<td align="center">
<bold>72.30</bold>
</td>
</tr>
<tr>
<td align="center">MiniBokun</td>
<td align="center">67.78</td>
<td align="center">73.08</td>
<td align="center">65.17</td>
<td align="center">71.52</td>
<td align="center">69.37</td>
<td align="center">74.90</td>
<td align="center">63.42</td>
<td align="center">71.16</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate the best statistically significant results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In the <inline-formula id="inf186">
<mml:math id="m199">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> case, despite having fewer programmable phase shifters due to the small input size and the pre-training pruning performed, the MiniBokun mesh only experiences a 1.57% drop in accuracy and a 0.7% drop in F1 score on average compared with other meshes. Even with the slightly undermined learning ability, the performance gap between the best-performing model and MiniBokun is not large. MiniBokun mesh preserves a good balance in classifying both the positive and the negative classes of a dataset.</p>
<p>The performance gap between MiniBokun and the other meshes further closes when <inline-formula id="inf187">
<mml:math id="m200">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> increases to 16. The average performance degradation drops to 1.42% in accuracy and 0.63% in F1 score. In certain cases, the MiniBokun mesh outperforms the other two in terms of both accuracy and F1 score.</p>
</sec>
</sec>
<sec id="s4-3">
<title>4.3 Sensitivity analysis</title>
<p>
<xref ref-type="fig" rid="F7">Figure 7</xref> shows the tolerance of investigated topologies towards phase shifter noise and propagation loss in optical components. The models subjected to the analysis are trained with an FN reduction factor <inline-formula id="inf188">
<mml:math id="m201">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of 1.2. The software-based FN reduction method does not alter the physical parameters of ONNs and hence does not impact the imperfection tolerance of an optical mesh with a certain topology and input size, thus the following observations remain consistent on models trained with <inline-formula id="inf189">
<mml:math id="m202">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf190">
<mml:math id="m203">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>The FoMs of three network topologies: <bold>(A)</bold> <inline-formula id="inf191">
<mml:math id="m204">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> ONNs, PT Plots, <bold>(B)</bold> <inline-formula id="inf192">
<mml:math id="m205">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> ONNs, LPU Plots, <bold>(C)</bold> <inline-formula id="inf193">
<mml:math id="m206">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> ONNs, PT Plots, <bold>(D)</bold> <inline-formula id="inf194">
<mml:math id="m207">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> ONNs, LPU Plots.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g007.tif"/>
</fig>
<p>In contrast to <inline-formula id="inf195">
<mml:math id="m208">
<mml:mrow>
<mml:mn>8</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> meshes, all <inline-formula id="inf196">
<mml:math id="m209">
<mml:mrow>
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> meshes show weaker tolerance in both PT and LPU analysis due to the accumulation of phase error in longer optical paths. An average of 52.3% decrease in PT FoM area is observed over all topologies and both datasets when moving from <inline-formula id="inf197">
<mml:math id="m210">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf198">
<mml:math id="m211">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and a 49.4% decrease for LPU FoM.</p>
<p>In almost all cases except for the LPU analysis for <inline-formula id="inf199">
<mml:math id="m212">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> network trained on the MNIST dataset, MiniBokun, thanks to the reduced number of components and balanced optical paths, shows greater tolerance of physical component imperfection. Overall, combining mesh sizes and datasets, MiniBokun&#x2019;s average improvement to PT and LPU FoMs is 66.9% and 36.3% over Clements, respectively.</p>
<p>These improvements in the FoMs suggest that the increase in individual weight importance that comes naturally with a pruned neural network is negligible for topologies used in this study. The original network is over-parameterized enough for the pruning benefits to outweigh the errors imposed on the high-saliency phase shifter values.</p>
</sec>
<sec id="s4-4">
<title>4.4 Power, latency, and area estimations</title>
<p>
<xref ref-type="table" rid="T3">Table 3</xref> summarizes the power, latency, and area consumed by each topology with different sizes. Note that the &#x201c;conservative&#x201d; and &#x201c;aggressive&#x201d; estimations refer to the overall power consumption. Both the ONN input and the phase value programming require 8-bit DACs (<xref ref-type="bibr" rid="B29">Texas Instruments, 2013</xref>) after PTQ.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Power, latency, and area estimations of different topologies and mesh sizes.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">N</th>
<th rowspan="2" align="center">Topology</th>
<th colspan="2" align="center">Aggressive</th>
<th colspan="2" align="center">Conservative</th>
<th rowspan="2" align="center">&#x23;MZIs</th>
<th rowspan="2" align="center">Area (<inline-formula id="inf200">
<mml:math id="m213">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>mm</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>)</th>
</tr>
<tr>
<th align="center">Power [mW]</th>
<th align="center">Latency [&#x3bc;s]</th>
<th align="center">Power [mW]</th>
<th align="center">Latency [&#x3bc;s]</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="center">8</td>
<td align="center">Clements</td>
<td align="center">406.2</td>
<td align="center">154.1</td>
<td align="center">738.6</td>
<td align="center">30.1</td>
<td align="center">28</td>
<td align="center">1.0</td>
</tr>
<tr>
<td align="center">Reck</td>
<td align="center">406.2</td>
<td align="center">154.1</td>
<td align="center">738.6</td>
<td align="center">30.1</td>
<td align="center">28</td>
<td align="center">1.6</td>
</tr>
<tr>
<td align="center">MiniBokun</td>
<td align="center">
<bold>387.4</bold>
</td>
<td align="center">154.1</td>
<td align="center">
<bold>560.1</bold>
</td>
<td align="center">30.1</td>
<td align="center">14</td>
<td align="center">
<bold>0.6</bold>
</td>
</tr>
<tr>
<td rowspan="3" align="center">16</td>
<td align="center">Clements</td>
<td align="center">534.9</td>
<td align="center">154.2</td>
<td align="center">1,930.0</td>
<td align="center">30.2</td>
<td align="center">120</td>
<td align="center">4.3</td>
</tr>
<tr>
<td align="center">Reck</td>
<td align="center">534.9</td>
<td align="center">154.2</td>
<td align="center">1,930.0</td>
<td align="center">30.2</td>
<td align="center">120</td>
<td align="center">7.8</td>
</tr>
<tr>
<td align="center">MiniBokun</td>
<td align="center">
<bold>438.4</bold>
</td>
<td align="center">
<bold>154.1</bold>
</td>
<td align="center">
<bold>1,012.0</bold>
</td>
<td align="center">
<bold>30.1</bold>
</td>
<td align="center">48</td>
<td align="center">
<bold>2.4</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate the best statistically significant results.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<sec id="s4-4-1">
<title>4.4.1 Power and latency consumption</title>
<p>The pruned MiniBokun mesh has demonstrated a strong capability in reducing overall power consumption. Compared to the conventional Clements and Reck topology, the MiniBokun mesh saves 4.6% power in the aggressive case and 24.2% in the conservative case when <inline-formula id="inf201">
<mml:math id="m214">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. These numbers further grow to 18.0% and 47.6% when <inline-formula id="inf202">
<mml:math id="m215">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> increases to 16. On the other hand, the benefits of pruning in saving latency are insignificant, only up to 0.1&#xa0;&#x3bc;s when <inline-formula id="inf203">
<mml:math id="m216">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, as the latency of the slowest components (phase programming) is large <italic>per se</italic> and invariant to the topology.</p>
<p>If we take a closer look at the component-wise power and latency consumption, the phase value programming dominates both calculations. Assuming uniform phase distribution, the programming power is directly proportional to the number of MZIs in a mesh (<xref ref-type="bibr" rid="B1">Al-Qadasi et al., 2022</xref>). Without insulation (the &#x201c;conservative&#x201d; approach), the programming power can take up to 39.8% of the total power consumption in Reck and Clements topology when <inline-formula id="inf204">
<mml:math id="m217">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (as shown in <xref ref-type="fig" rid="F8">Figure 8A</xref>), and this proportion continues to grow as the size of the optical mesh increases. Using the pruning strategy introduced in this work, we can effectively reduce the number of MZIs in the optical mesh by more than half. This subsequently relaxes the power requirement for programming the phase values and reduces the proportion it takes in the total power consumption, as shown in <xref ref-type="fig" rid="F8">Figure 8B</xref>. The power savings by pruning become more evident when the optical meshes scale up, as indicated by the growing gap between the two lines in <xref ref-type="fig" rid="F8">Figure 8C</xref>. Using insulated heaters with smaller <inline-formula id="inf205">
<mml:math id="m218">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (the &#x201c;aggressive&#x201d; approach) can effectively reduce the overall power consumption. However, this comes at a cost of <inline-formula id="inf206">
<mml:math id="m219">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> more time spent on the programming stage.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>The power breakdown of <bold>(A)</bold> Clements/Reck topology, <bold>(B)</bold> the MiniBokun topology with <inline-formula id="inf207">
<mml:math id="m220">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and the &#x201c;conservative&#x201d; assumption. <bold>(C)</bold> The variation in programming power (Si-doped heater without insulation) with respect to the change in the optical mesh size. <bold>(D)</bold> The change in the layout area of optical meshes with respect to the change in the optical mesh size.</p>
</caption>
<graphic xlink:href="aot-13-1501208-g008.tif"/>
</fig>
<p>In the latency calculation, the pruned MiniBokun mesh effectively shortens the optical path length that light propagates through and lowers the number of memory reads by reducing the number of phase values to be programmed. However, these savings (<inline-formula id="inf208">
<mml:math id="m221">
<mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mn>0.06</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> &#x3bc;s in total) are comparatively negligible to the programming time itself (<inline-formula id="inf209">
<mml:math id="m222">
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>30,150</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> &#x3bc;s). Disregarding the programming latency, the speed of the optical mesh computation is bottlenecked by the electrical ADCs and DACs. As the pruning technique does not alter the parameters of these devices, the speed of the MiniBokun mesh is still limited by them.</p>
</sec>
<sec id="s4-4-2">
<title>4.4.2 Layout area</title>
<p>The pruning strategy significantly reduces the area of the optical mesh by placing fewer MZIs horizontally along the optical path. As shown in <xref ref-type="table" rid="T3">Table 3</xref>, the MiniBokun mesh employs 50% fewer MZIs when <inline-formula id="inf210">
<mml:math id="m223">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and 60% fewer MZIs when <inline-formula id="inf211">
<mml:math id="m224">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Subsequently, the optical path length reduces from <inline-formula id="inf212">
<mml:math id="m225">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in Reck mesh and <inline-formula id="inf213">
<mml:math id="m226">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in Clements mesh to <inline-formula id="inf214">
<mml:math id="m227">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in MiniBokun, bringing the total layout area down by at least 40%. As shown in <xref ref-type="fig" rid="F8">Figure 8D</xref>, when the optical mesh size increases, the area of the MiniBokun mesh grows less rapidly than the other two topologies. When reaching the <inline-formula id="inf215">
<mml:math id="m228">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> limit, it saves 73.6% and 48.4% layout area when compared to the Clements and Reck topologies, respectively.</p>
<p>For practical deployment, a typical smart lock uses a microcontroller comparable in size to an Arduino chip (<xref ref-type="bibr" rid="B2">Arduino, 2014</xref>), implying area constraints on the order of several square centimeters (<xref ref-type="bibr" rid="B22">Motwani et al., 2021</xref>). In contrast, modern smartphone processors, facing stricter area limitations, typically have a footprint of over 100&#xa0;mm<sup>2</sup> (<xref ref-type="bibr" rid="B33">Yang et al., 2024</xref>). By comparison, our estimated mesh area is 2.4&#xa0;mm<sup>2</sup> for the <inline-formula id="inf216">
<mml:math id="m229">
<mml:mrow>
<mml:mn>16</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> MiniBokun, which meets the area constraint of both of these target applications.</p>
</sec>
</sec>
<sec id="s4-5">
<title>4.5 Limitations and future work</title>
<p>Imperfect operating conditions are obstacles to the deployment of proposed systems in real-world applications. In this work, we characterize the resilience of different topologies to two sources of error: optical loss and phase deviations. To achieve a more comprehensive evaluation of the model performance in the future, the model needs to take into account more factors, including the direct impact of fabrication non-uniformity (<xref ref-type="bibr" rid="B20">Mirza et al., 2022</xref>), input phase mismatch (<xref ref-type="bibr" rid="B10">Fang et al., 2019</xref>), and other sources of crosstalk (<xref ref-type="bibr" rid="B26">Shafiee et al., 2024</xref>).</p>
<p>The proposed pruning strategy is tested with well-established image classification datasets, and its performance is compared with existing optical mesh topologies. Although these datasets are sufficiently complex to provide insights into how the ONN trades off accuracy against power/area consumption, the comparisons lack real-world proximity. Future work will explore the use of datasets closer to the actual implementation of the binary optical trigger, for example, face recognition systems (<xref ref-type="bibr" rid="B5">Bong et al., 2018</xref>). These experiments can better reveal the benefits of the pruning strategy and the optical trigger structure itself when compared with existing digital electronic products.</p>
<p>Alternative pruning strategies, such as train-time or post-training pruning, consider parameter saliency before deciding which components to remove. However, further investigation is needed to assess these alternatives. Although these methods could offer comparable power savings while reducing the accuracy loss, they have the potential to create sparse network meshes with less clustered MZI removal and thus may provide limited area savings compared to the current pre-training pruning approach.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>In this work, we propose a pre-training pruning strategy over established optical processor topology subject to the binary optical trigger structure. Motivated by the need for a low-power binary trigger to support machine learning at the edge of the Internet, the pruned structure, &#x201c;MiniBokun&#x201d; mesh, removed at least 50% of MZIs from a standard unitary topology and shortened the optical path length by half. The effect of pruning was tested with the binarized version of two benchmark datasets, MNIST and CIFAR-10, in which we only observed 1%&#x2013;2% accuracy degradation and less than 1% drop in F1 score compared to the unpruned Clements and Reck topologies. In consideration of the practical deployment environment, the impact of limited voltage control precision and the robustness of ONNs toward component imperfections were investigated via weight quantization and a sensitivity study. The MiniBokun mesh showed <inline-formula id="inf217">
<mml:math id="m230">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>30</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf218">
<mml:math id="m231">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>60</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> improvement in phase error and loss tolerance, respectively, while reducing the physical footprint of the mesh by <inline-formula id="inf219">
<mml:math id="m232">
<mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>40</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. With the removal of MZIs, an estimated <inline-formula id="inf220">
<mml:math id="m233">
<mml:mrow>
<mml:mn>4.6</mml:mn>
<mml:mi>%</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>24.2</mml:mn>
<mml:mi>%</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> power saving is achieved.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://github.com/Xoreus/neuroptica/tree/6c56736010dcfc271724b10a34c849fed349a598">https://github.com/Xoreus/neuroptica/tree/6c56736010dcfc271724b10a34c849fed349a598</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>BZ: Conceptualization, Data curation, Methodology, Software, Visualization, Writing&#x2013;original draft, Writing&#x2013;review and editing. XD: Data curation, Methodology, Software, Visualization, Writing&#x2013;original draft, Writing&#x2013;review and editing, Conceptualization. KR: Supervision, Writing&#x2013;review and editing. BM: Supervision, Writing&#x2013;review and editing. OL-L: Supervision, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This research is supported by Natural Science and Engineering Research Council of Canada (NSERC) through grants RGPIN-2018-05668 and RGPIN-2021-03480.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al-Qadasi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chrostowski</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shastri</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Shekhar</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Scaling up silicon photonic-based accelerators: challenges and opportunities</article-title>. <source>Apl. Photonics</source> <volume>7</volume>, <fpage>020902</fpage>. <pub-id pub-id-type="doi">10.1063/5.0070992</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="book">
<collab>Arduino</collab> (<year>2014</year>). <source>Nano 33 imu sensor</source>. <publisher-loc>Monza, MB, Italy</publisher-loc>: <publisher-name>Arduino s.r.l</publisher-name>. <comment>[apparatus and software]</comment>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Banerjee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nikdast</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pasricha</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chakrabarty</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Pruning coherent integrated photonic neural networks</article-title>. <source>IEEE J. Sel. Top. Quantum Electron.</source> <volume>29</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/JSTQE.2023.3242992</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Bartlett</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Minkov</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hughes</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Williamson</surname>
<given-names>I. A. D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Neuroptica: flexible simulation package for optical neural networks</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/fancompute/neuroptica">https://github.com/fancompute/neuroptica</ext-link>.</comment>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bong</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>H.-J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A low-power convolutional neural network face recognition processor and a cis integrated with always-on face detector</article-title>. <source>IEEE J. Solid-State Circuits</source> <volume>53</volume>, <fpage>115</fpage>&#x2013;<lpage>123</lpage>. <pub-id pub-id-type="doi">10.1109/JSSC.2017.2767705</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Clements</surname>
<given-names>W. R.</given-names>
</name>
<name>
<surname>Humphreys</surname>
<given-names>P. C.</given-names>
</name>
<name>
<surname>Metcalf</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Kolthammer</surname>
<given-names>W. S.</given-names>
</name>
<name>
<surname>Walsmley</surname>
<given-names>I. A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Optimal design for universal multiport interferometers</article-title>. <source>Optica</source> <volume>3</volume>, <fpage>1460</fpage>&#x2013;<lpage>1465</lpage>. <pub-id pub-id-type="doi">10.1364/OPTICA.3.001460</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Delashmit</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Missiles</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Manry</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2005</year>). &#x201c;<article-title>Recent developments in multilayer perceptron neural networks</article-title>,&#x201d; in <source>Proceedings of the seventh annual memphis area engineering and science conference, MAESC</source>, <volume>7</volume>, <fpage>33</fpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Demirkiran</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Eris</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Elmhurst</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Harris</surname>
<given-names>N. C.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>An electro-photonic system for accelerating deep neural networks</article-title>. <source>J. Emerg. Technol. Comput. Syst.</source> <volume>19</volume>, <fpage>1</fpage>&#x2013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1145/3606949</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>The mnist database of handwritten digit images for machine learning research [best of the web]</article-title>. <source>IEEE Signal Process. Mag.</source> <volume>29</volume>, <fpage>141</fpage>&#x2013;<lpage>142</lpage>. <pub-id pub-id-type="doi">10.1109/msp.2012.2211477</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname>
<given-names>M. Y.-S.</given-names>
</name>
<name>
<surname>Manipatruni</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wierzynski</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Khosrowshahi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>DeWeese</surname>
<given-names>M. R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Design of optical neural networks with component imprecisions</article-title>. <source>Opt. Express</source> <volume>27</volume>, <fpage>14009</fpage>&#x2013;<lpage>14029</lpage>. <pub-id pub-id-type="doi">10.1364/oe.27.014009</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ying</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>D. Z.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>A compact butterfly-style silicon photonic&#x2013;electronic neural chip for hardware-efficient deep learning</article-title>. <source>ACS Photonics</source> <volume>9</volume>, <fpage>3906</fpage>&#x2013;<lpage>3916</lpage>. <pub-id pub-id-type="doi">10.1021/acsphotonics.2c01188</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gazivoda</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bilas</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Always-on sparse event wake-up detectors: a review</article-title>. <source>IEEE Sensors J.</source> <volume>22</volume>, <fpage>8313</fpage>&#x2013;<lpage>8326</lpage>. <pub-id pub-id-type="doi">10.1109/JSEN.2022.3162319</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R. T.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>D. Z.</given-names>
</name>
</person-group> (<year>2020a</year>). &#x201c;<article-title>Towards area-efficient optical neural networks: an fft-based architecture</article-title>,&#x201d; in <source>2020 25th asia and south pacific design automation conference (ASP-DAC)</source>, <fpage>476</fpage>&#x2013;<lpage>481</lpage>. <pub-id pub-id-type="doi">10.1109/ASP-DAC47756.2020.9045156</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R. T.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>D. Z.</given-names>
</name>
</person-group> (<year>2020b</year>). &#x201c;<article-title>Roq: a noise-aware quantization scheme towards robust optical neural networks with low-bit controls</article-title>,&#x201d; in <source>Date</source> (<publisher-name>IEEE</publisher-name>), <fpage>1586</fpage>&#x2013;<lpage>1589</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ip</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Lau</surname>
<given-names>A. P. T.</given-names>
</name>
<name>
<surname>Barros</surname>
<given-names>D. J. F.</given-names>
</name>
<name>
<surname>Kahn</surname>
<given-names>J. M.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Coherent detection in optical fiber systems</article-title>. <source>Opt. Express</source> <volume>16</volume>, <fpage>753</fpage>&#x2013;<lpage>791</lpage>. <pub-id pub-id-type="doi">10.1364/OE.16.000753</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Krizhevsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Learning multiple layers of features from tiny images</source>.</citation>
</ref>
<ref id="B17">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Masood</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pantouvaki</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lepage</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Verheyen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Van Campenhout</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Absil</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). &#x201c;<article-title>Comparison of heater architectures for thermal control of silicon photonic circuits</article-title>,&#x201d; in <source>10th international conference on group IV photonics</source>, <fpage>83</fpage>&#x2013;<lpage>84</lpage>.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McMahon</surname>
<given-names>P. L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>The physics of optical computing</article-title>. <source>Nat. Rev. Phys.</source> <volume>5</volume>, <fpage>717</fpage>&#x2013;<lpage>734</lpage>. <pub-id pub-id-type="doi">10.1038/s42254-023-00645-5</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miller</surname>
<given-names>D. A. B.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Self-configuring universal linear optical component [invited]</article-title>. <source>Phot. Res.</source> <volume>1</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1364/PRJ.1.000001</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mirza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shafiee</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Banerjee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chakrabarty</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Pasricha</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nikdast</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Characterization and optimization of coherent mzi-based nanophotonic neural networks under fabrication non-uniformity</article-title>. <source>IEEE Trans. Nanotechnol.</source> <volume>21</volume>, <fpage>763</fpage>&#x2013;<lpage>771</lpage>. <pub-id pub-id-type="doi">10.1109/TNANO.2022.3223915</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mojaver</surname>
<given-names>K. H. R.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Leung</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Safaee</surname>
<given-names>S. M. R.</given-names>
</name>
<name>
<surname>Liboiron-Ladouceur</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Addressing the programming challenges of practical interferometric mesh based optical processors</article-title>. <source>Opt. Express</source> <volume>31</volume>, <fpage>23851</fpage>&#x2013;<lpage>23866</lpage>. <pub-id pub-id-type="doi">10.1364/OE.489493</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Motwani</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Seth</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dixit</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bagubali</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rajesh</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Multifactor door locking systems: a review</article-title>. <source>Mater. Today Proc.</source> <volume>46</volume>, <fpage>7973</fpage>&#x2013;<lpage>7979</lpage>. <pub-id pub-id-type="doi">10.1016/j.matpr.2021.02.708</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mourgias-Alexandris</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Moralis-Pegios</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tsakyridis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Simos</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dabos</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Totovic</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Noise-resilient and high-speed deep learning with coherent silicon photonics</article-title>. <source>Nat. Commun.</source> <volume>13</volume>, <fpage>5572</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-022-33259-z</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Nagel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fournarakis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Amjad</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Bondarenko</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Van Baalen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Blankevoort</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2021</year>). <source>A white paper on neural network quantization</source>. <comment>arXiv preprint arXiv:2106.08295</comment>.</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Reck</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zeilinger</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bernstein</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Bertani</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>Experimental realization of any discrete unitary operator</article-title>. <source>Phys. Rev. Lett.</source> <volume>73</volume>, <fpage>58</fpage>&#x2013;<lpage>61</lpage>. <pub-id pub-id-type="doi">10.1103/physrevlett.73.58</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shafiee</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Banerjee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chakrabarty</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Pasricha</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nikdast</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Analysis of optical loss and crosstalk noise in mzi-based coherent photonic neural networks</article-title>,&#x201d; in <source>Journal of lightwave technology</source>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Harris</surname>
<given-names>N. C.</given-names>
</name>
<name>
<surname>Skirlo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Prabhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Baehr-Jones</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hochberg</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Deep learning with coherent nanophotonic circuits</article-title>. <source>Nat. Photonics</source> <volume>11</volume>, <fpage>441</fpage>&#x2013;<lpage>446</lpage>. <pub-id pub-id-type="doi">10.1038/nphoton.2017.93</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shokraneh</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Nezami</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Liboiron-Ladouceur</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Theoretical and experimental analysis of a 4 &#xd7; 4 reconfigurable MZI-based linear optical processor</article-title>. <source>J. Light. Technol.</source> <volume>38</volume>, <fpage>1258</fpage>&#x2013;<lpage>1267</lpage>. <pub-id pub-id-type="doi">10.1109/JLT.2020.2966949</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<collab>Texas Instruments</collab> (<year>2013</year>). <source>DAC081S101 8-bit micro power digital-to-analog converter with rail-to-rail output. (Rev. C)</source>.</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<collab>Texas Instruments</collab> (<year>2018</year>). <source>LMV7235 and LMV7239 75-ns, ultra low power, low voltage, rail-to-rail input comparator with open-drain and push-pull output. (Rev. O)</source>.</citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Thoziyoor</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ahn</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Monchiero</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brockman</surname>
<given-names>J. B.</given-names>
</name>
<name>
<surname>Jouppi</surname>
<given-names>N. P.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>A comprehensive memory modeling tool and its application to the design and analysis of future memory hierarchies</article-title>,&#x201d; in <source>2008 international symposium on computer architecture</source>, <fpage>51</fpage>&#x2013;<lpage>62</lpage>. <pub-id pub-id-type="doi">10.1109/ISCA.2008.16</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Williamson</surname>
<given-names>I. A. D.</given-names>
</name>
<name>
<surname>Hughes</surname>
<given-names>T. W.</given-names>
</name>
<name>
<surname>Minkov</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bartlett</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Pai</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Reprogrammable electro-optic nonlinear activation functions for optical neural networks</article-title>. <source>IEEE J. Sel. Top. Quantum Electron.</source> <volume>26</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1109/JSTQE.2019.2930455</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>A. K.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Reducing smart phone environmental footprints with in-memory processing</article-title>,&#x201d; in <source>International conference on hardware/software codesign and system synthesis (CODES&#x2b;ISSS)</source>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Thompson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X. D.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>P. Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Efficient on-chip training of optical neural networks using genetic algorithm</article-title>. <source>ACS Photonics</source> <volume>8</volume>, <fpage>1662</fpage>&#x2013;<lpage>1672</lpage>. <pub-id pub-id-type="doi">10.1021/acsphotonics.1c00035</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ying</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Hardware-software co-design of slimmed optical neural networks</article-title>,&#x201d; in <source>Proceedings of the 24th asia and south pacific design automation conference</source> (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>705</fpage>&#x2013;<lpage>710</lpage>. <comment>ASPDAC &#x2019;19</comment>. <pub-id pub-id-type="doi">10.1145/3287624.3287720</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>