<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1633623</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2025.1633623</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Bioinformatics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Using reinforcement learning in genome assembly: in-depth analysis of a Q-learning assembler</article-title>
<alt-title alt-title-type="left-running-head">Padovani et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2025.1633623">10.3389/fbinf.2025.1633623</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Padovani</surname>
<given-names>Kleber</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1793827/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Borges</surname>
<given-names>Rafael Cabral</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1315682/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Xavier</surname>
<given-names>Roberto</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3164343/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Carvalho</surname>
<given-names>Andr&#xe9; Carlos</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/476414/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Reali</surname>
<given-names>Anna</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3108562/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chateau</surname>
<given-names>Annie</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Alves</surname>
<given-names>Ronnie</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/471527/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Center for Higher Studies of Itacoatiara</institution>, <institution>University of the State of Amazonas</institution>, <addr-line>Itacoatiara</addr-line>, <addr-line>Amazonas</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Data Science</institution>, <institution>Vale Institute of Technology</institution>, <addr-line>Bel&#xe9;m</addr-line>, <addr-line>Par&#xe1;</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Institute of Mathematics and Computer Sciences</institution>, <institution>University of S&#xe3;o Paulo</institution>, <addr-line>S&#xe3;o Carlos</addr-line>, <addr-line>S&#xe3;o Paulo</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Polytechnic School</institution>, <institution>University of S&#xe3;o Paulo</institution>, <addr-line>S&#xe3;o Paulo</addr-line>, <country>Brazil</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Laboratory of Computer Science, Robotics and Microelectronics of Montpellier</institution>, <institution>University of Montpellier</institution>, <addr-line>Montpellier</addr-line>, <country>France</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Institute of Natural Sciences</institution>, <institution>Federal University of Par&#xe1;</institution>, <addr-line>Bel&#xe9;m</addr-line>, <addr-line>Par&#xe1;</addr-line>, <country>Brazil</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/106537/overview">David W. Ussery</ext-link>, Oklahoma State University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1915290/overview">Bo-Wei Zhao</ext-link>, Zhejiang University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/602676/overview">Stephen S. -T. Yau</ext-link>, Tsinghua University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3097452/overview">Amit Ruhela</ext-link>, The University of Texas at Austin, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Rafael Cabral Borges, <email>rcabralsan@gmail.com</email>; Kleber Padovani, <email>kleber.padovani@gmail.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>08</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>5</volume>
<elocation-id>1633623</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>07</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Padovani, Borges, Xavier, Carvalho, Reali, Chateau and Alves.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Padovani, Borges, Xavier, Carvalho, Reali, Chateau and Alves</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Genome assembly remains an unsolved problem, and de novo strategies (i.e., those run without a reference) are relevant but computationally complex tasks in genomics. Although de novo assemblers have been previously successfully applied in genomic projects, there is still no &#x201c;best assembler&#x201d;, and the choice and setup of assemblers still rely on bioinformatics experts. Thus, as with other computationally complex problems, machine learning has emerged as an alternative (or complementary) way to develop accurate, fast and autonomous assemblers. Reinforcement learning has proven promising for solving complex activities without supervision, such as games, and there is a pressing need to understand the limits of this approach to &#x201c;real-life&#x201d; problems, such as the DNA fragment assembly problem. In this study, we analyze the boundaries of applying machine learning via reinforcement learning (RL) for genome assembly. We expand upon the previous approach found in the literature to solve this problem by carefully exploring the learning aspects of the proposed intelligent agent, which uses the Q-learning algorithm. We improved the reward system and optimized the exploration of the state space based on pruning and in collaboration with evolutionary computing (&#x3e;300% improvement). We tested the new approaches on 23 environments. Our results suggest the unsatisfactory performance of the approaches, both in terms of assembly quality and execution time, providing strong evidence for the poor scalability of the studied reinforcement learning approaches to the genome assembly problem. Finally, we discuss the existing proposal, complemented by attempts at improvement that also proved insufficient. In doing so, we contribute to the scientific community by offering a clear mapping of the limitations and challenges that should be taken into account in future attempts to apply reinforcement learning to genome assembly.</p>
</abstract>
<kwd-group>
<kwd>reinforcement learning</kwd>
<kwd>genome assembly</kwd>
<kwd>machine learning</kwd>
<kwd>artificial intelligence</kwd>
<kwd>bioinformatics</kwd>
<kwd>q-learning</kwd>
</kwd-group>
<contract-sponsor id="cn001">Coordena&#xe7;&#xe3;o de Aperfei&#xe7;oamento de Pessoal de N&#xed;vel Superior<named-content content-type="fundref-id">10.13039/501100002322</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Genomic Analysis</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The genome of an organism is the sequence of all nucleotides from its DNA molecules. An isolated nucleotide carries no relevant biological information; within the organism&#x2019;s genome, however, there are genes that define species traits and behaviors (e.g., eye color) (<xref ref-type="bibr" rid="B33">Portin and Wilkins, 2017</xref>). A single DNA fragment cannot represent the complete information from a gene, and genome assembly is the computational task used to order the sequenced DNA fragments (i.e., <italic>reads</italic>) and reconstruct the original DNA sequence (<xref ref-type="bibr" rid="B20">Heather and Chain, 2016</xref>). The size and number of <italic>reads</italic> directly influence the complexity of the assembly process, and overcoming this bottleneck with a fast, automated and reliable solution has become an important problem in bioinformatics.</p>
<p>Genome assemblers adopt comparative and/or <italic>de novo</italic> strategies. A comparative strategy requires a reference genome (<xref ref-type="bibr" rid="B22">Ji et al., 2017</xref>). <italic>De novo</italic> strategies (i.e., those that do not need a reference) are particularly important given that only a small number of reference genomes are currently available (<xref ref-type="bibr" rid="B48">Wong et al., 2020</xref>). However, this approach is a highly complex combinatorial problem that falls into the theoretically intractable class of computational problems (NP-hard) (<xref ref-type="bibr" rid="B27">Medvedev et al., 2007</xref>). <italic>De novo</italic> assemblers (commonly based on heuristics and graphs) can produce acceptable solutions but require specific bioinformatics knowledge for configuration and parameter setting, and optimal results are not guaranteed (<xref ref-type="bibr" rid="B17">Gurevich et al., 2013</xref>).</p>
<p>Machine learning has emerged as a powerful alternative for addressing computationally complex problems (<xref ref-type="bibr" rid="B21">Jamialahmadi et al., 2024</xref>), where finding exact solutions is often computationally infeasible. Instead of exhaustively exploring all possibilities, machine learning models can learn patterns from data to provide approximate yet acceptable solutions within reasonable time constraints (<xref ref-type="bibr" rid="B41">Souza et al., 2018</xref>). This approach has gained particular relevance in genomics and related fields, where complex computational problems, such as drug repositioning (<xref ref-type="bibr" rid="B54">Zhao et al., 2025</xref>; <xref ref-type="bibr" rid="B53">Zhao et al., 2022</xref>), gene prediction (<xref ref-type="bibr" rid="B38">Silva et al., 2021</xref>), the design of antimicrobial peptides (<xref ref-type="bibr" rid="B46">Wang et al., 2025a</xref>), and the assembly of 3D molecular structures (<xref ref-type="bibr" rid="B47">Wang et al., 2025b</xref>), pose significant computational challenges.</p>
<p>The genome assembly problem is NP-hard because the Shortest Common Superstring Problem, which is NP-hard, can be polynomially reduced to the genome assembly problem (<xref ref-type="bibr" rid="B12">Fernandez et al., 2024</xref>). This means that solving the genome assembly optimally is at least as hard as solving the Shortest Common Superstring Problem. The complexity arises from the need to find an ordering of reads that minimizes the total assembled sequence length, which involves searching through an exponentially large combinatorial space, making exact solutions computationally infeasible for real-world genome sizes (<xref ref-type="bibr" rid="B41">Souza et al., 2018</xref>).</p>
<p>Genome assembly is currently not a fully solved problem. However, few approaches have applied machine learning to achieve better solutions for the assembly problem (<xref ref-type="bibr" rid="B41">Souza et al., 2018</xref>; <xref ref-type="bibr" rid="B51">Yassine and Riffi, 2023</xref>). With the current availability of increased processing and storage power, machine learning applications have grown significantly, and notable results have been reported (<xref ref-type="bibr" rid="B25">LeCun, 2019</xref>). This increase also enabled the resurgence of reinforcement learning applications to address these problems (<xref ref-type="bibr" rid="B5">Botvinick et al., 2019</xref>).</p>
<p>Reinforcement learning (RL) is a basic machine learning paradigm in which intelligent agents take action in a task environment. Ideally, this agent solves the task when it is able to learn how to maximize the rewards from its actions (<xref ref-type="bibr" rid="B42">Sutton and Barto, 2018</xref>). Despite RL&#x2019;s impressive achievements in games, especially those leveraging deep reinforcement learning, translating this success to real-world problems remains a significant challenge (<xref ref-type="bibr" rid="B11">Dulac-Arnold et al., 2019</xref>; <xref ref-type="bibr" rid="B10">Crespo and Wichert, 2020</xref>; <xref ref-type="bibr" rid="B29">Osborne et al., 2022</xref>). The limited adoption of RL in real-world applications is also evident in the specific case of genome assembly, with only a few studies identified (<xref ref-type="bibr" rid="B41">Souza et al., 2018</xref>; <xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>; <xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>; <xref ref-type="bibr" rid="B23">Karami et al., 2023</xref>).</p>
<p>A brief literature review revealed few studies applying reinforcement learning (RL) to the fragment assembly problem, identifying only three attempts (for details, see Section 7 of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, et al., 2020</xref>)). The first approach, which serves as the foundation for the present study, applies Q-learning with a reward system based on overlap scores, where each action adds an assembled read, yielding optimal results on small datasets (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>). The second uses distributed collaborative agents to address convergence and state space issues, improving the execution time but still facing scalability limitations (<xref ref-type="bibr" rid="B4">Bocicor et al., 2011b</xref>). The third evaluated the scalability of the first approach (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>) and, despite persistent complexity challenges, achieved remarkable results on small- to medium-sized datasets (<xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>).</p>
<p>In this context, investigating both the limitations and the potential of this approach is particularly relevant, as it remains innovative despite its conceptual simplicity. This study aims to critically analyze one of the few attempts to apply RL to fragment assembly&#x2014;hereafter referred to as the seminal approach&#x2014;evaluating its scalability, performance, and generalizability through new experiments involving greater complexity and larger data volumes. Furthermore, it proposes enhancements to the original strategy, aiming to overcome identified limitations and further explore the practical potential of reinforcement learning in this domain.</p>
<p>This investigation is motivated both by the originality of this research direction and by the need to understand the extent to which RL can offer practical advantages over established methods when applied to the assembly problem, thus contributing to the expansion of scientific knowledge and guiding future research at the intersection of bioinformatics and artificial intelligence (<xref ref-type="bibr" rid="B21">Jamialahmadi et al., 2024</xref>).</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<p>The seminal approach (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>) proposes an episodic trained agent (whose training has been divided into episodes) applying the Q-learning reinforcement learning algorithm, which allows the agent to learn through the consequences (positive or negative rewards) received after taking action. The ability to obtain intelligent and trained agents via RL using the seminal approach is important because it could eliminate the need for human specialists.</p>
<p>The Q-learning algorithm requires a Markov decision process definition with established parameters of states and actions, together with a reward system to be achieved by the agent at each action in every state (<xref ref-type="bibr" rid="B42">Sutton and Barto, 2018</xref>). The problem was then modeled through a state space capable of representing all possible read arrangements with repetition, with one action for each read in each state (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>). Following these definitions, from graph theory, the proposed state space for n <italic>reads</italic> can be visualized as a complete <italic>n</italic>-ary tree, with a height equal to <italic>n</italic>, as the set of states presents one initial state and forms a connected and acyclic graph (<xref ref-type="bibr" rid="B9">Cormen et al., 2009</xref>). The number of existing states in the state space is represented by <xref ref-type="disp-formula" rid="e1">Equation 1</xref>.<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The reward system depends on the type of state reached after each action (absorbing or nonabsorbing). An absorbing state is one that, once entered, cannot be left; it has no outgoing transitions to other states (<xref ref-type="bibr" rid="B42">Sutton and Barto, 2018</xref>; <xref ref-type="bibr" rid="B15">Grinstead and Snell, 2012</xref>). Each state requiring <italic>n</italic> actions to be reached (with n being the number of <italic>reads</italic>) is an absorbing state. A small and constant reward (i.e., 0.1) is assigned for each action. Finally, actions leading to other absorbing states produce a reward corresponding to the sum of overlaps between all pairs of consecutive <italic>reads</italic> used to reach these states. <xref ref-type="fig" rid="F1">Figure 1</xref> presents a state space example for a set of 2 <italic>reads</italic> (A and B) with a single initial state, two actions associated with nonabsorbing states and four absorbing states (highlighted black circles), achieved after taking two actions.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Example of a state space for a set of two reads, referred to here as A and B.</p>
</caption>
<graphic xlink:href="fbinf-05-1633623-g001.tif">
<alt-text content-type="machine-generated"> Complete binary tree showing an initial state node branching into two nodes, with edges labeled A and B. These edges correspond to the incorporation of read A or read B into the assembly. Each of the two nodes also branches into two other nodes, with edges also labeled A and B, representing that another read, A or B, has been incorporated as the second read of the assembly. Leaf nodes are black, and those corresponding to assemblies without read repetition contain an X.</alt-text>
</graphic>
</fig>
<p>Two absorbing states are highlighted (X). These are the final states, as they are reached directly without repeated actions.</p>
<p>The Smith&#x2013;Waterman algorithm (SW) was applied to obtain the overlaps between pairs of <italic>reads</italic>, which were then summed to obtain the rewards of actions leading to the final states (<xref ref-type="bibr" rid="B40">Smith and Waterman, 1981</xref>). The sum of overlaps when reaching a final state <italic>s</italic> (Performance Measure - PM) is described in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, where <italic>reads</italic> correspond to the sequence of <italic>reads</italic> associated with the actions for achieving <italic>s</italic>. In an optimal solution, repeated <italic>reads</italic> overlap completely, and pairs reach the maximum PM.<disp-formula id="e2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Based on these definitions, the seminal approach produced positive results against two sets of 4 and 10 simulated <italic>reads</italic> less than 10 <italic>base pairs (bp)</italic> and 8 <italic>bp,</italic> respectively. A scalability analysis was applied to evaluate the performance of this approach against 18 datasets produced following the same simulation methods (<xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>). The initial set is one of the sets featured in the seminal approach, containing 10 <italic>reads</italic> with 8 <italic>bp</italic> extracted from a 25 <italic>bp</italic> microgenome. Seventeen new datasets were generated from this microgenome and from a novel 50 <italic>bp</italic> microgenome (8 from the minor microgenome and 9 from the major microgenome), each containing 10, 20 or 30 <italic>reads</italic>, with 8 <italic>bp</italic>, 10 <italic>bp</italic> or 15 <italic>bp</italic>.</p>
<p>All the previous definitions were replicated, but <italic>&#x3b1;</italic> and <italic>&#x3b3;</italic> were set to 0.8 and 0.9, respectively. The former controls how much newly learned values (accumulated rewards) influence the update of the Q-table&#x2014;the closer to 1, the greater the influence&#x2014;and the latter controls how much the agent values estimated future rewards compared with immediate ones&#x2014;the closer to 1, the greater the influence (i.e., the less impulsive the agent becomes). With the chosen values, the agent theoretically learns quickly from new experiences (new sequences of reads) while still valuing potential future rewards, which is suitable for scenarios with sparse rewards and high payoffs concentrated at the end of episodes.</p>
<p>The space of actions was reduced so that actions associated with previously taken <italic>reads</italic> were removed from the available actions (<xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>). In the state space depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>, the leftmost and rightmost leaves (i.e., absorbing states) are removed after this change. The number of states decreases because the tree has n nodes at height 1, n (n&#x2212;1) nodes at height 2, n (n&#x2212;1) (n&#x2212;2) nodes at height 3, and so on. Assuming that i corresponds to the height and ranges from 0 to n, we can represent these quantities using the summation given in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>. Although the number of states is reduced, the size of the state space still grows exponentially.<disp-formula id="e3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>!</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>!</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>The scalability analysis confirmed the positive results of the seminal approach with the first dataset; however, the performance decreased with increasing size, reaching the target microgenome in only 2 out of the 17 major datasets. This may be related to the high cost required by the agent to explore a vast state space and to failures in the reward system (<xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>). Thus, to investigate the application of reinforcement learning to genome assembly and address the current challenge of applying RL to real-world problems (<xref ref-type="bibr" rid="B11">Dulac-Arnold et al., 2019</xref>), in this study, we analyzed the limits of RL applied to the genome assembly problem, a key problem for scientific development. We corrected previously described issues, explored the performance of an improved reward system and added complementary strategies to be incorporated into the seminal approach to obtain improved and automated genome assemblies through machine learning applications.</p>
<p>In this study, 7 experiments were evaluated against the seminal approach. The experiments were implemented in Python 3.8 using the NumPy package (<xref ref-type="bibr" rid="B19">Harris et al., 2020</xref>). The main goal was to obtain an RL-trained agent able to correctly identify the order of <italic>reads</italic> from a sequenced genome. <xref ref-type="fig" rid="F2">Figure 2</xref> illustrates this proposal, where the environment represents the set of <italic>reads</italic> to assemble. The agent interacts with the environment by taking actions intended to order the <italic>reads</italic>. For each action taken, the environment is updated and provides a corresponding reward to the agent. The agent learns from the reward received and takes new actions until reaching (ideally) the correct order of <italic>reads</italic>. The approaches produced here consider the scalability analysis (<xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>), with improvements made to the reward system&#x2014;especially in Approaches 1&#x2014;and to the agent&#x2019;s exploration&#x2014;Approaches 2 and 3.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Illustration of the application of reinforcement learning to the genome assembly problem. <bold>(a)</bold> The set of <italic>reads</italic> is represented computationally by a reinforcement learning environment. <bold>(b)</bold> Successive interactions with the environment caused by taking action. <bold>(c)</bold> The agent ideally learns the correct order of <italic>reads,</italic> i.e., reaching the target genome.</p>
</caption>
<graphic xlink:href="fbinf-05-1633623-g002.tif">
<alt-text content-type="machine-generated">Diagram depicting a reinforcement learning process. On the left, colorful bars labeled &#x22;READS (randomly obtained)&#x22; represent initial data. An arrow leads to a globe symbolizing the environment, interacting with an &#x22;Agent&#x22; represented as a brain icon. &#x22;REWARD, STATE&#x22; flows back to the agent, which then takes an &#x22;ACTION&#x22; back to the environment. On the right, orderly colorful bars labeled &#x22;READS (ordered by the agent)&#x22; depict processed data.</alt-text>
</graphic>
</fig>
<sec id="s2-1">
<title>2.1 Approaches 1: tackling sparse rewards</title>
<p>Approaches 1.1, 1.2, 1.3, and 1.4 aimed at improving the reward system, as given by <xref ref-type="disp-formula" rid="e4">Equation 4</xref>. Optimally, the agent achieves the correct order of <italic>reads</italic> upon learning the set of actions, specifically a permutation of <italic>reads</italic> that maximizes the accumulated reward. Thus, the optimal actions (those leading to the anticipated permutation of reads) must always yield the highest cumulative reward. Nevertheless, this proposition may not hold consistently for the reward system proposed in the seminal approach, as it allows some nonoptimal actions to result in maximum accumulated rewards.<disp-formula id="e4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0.1</mml:mn>
<mml:mtext>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>This inconsistency (details in Section 6 of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, 2020</xref>)) stems from the sequence alignment with the Smith&#x2012;Waterman algorithm (SW), which calculates a score to represent major alignment size (even if partial) but has no constraint on the order between sequences. Thus, the overlap score from the SW might induce the agent to find <italic>read</italic> permutations with high overlap values in pairs of <italic>reads</italic> without any suffix-prefix alignment. Therefore, using the PM score as a reward for training may be ineffective for some datasets.</p>
<p>Thus, to improve the agent&#x2019;s performance, we adjust the reward system through four approaches to explore two aspects: (a) the use of an overlap score that considers the relative order of <italic>reads</italic> and/or (b) the use of dense rewards. These new reward systems are presented in approaches 1.1, 1.2, 1.3 and 1.4.</p>
<p>As in the seminal approach, approach 1.1 defines that actions leading to the final states produce a bonus reward (of 1.0), added to another numerical overlap score between all subsequent <italic>reads</italic> used since the initial state. Thus, a reward corresponding to the sum of the normalized overlap score (ranging from 0 to 1) of each pair of <italic>reads</italic> was produced considering their relative order. Every action leading to nonfinal states produces constant and low rewards (0.1). <xref ref-type="disp-formula" rid="e5">Equation 5</xref> formalizes the reward system for Approach 1.1, with <italic>PM</italic>
<sub>
<italic>norm</italic>
</sub> (<italic>s&#x2032;</italic>) representing the normalized overlap between the <italic>reads</italic> used to reach <italic>s&#x2032;</italic> (details in <xref ref-type="sec" rid="s2">Section 2</xref> of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, et al., 2020</xref>)).<disp-formula id="e5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi mathvariant="italic">norm</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1.0</mml:mn>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mn>0.1</mml:mn>
<mml:mtext>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>Although the overlap score in approach 1.1 considers the order of <italic>reads</italic>, this approach remains susceptible to the sparse rewards problem, as in the seminal approach. Although it often produces small, constant and positive rewards and not a zero-value reward, as applied in sparse reward systems, only few and sparse state&#x2012;action pairs produce higher rewards. In both systems (<xref ref-type="disp-formula" rid="e4">Equations 4</xref>, <xref ref-type="disp-formula" rid="e5">5</xref>), no informative reward is provided during the learning process until a final state is reached (since any <italic>read</italic> incorporated before that would produce the same reward of 0.1).</p>
<p>Thus, the agent&#x2019;s learning process depends exclusively on the sparse actions taken during the exploration of this state space, tending to take a long time because of the sparse reward problem (<xref ref-type="bibr" rid="B44">Trott et al., 2019</xref>). In approaches 1.2, 1.3 and 1.4, we focused on improving it with higher rewards distributed for each action taken in each episode (previously obtained only at the end of the episode). These approaches focus on reducing or eliminating inconsistencies that allow permutations of unaligned reads to produce maximum accumulated rewards. <xref ref-type="disp-formula" rid="e6">Equations 6</xref>&#x2013;<xref ref-type="disp-formula" rid="e8">8</xref> represent the reward systems for approaches 1.2, 1.3 and 1.4, respectively, where <italic>ol</italic>
<sub>
<italic>norm</italic>
</sub>
<italic>(s, s&#x2032;)</italic> represents the normalized overlap between two subsequent reads (i.e., the ratio between the overlap length of the two reads and the estimated maximum length of the target genome, calculated as the number of reads multiplied by the read length), and <italic>PM</italic>
<sub>
<italic>norm</italic>
</sub>(<italic>s&#x2032;</italic>) corresponds to the sum of the normalized overlaps of all the reads involved.<disp-formula id="e6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mi mathvariant="italic">norm</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1.0</mml:mn>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1.0</mml:mn>
<mml:mtext>&#x2003;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>s</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mtext>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;</mml:mtext>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-2">
<title>2.2 Approach 2: pruning-based elimination action</title>
<p>To reduce the state space from the seminal approach, a heuristic procedure was applied to eliminate fully explored actions where the maximum cumulative reward achieved was smaller than the cumulative reward obtained by taking another available action. In <xref ref-type="fig" rid="F3">Figure 3</xref>, looking at the changed state space as a tree (after removing actions associated with already used reads), we see 16 states, 6 of which are absorbing states, i.e., the final states (tree base). Note that 3 out of the 6 final states are highlighted in black, whereas the remaining states are highlighted in gray and white. The black states correspond to the explored final states (i.e., visited by the agent). Gray states, such as those reached by taking action <italic>a</italic> in the initial state, represent states where all children were fully visited during the learning process. White states (final or not) are those not yet explored and/or that have unexplored children, e.g., the initial state, where one child is not explored and another is partially explored.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Illustration of the pruning procedure for a state space of 16 states referring to the assembly of 3 reads, referred to as a, b and c. Final states are represented by the leaf nodes of the tree. The black states correspond to final states that have already been visited. The gray states represent nonfinal states whose children have all been visited. White states (whether final or not) are nodes that have not yet been visited. The generic pruning procedure is defined in detail in Algorithm 1.</p>
</caption>
<graphic xlink:href="fbinf-05-1633623-g003.tif">
<alt-text content-type="machine-generated">Rooted tree illustrating the pruning process. Each node shows the maximum accumulated reward obtainable from it up to the leaf nodes. The root has a value of 8, with branches labeled a, b, and c, representing incorporated reads. The left-most node via a has value 6; the center node via b shows a question mark, meaning the reward is unknown; the right-most node via c has value 8. Pruning occurs at the node with value 6, marked by scissors, because its sibling via c has a higher value (8), making further exploration from node 6 unnecessary.</alt-text>
</graphic>
</fig>
<p>When an unexplored final state is reached, such as the rightmost final state in <xref ref-type="fig" rid="F3">Figure 3</xref>, the accumulated rewards are maintained and propagated for its predecessors, maintaining only the highest value propagated for the children. Each reward is represented by integer numbers within the states in the figure. In each nonfinal state, the highest accumulated reward achieved during the training process is stored. Thus, it is possible to prune irrelevant actions that do not produce the maximum accumulated reward (e.g., action a of the initial state in <xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<p>Note that all possible achievable states after taking this action were explored and the maximum cumulative reward was 6, whereas action <italic>c</italic> of the initial state alone leads to a cumulative reward equal to 8. When the agent first goes through the sequence of states corresponding to actions <italic>c</italic>, <italic>a</italic> and <italic>b</italic>, the pruning mechanism propagates the maximum reward value up to the initial state and, at that moment, cuts action <italic>a</italic> from the initial state. Algorithm 1 presents the pseudocode of the procedure for updating the pruning process when the last explored final state (<italic>state</italic>) is reached, obtaining the corresponding accumulated reward achieved (<italic>newReward</italic>) (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Description of the code used for building the pruning algorithm.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Algorithm 1 Pruning&#x2019;s algorithm</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">1:&#x2003;procedure Prune (state: treeNode, newReward: float)<break/>2:&#x2003;&#x2003;if <italic>state</italic> &#x2260; <italic>null</italic> and (<italic>state.unseen</italic> or <italic>newReward</italic> &#x3e; <italic>state.maxReward</italic>) then<break/>3:&#x2003;&#x2003;&#x2003;<italic>state.unseen</italic> &#x2190; <italic>false</italic>
<break/>4:&#x2003;&#x2003;&#x2003;<italic>state.maxReward</italic> &#x2190; <italic>newReward</italic>
<break/>5:&#x2003;&#x2003;&#x2003;if not <italic>state.final</italic> then &#x25b7; prune children where <italic>maxReward</italic> &#x3c; <italic>newReward</italic>
<break/>6:&#x2003;&#x2003;&#x2003;&#x2003;PruneUselessChildren (<italic>state</italic>)<break/>7:&#x2003;&#x2003;&#x2003;end if<break/>8:&#x2003;&#x2003;&#x2003;Prune (<italic>state.parent</italic>, <italic>newReward</italic>)<break/>9:&#x2003;&#x2003;end if<break/>10:&#x2003;end procedure</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-3">
<title>2.3 Approaches 3: evolutionary-based exploration</title>
<p>In these approaches, we explore the potential for mutual collaboration between reinforcement learning and evolutionary computing by applying the elitist selection of the genetic algorithm (<xref ref-type="bibr" rid="B1">Baluja and Caruana, 1995</xref>; <xref ref-type="bibr" rid="B24">Konar, 2005</xref>) to optimize the exploration of the state space. The individual contributions of the genetic algorithm used in this hybrid proposal were divided into two approaches: 3.1 and 3.2.</p>
</sec>
<sec id="s2-4">
<title>2.4 Approach 3.1: evolutionary-aided reinforcement learning assembly</title>
<p>Applying the &#x3f5;-greedy to expand the exploration of agents trained by the Q-learning algorithm allows broader initial exploration, achieving the optimal policy once the state space has been sufficiently explored (<xref ref-type="bibr" rid="B5">Botvinick et al., 2019</xref>). However, the existing trade-off between exploitation and exploration remains a major problem for RL in high-dimensional environments (<xref ref-type="bibr" rid="B14">Gimelfarb et al., 2020</xref>; <xref ref-type="bibr" rid="B31">Peterson and Verstynen, 2019</xref>). Here, for the first time, we introduce the interaction between RL and evolutionary computing into the exploration process based on the operation of the Q-learning algorithm. In each episode, the sequence of actions is stored, and at the end of the episode, the sequence is transformed into a chromosome of an initial population that evolves (see <xref ref-type="fig" rid="F4">Figure 4</xref>).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Illustration of the proposed interaction between reinforcement learning (RL) and the genetic algorithm. At each RL episode, the actions taken by the agent are converted into the chromosome (each action as a gene) of an individual of the initial population of the genetic algorithm, whose size n is predefined. After n episodes (n individuals in the initial population), this population evolves for a predefined number of generations through the genetic algorithm. Then, the most adapted individual of the last generation is obtained. In the end, that individual&#x2019;s chromosomal genes are used as actions in the next RL episode.</p>
</caption>
<graphic xlink:href="fbinf-05-1633623-g004.tif">
<alt-text content-type="machine-generated">Flowchart showing episodes one to six. Arrows labeled &#x2018;List of actions&#x2019; lead from episodes one to four to chromosomes one to four. An arrow points from the chromosomes to &#x2018;GA,&#x2019; indicating that the genetic algorithm will be run on these chromosomes. Another arrow, labeled &#x2018;Injection of most fitted individual,&#x2019; points from &#x2018;GA&#x2019; to episode five, meaning that the most fitted chromosome from &#x2018;GA&#x2019; will be injected into episode five after all algorithm evolutions. This process repeats continuously.</alt-text>
</graphic>
</fig>
<p>New chromosomes are inserted until the number of chromosomes reaches the predefined population size. At this point, agent training is interrupted, and m genetic generations are carried out&#x2014;with m being predefined (see <xref ref-type="sec" rid="s4">Section 4</xref> of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, et al., 2020</xref>)) and applying the normalized sum of overlaps between reads as the adaptive function&#x2014;the same as that applied in <xref ref-type="disp-formula" rid="e8">Equation 8</xref> and detailed in <xref ref-type="sec" rid="s2">Section 2</xref> of the <xref ref-type="sec" rid="s12">Supplementary Material</xref>.</p>
<p>After m generations (objective function), the most fit individual is used for conducting the next episode in the agent&#x2019;s RL training, hitherto interrupted. As each gene of the individual&#x2019;s chromosome corresponds to one possible action, the complete gene sequence will contain distinct successive actions to be taken by the agent in the current episode, producing a mutual collaboration between RL and the genetic algorithm&#x2014;the initial populations of the genetic algorithm are produced by RL and, as a counterpart, the results from the evolution of the genetic algorithm are introduced in an RL episode (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Flowchart representing all the approaches performed in this study. The gray elements represent steps performed by approaches 1.1, 1.2, 1.3, and 1.4 (reward systems analysis). The double-edged dashed element represents steps performed by approach 2 (state space pruning). Single-bordered dashed elements represent steps performed by Approach 3.1 (hybrid approach applying the genetic algorithm - GA).</p>
</caption>
<graphic xlink:href="fbinf-05-1633623-g005.tif">
<alt-text content-type="machine-generated">Flowchart illustrating the approaches in this paper. Gray elements represent steps of approaches 1.1 to 1.4. Double-edged dashed elements represent approach 2 (state space pruning). Single-bordered dashed elements represent Approach 3.1 (using GA). The process starts by setting episode one, resetting Q-values, and managing the initial GA population. It checks if the maximum number of episodes has been reached; if so, it ends. Otherwise, a new episode begins. Actions from previous Q-learning episodes are stored as chromosomes. When the population is full, GA runs if active, injecting the fittest chromosome for Q-learning. Pruning runs after each episode for approach 2.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-5">
<title>2.5 Approach 3.2: evolutionary-based assembly</title>
<p>To estimate the genetic algorithm contribution in Approach 3.1, its assembling performance was evaluated separately, following the same configurations set for the previous approach, but adopting as a starting population a set of individuals whose chromosomes were built from random permutations without repetition of reads.</p>
</sec>
<sec id="s2-6">
<title>2.6 Datasets and analysis</title>
<p>To assess the performance of all the approaches (including the seminal approach), in addition to the 18 datasets from <xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>, 5 novel datasets derived from microgenomes extracted in previous studies (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>; <xref ref-type="bibr" rid="B49">Xavier et al., 2020</xref>) were created. These are not arbitrary genome fragments, as was the case for previously used microgenomes (which had 25 bp and 50 bp), but represent larger fragments of previously annotated genes from the corresponding organism (i.e., <italic>E. coli</italic>). Given that the datasets are simulated data, no cycles in the genome were considered, which is a limitation of the approach.</p>
<p>The experiments were carried out with 23 datasets, detailed in <xref ref-type="table" rid="T2">Table 2</xref>&#x2014;the last 5 lines are gene derived.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Datasets used in the experiments. The first column shows the size (in bp) of the microgenome used to generate the reads of each set; the second column shows the number of reads generated; the third column shows the size of the generated reads; and the fourth column shows the name of the environment built for each set in the OpenAI Gym toolkit.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">&#x3bc;gen. Size</th>
<th align="center">&#x23; reads</th>
<th align="center">read<break/>Size</th>
<th align="center">Gym environment name</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">25</td>
<td align="center">10</td>
<td align="center">8</td>
<td align="center">GymnomeAssembly_25_10_8-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">10</td>
<td align="center">10</td>
<td align="center">GymnomeAssembly_25_10_10-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">10</td>
<td align="center">15</td>
<td align="center">GymnomeAssembly_25_10_15-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">10</td>
<td align="center">8</td>
<td align="center">GymnomeAssembly_50_10_8-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">10</td>
<td align="center">10</td>
<td align="center">GymnomeAssembly_50_10_10-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">10</td>
<td align="center">15</td>
<td align="center">GymnomeAssembly_50_10_15-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">20</td>
<td align="center">8</td>
<td align="center">GymnomeAssembly_25_20_8-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">20</td>
<td align="center">10</td>
<td align="center">GymnomeAssembly_25_20_10-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">20</td>
<td align="center">15</td>
<td align="center">GymnomeAssembly_25_20_15-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">20</td>
<td align="center">8</td>
<td align="center">GymnomeAssembly_50_20_8-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">20</td>
<td align="center">10</td>
<td align="center">GymnomeAssembly_50_20_10-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">20</td>
<td align="center">15</td>
<td align="center">GymnomeAssembly_50_20_15-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">30</td>
<td align="center">8</td>
<td align="center">GymnomeAssembly_25_30_8-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">30</td>
<td align="center">10</td>
<td align="center">GymnomeAssembly_25_30_10-v2</td>
</tr>
<tr>
<td align="center">25</td>
<td align="center">30</td>
<td align="center">15</td>
<td align="center">GymnomeAssembly_25_30_15-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">30</td>
<td align="center">8</td>
<td align="center">GymnomeAssembly_50_30_8-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">30</td>
<td align="center">10</td>
<td align="center">GymnomeAssembly_50_30_10-v2</td>
</tr>
<tr>
<td align="center">50</td>
<td align="center">30</td>
<td align="center">15</td>
<td align="center">GymnomeAssembly_50_30_15-v2</td>
</tr>
<tr>
<td align="center">381</td>
<td align="center">20</td>
<td align="center">75</td>
<td align="center">GymnomeAssembly_381_20_75-v2</td>
</tr>
<tr>
<td align="center">567</td>
<td align="center">30</td>
<td align="center">75</td>
<td align="center">GymnomeAssembly_567_30_75-v2</td>
</tr>
<tr>
<td align="center">726</td>
<td align="center">40</td>
<td align="center">75</td>
<td align="center">GymnomeAssembly_728_40_75-v2</td>
</tr>
<tr>
<td align="center">930</td>
<td align="center">50</td>
<td align="center">75</td>
<td align="center">GymnomeAssembly_930_50_75-v2</td>
</tr>
<tr>
<td align="center">4,224</td>
<td align="center">230</td>
<td align="center">75</td>
<td align="center">GymnomeAssembly_4224_230_75-v2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>An environment for each dataset was created in the OpenAI Gym toolkit (<xref ref-type="bibr" rid="B7">Brockman et al., 2016</xref>) to share such RL challenges. These environments are available online (see <xref ref-type="sec" rid="s1">Section 1</xref> of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, et al., 2020</xref>)), where the reward system proposed in Approach 1.4 is used. The identification names of each environment are presented in the last column of <xref ref-type="table" rid="T2">Table 2</xref>. The seminal reward system is also implemented and available&#x2014;version 1, which replaces v2 with v1 in the environment name field.</p>
<p>Two experiments were carried out to evaluate the approaches. In each experiment, 20 successive runs of each evaluated approach were performed for all 23 existing datasets (460 runs per approach). Given that each approach has different levels of complexity, the real execution time for each approach was considered for comparison. To reduce the interference of external factors in execution time, all experiments were individually and sequentially performed at the same station (with Ubuntu 16.04 in an AWS EC2 instance of the r5a.large type, dual core, 16 GB of RAM and 30 GB of storage).</p>
<p>In the first experiment (Experiment A), the objective was to verify the impact of progressively including new strategies. For this purpose, the performance of the seminal approach was evaluated (according to (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>)) against approaches 1.1, 1.2, 1.3, 1.4 (improved reward system), 2 (pruning dynamic) and 3.1 (genetic algorithm - GA). In the second experiment (Experiment B), the objective was to compare the performance of the new RL-based approaches against the performance of the GA alone. Therefore, in addition to Approaches 1.1, 1.2, 1.3, 1.4, 2 and 3.1, approach 3.2 (which explores the GA alone) was performed in an equivalent amount of time.</p>
<p>For the performance measure in each experiment, two percentage measures were calculated: the distance-based measure (DM) and the reward-based measure (RM). Evaluations of <italic>de novo</italic> assembly are typically performed using proper metrics, such as the N50 (<xref ref-type="bibr" rid="B6">Bradnam et al., 2013</xref>). These metrics were created because, as previously indicated, <italic>de novo</italic> assemblies are not supported by a reference genome. In some scenarios, it is not possible to assess the results obtained from the assemblers accurately because the optimal output is unknown. Here, although a <italic>de novo</italic> assembler is evaluated, its assessment environment is restricted, and the target genomes are known; this scenario allows the use of specific (and exact) evaluations, such as DM and RM metrics.</p>
<p>DM considers a run successful when the consensus sequence obtained from the order of reads produced is identical to the expected sequence. RM considers a run successful when the proposed order of reads yields a sum of PMnorm greater than or equal to the sum of PMnorm from the optimal read sequence (for details, see <xref ref-type="sec" rid="s3">Section 3</xref> of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, et al., 2020</xref>)).</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>In Experiment A, the seminal approach consumed the longest running time (23 h and 34 min) and had the lowest average performance; an optimal response was obtained in 16.96% of the runs (i.e., 78 out of the 460 executions) in terms of distance from the expected genome (DM) and 21.30% (98 out of the 460) in terms of maximum reward (RM) (<xref ref-type="table" rid="T3">Table 3</xref>).</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Results of Experiment A. Comparison of the performances of trained agents with different reinforcement learning strategies. The performance of each approach is expressed using distance-based (DM) and reward-based (RM) metrics (see Methods for details).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Experiment A (approach)</th>
<th align="center">Average DM</th>
<th align="center">Average RM</th>
<th align="center">Total<break/>Runtime</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">seminal</td>
<td align="center">16.96%</td>
<td align="center">21.30%</td>
<td align="center">23 h34 m</td>
</tr>
<tr>
<td align="center">1.1</td>
<td align="center">9.57%</td>
<td align="center">13.70%</td>
<td align="center">19 h38 m</td>
</tr>
<tr>
<td align="center">1.2</td>
<td align="center">18.48%</td>
<td align="center">21.30%</td>
<td align="center">19 h38 m</td>
</tr>
<tr>
<td align="center">1.3</td>
<td align="center">20.00%</td>
<td align="center">24.35%</td>
<td align="center">19 h38 m</td>
</tr>
<tr>
<td align="center">1.4</td>
<td align="center">20.43%</td>
<td align="center">24.78%</td>
<td align="center">19 h38 m</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">20.65%</td>
<td align="center">25.00%</td>
<td align="center">18 h41 m</td>
</tr>
<tr>
<td align="center">3.1</td>
<td align="center">73.91%</td>
<td align="center">80.87%</td>
<td align="center">17 h03 m</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Following the updated reward system, the DM and RM performances in Approaches 1.2, 1.3, and 1.4 surpassed those of the previous approach and consumed approximately 4 h less (19 h and 38 min) of running time. Approach 3.1 presented the shortest running time and the highest performance, with a DM average of approximately 74% and an RM average above 80%.</p>
<p>In Experiment B, approach 3.2 presented the shortest running time, with a DM average of 87% and an RM average of 95%. Given the superior performance of Approach 3.2, Experiment B applied the time taken by the genetic algorithm as a reference to find an optimal solution in terms of the RM for 22 out of the 23 datasets used (i.e., 95.65%), which corresponded to 1 h and 34 min of running time. Given the dominance of Approach 3.2, we also verified the performance of this approach on only the dataset with no optimal response (reads with 4Kbp). In this experiment, the running time for Approach 3.2 was considerably increased, lasting approximately 38 h (against less than 2 min for the same dataset for approach 3.2 in Experiment B, <xref ref-type="table" rid="T4">Table 4</xref>). No optimal solution was obtained for this dataset; however, it is possible to observe a consistent gain in performance in terms of both DM (where longer runs had shorter distances than most distances reached by shorter runs) and RM (which had higher accumulated rewards in all longer runs).</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Results of Experiment B. Experimental performance considering similar running times (RTs). The performance was expressed using a Distance-based Measure (DM) and Reward-based Measure (RM) (see Methods).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Experiment B (approach)</th>
<th align="center">Average DM</th>
<th align="center">Average RM</th>
<th align="center">Total<break/>Runtime</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">1.4</td>
<td align="center">13.91%</td>
<td align="center">17.61%</td>
<td align="center">01 h36 m</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">12.39%</td>
<td align="center">16.30%</td>
<td align="center">01 h36 m</td>
</tr>
<tr>
<td align="center">3.1</td>
<td align="center">14.78%</td>
<td align="center">14.78%</td>
<td align="center">01 h42 m</td>
</tr>
<tr>
<td align="center">3.2</td>
<td align="center">87.83%</td>
<td align="center">95.65%</td>
<td align="center">01 h34 m</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>Genome assembly is among the most complex problems confronted by computer scientists within the context of genomics projects. When machine learning is applied to genome assembly, this complexity places the problem of finding optimal permutations of sequenced reads and reaching the target genome in the NP-hard class, which comprises some of the most difficult problems in computer science (<xref ref-type="bibr" rid="B35">Roughgarden, 2020</xref>). This high complexity is particularly expressed in the vast state space required for representing the assembly problem in RL models.</p>
<p>In the approach studied here, according to <xref ref-type="disp-formula" rid="e1">Equation 1</xref>, reaching the optimal solution for sets of 30 reads requires the RL agent to explore a state space of approximately 2 &#xd7; 10<sup>44</sup> states (<xref ref-type="bibr" rid="B3">Bocicor et al., 2011a</xref>), which is more than the number of stars in the universe. In real-world scenarios, genomes are much larger. Applying RL combined with heuristics is a strategy for addressing complex problems, aiming at mapping actions into states that tend to maximize their reward, thus decreasing the computational complexity of the problem.</p>
<p>We aimed to expand the agent learning based on two constraints from the seminal approach for applying RL to the genome assembly problem: (1) the reward system and (2) the agent&#x2019;s exploration strategies. We found that both improving the agent&#x2019;s exploration strategies and updating the reward system improved the agent&#x2019;s learning. Despite improvements, the system still occasionally produces suboptimal solutions. This is also supported by the fact that RM percentages were higher than DM percentages in some experiments.</p>
<p>The dynamic pruning mechanism showed slight improvement, but the additional processing cost and the benefit from its implementation did not indicate a reasonable net gain from its use as a bypass for the problem emerging from the high dimensionality of the state space. Some of the gains were due to the improved agent&#x2019;s performance, where the sum of rewards for the optimal permutation of reads was not maximized in the previous reward system. Despite the gains from the updated reward system, the inconsistencies were not completely resolved. In some of the datasets, the agent reached and even surpassed the maximum expected accumulated rewards without obtaining the target genome. A minor improvement is observed in approach 2, requiring approximately 1 h less processing.</p>
<p>The hybrid approach combining the RL strategy with a GA (Approaches 3) presented better performance. This combination was proven to be advantageous, probably given the curse of dimensionality encountered by the Q-learning algorithm, as strong GA support was observed for the agent while conducting the RL exploration. Despite these improvements, the approaches are not yet suitable for real-world scenarios. This is evident in the experiments performed with the largest dataset. Even the smallest genomes found in living organisms are larger than the largest dataset used in this study.</p>
<p>Nevertheless, none of the proposed approaches yielded an optimal solution for this dataset&#x2014;not even the most effective one (GA)&#x2014;when the execution time was extended. The superiority of the GA alone allows us to draw conclusions on the current infeasibility of applying the Q-learning algorithm to solve the genome assembly problem in search of an optimal read permutation, as proposed in the seminal approach.</p>
<p>Given the absence of approaches in the literature for tackling this problem through RL and considering the optimistic results obtained by RL in other areas (especially when RL is combined with deep learning) (<xref ref-type="bibr" rid="B28">Mnih et al., 2015</xref>), further investigations on the applicability of RL, including the use of different modeling approaches and algorithms, are needed.</p>
<p>One of the major challenges in applying RL to real-world problems is the low sample efficiency of the algorithms (<xref ref-type="bibr" rid="B52">Yu, 2018</xref>). Considering the time required by the agent trained by the Q-learning algorithm to reach an optimal solution, it is possible to perceive a high need for numerous interactions with the data. Considering that genome inputs are larger than those experimentally applied here, obtaining a sample efficient algorithm for the problem is at the core of developing a real-world solution. Additionally, the agent sample efficiency must be optimized to explore the state space, which might be achieved by the application of techniques to remove duplicate reads&#x2014;due to repeats&#x2014;and the use of an intrinsic motivation to bypass the exploration problem, given the high dimensionality of the proposed state space (<xref ref-type="bibr" rid="B52">Yu, 2018</xref>; <xref ref-type="bibr" rid="B2">Barto, 2012</xref>).</p>
<p>Future research should also focus on exploring and systematically comparing different reinforcement learning algorithms for the genome assembly problem (<xref ref-type="bibr" rid="B51">Yassine and Riffi, 2023</xref>). While this study focused on Q-learning, other approaches&#x2014;such as policy gradient methods, actor&#x2012;critic algorithms, and additional reinforcement learning techniques&#x2014;may offer more suitable mechanisms for capturing the sequential decision-making and structural complexity involved in the task (<xref ref-type="bibr" rid="B37">Shakya et al., 2023</xref>; <xref ref-type="bibr" rid="B16">Grondman et al., 2012</xref>). A comparative analysis of these algorithms could provide valuable insights into their effectiveness, limitations, and applicability, ultimately guiding the development of more robust and scalable solutions in this domain.</p>
<p>The use of graph embedding may act as another option for modeling approaches allowing the use of deep RL without requiring the conversion of the problem into an image&#x2014;the genome assembly problem may be represented through a graph, in the shape of the traveling salesman problem (TSP) (<xref ref-type="bibr" rid="B8">Cook, 2012</xref>; <xref ref-type="bibr" rid="B26">Li et al., 2011</xref>).</p>
<p>As highlighted throughout this study, the limitations observed in the application of Q-learning to the fragment assembly problem suggest that traditional reinforcement learning techniques may not be sufficient to handle the complexity and scalability required in real-world scenarios. Therefore, a fundamental direction for future research is the exploration of deep reinforcement learning (DRL) techniques (<xref ref-type="bibr" rid="B29">Osborne et al., 2022</xref>). DRL has the potential to address the high-dimensional state and action spaces inherent to the assembly problem, enabling more robust generalization and improved decision-making (<xref ref-type="bibr" rid="B28">Mnih et al., 2015</xref>; <xref ref-type="bibr" rid="B18">Hafner et al., 2025</xref>).</p>
<p>Another key direction for future research is the exploration of transfer learning in the context of RL-based genome assembly. Leveraging transfer learning techniques could enable the development of more practical and robust assembly models that generalize across different datasets, reducing the need for retraining from scratch for each new scenario (<xref ref-type="bibr" rid="B43">Taylor and Stone, 2009</xref>; <xref ref-type="bibr" rid="B50">Yang et al., 2020</xref>; <xref ref-type="bibr" rid="B55">Zhu et al., 2023</xref>). By allowing previously acquired knowledge to inform new learning tasks, transfer learning has the potential to significantly increase the efficiency and scalability of RL-based genome assemblers, paving the way for broader applicability and real-world use.</p>
<p>Finally, one last aspect to be considered for the adoption of RL in the genome assembly problem is the generalization of the agent&#x2019;s learning, which is a major challenge for the use of RL in real-world problems (<xref ref-type="bibr" rid="B32">Ponsen et al., 2010</xref>). As designed for the RL environment for the genome assembly problem, the learning acquired by the agent when assembling a set of reads will hardly be applied for the assembly of a new set.</p>
<p>Although the results obtained have shown that the application of Q-learning to genome assembly, as proposed in the seminal approach, does not yield satisfactory performance at larger scales, the main scientific contribution of this work lies in addressing a current gap in knowledge. To date, the only existing proposal in the literature has explored this approach using extremely small datasets without assessing its feasibility in more realistic scenarios. By conducting a broader analysis with relatively larger datasets and adaptations to the original algorithm, this study provides a critical and well-founded evaluation of the limitations of this technique. Thus, even though the results do not point to a promising solution, they advance scientific understanding of the subject by more clearly delineating the challenges and constraints involved in applying reinforcement learning methods to genome assembly.</p>
<p>All the experiments and the RL environments used in this study are publicly available and open for reuse (for details, see <xref ref-type="sec" rid="s5">Section 5</xref> of the <xref ref-type="sec" rid="s12">Supplementary Material</xref> (<xref ref-type="bibr" rid="B30">Padovani and Alves, et al., 2020</xref>)) to support future studies.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>This study provides a comprehensive evaluation of the applicability of reinforcement learning (RL), specifically the Q-learning algorithm, to the genome assembly problem. While initial results using the seminal approach confirmed its functionality on small datasets, our expanded analyses revealed critical scalability limitations. Through a series of methodological improvements, including the revised reward systems, dynamic pruning, and the incorporation of an evolutionary algorithm (Genetic Algorithm&#x2013;GA), we demonstrated incremental performance gains. However, even the most advanced hybrid strategies failed to deliver optimal results on larger, more realistic datasets. Notably, the genetic algorithm alone outperformed all RL-based strategies, highlighting the current inadequacy of Q-learning for addressing the high-dimensional state spaces inherent to genome assembly. These findings underscore the importance of exploring alternative RL algorithms, such as deep reinforcement learning and policy gradient methods, alongside strategies like transfer learning and intrinsic motivation. Despite the lack of a viable RL-based solution at present, this study contributes a valuable benchmark for future research by mapping the limitations of current approaches and emphasizing key directions for advancing machine learning applications in genome assembly.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author/s.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>KP: Conceptualization, Data curation, Formal Analysis, Investigation, Methodology, Project administration, Software, Validation, Writing &#x2013; original draft, Writing &#x2013; review and editing. RB: Writing &#x2013; review and editing. RX: Software, Validation, Writing &#x2013; original draft, Writing &#x2013; review and editing. AdC: Writing &#x2013; review and editing. AR: Supervision, Writing &#x2013; review and editing. AnC: Writing &#x2013; review and editing. RA: Conceptualization, Data curation, Formal Analysis, Investigation, Methodology, Supervision, Writing &#x2013; original draft, Writing &#x2013; review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. The study was funded by the Vale Institute of Technology, a non-profit research institution (Data lake project PEP R100603.DTL.08). This study was financed in part by the Coordena&#xe7;&#xe3;o de Aperfei&#xe7;oamento de Pessoal de N&#xed;vel Superior &#x2013; Brasil (CAPES - Finance Code 001).</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s12">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fbinf.2025.1633623/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fbinf.2025.1633623/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baluja</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Caruana</surname>
<given-names>R.</given-names>
</name>
</person-group>: <article-title>Removing the genetics from the standard genetic algorithm</article-title>. In: <source>Proceedings of ICML&#x2019;95</source>, pp. <fpage>38</fpage>&#x2013;<lpage>46</lpage>. <publisher-name>Elsevier</publisher-name>, <publisher-loc>California</publisher-loc> (<year>1995</year>). <pub-id pub-id-type="doi">10.1016/B978-1-55860-377-6.50014-1</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Barto</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Intrinsic motivation and reinforcement learning</article-title>,&#x201d; in <source>Intrinsically motivated learning in natural and artificial systems</source>, <fpage>17</fpage>&#x2013;<lpage>47</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-642-32375-1_2</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bocicor</surname>
<given-names>M.-I.</given-names>
</name>
<name>
<surname>Czibula</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Czibula</surname>
<given-names>I.-G.</given-names>
</name>
</person-group> (<year>2011a</year>). &#x201c;<article-title>A reinforcement learning approach for solving the fragment assembly problem</article-title>,&#x201d; in <conf-name>2011 13th International Symposium on Symbolic and Numeric Algorithms for Scientific Computing</conf-name> (<publisher-loc>Timisoara, Romania</publisher-loc>: <publisher-name>IEEE</publisher-name>). <pub-id pub-id-type="doi">10.1109/synasc.2011.9</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bocicor</surname>
<given-names>M.-I.</given-names>
</name>
<name>
<surname>Czibula</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Czibula</surname>
<given-names>I. G.</given-names>
</name>
</person-group> (<year>2011b</year>). <article-title>A distributed Q-learning approach to fragment assembly</article-title>. <source>ICI Buchar.</source> <volume>20</volume> (<issue>3</issue>). <pub-id pub-id-type="doi">10.24846/v20i3y201103</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Botvinick</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ritter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J. X.</given-names>
</name>
<name>
<surname>Kurth-Nelson</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Blundell</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hassabis</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Reinforcement learning, fast and slow</article-title>. <source>Trends Cognitive Sci.</source> <volume>23</volume> (<issue>5</issue>), <fpage>408</fpage>&#x2013;<lpage>422</lpage>. <pub-id pub-id-type="doi">10.1016/j.tics.2019.02.006</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bradnam</surname>
<given-names>K. R.</given-names>
</name>
<name>
<surname>Fass</surname>
<given-names>J. N.</given-names>
</name>
<name>
<surname>Alexandrov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Baranay</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bechner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Birol</surname>
<given-names>I.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Assemblathon 2: evaluating <italic>de novo</italic> methods of genome assembly in three vertebrate species</article-title>. <source>GigaScience</source> <volume>2</volume> (<issue>1</issue>), <fpage>10</fpage>. <pub-id pub-id-type="doi">10.1186/2047-217x-2-10</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="other">
<person-group person-group-type="author">
<name>
<surname>Brockman</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Cheung</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pettersson</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>OpenAI Gym</article-title>. <source>arXiv</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1606.01540</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cook</surname>
<given-names>W. J.</given-names>
</name>
</person-group> (<year>2012</year>). <source>Pushing the limits</source>, <fpage>211</fpage>&#x2013;<lpage>212</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://www.jstor.org/stable/j.ctt7t8kc.15">http://www.jstor.org/stable/j.ctt7t8kc.15</ext-link>.</comment>
</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cormen</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Leiserson</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Rivest</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Stein</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Introduction to algorithms</source>. <edition>Third Edition</edition>. <publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>The MIT Press</publisher-name>. <pub-id pub-id-type="doi">10.5555/580470</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Crespo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wichert</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Reinforcement learning applied to games</article-title>. <source>SN Appl. Sci.</source> <volume>2</volume>, <fpage>824</fpage>. <pub-id pub-id-type="doi">10.1007/s42452-020-2560-3</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dulac-Arnold</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Mankowitz</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Hester</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Challenges of real-world reinforcement learning</article-title>,&#x201d; in <source>ICML 2019 workshop on reinforcement learning for real life (RLRL)</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1904.12901</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fernandez</surname>
<given-names>L. A.</given-names>
</name>
<name>
<surname>Martin-Mayor</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Yllanes</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Phase transition in the computational complexity of the shortest common superstring and genome assembly</article-title>. <source>Phys. Rev. E</source> <volume>109</volume>, <fpage>014133</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevE.109.014133</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gimelfarb</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sanner</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>C.-G.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Epsilon-BMC: a Bayesian ensemble approach to epsilon-greedy exploration in model-free reinforcement learning</article-title>,&#x201d; in <source>Proceedings of machine learning research</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Adams</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Gogate</surname>
<given-names>V.</given-names>
</name>
</person-group> (<publisher-loc>Tel Aviv, Israel</publisher-loc>: <publisher-name>PMLR</publisher-name>), <volume>115</volume>, <fpage>476</fpage>&#x2013;<lpage>485</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Grinstead</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Snell</surname>
<given-names>J. L.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Introduction to probability</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://books.google.com.br/books?id=7ip55ODL72wC">https://books.google.com.br/books?id&#x3d;7ip55ODL72wC</ext-link>.</comment>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grondman</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Busoniu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lopes</surname>
<given-names>G. A. D.</given-names>
</name>
<name>
<surname>Babuska</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>A survey of actor-critic reinforcement learning: standard and natural policy gradients</article-title>. <source>IEEE Trans. Syst. Man, Cybern. Part C Appl. Rev.</source> <volume>42</volume> (<issue>6</issue>), <fpage>1291</fpage>&#x2013;<lpage>1307</lpage>. <pub-id pub-id-type="doi">10.1109/TSMCC.2012.2218595</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gurevich</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Saveliev</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Vyahhi</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Tesler</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>QUAST: quality assessment tool for genome assemblies</article-title>. <source>Bioinformatics</source> <volume>29</volume> (<issue>8</issue>), <fpage>1072</fpage>&#x2013;<lpage>1075</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btt086</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hafner</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Pasukonis</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ba</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lillicrap</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Mastering diverse control tasks through world models</article-title>. <source>Nature</source> <volume>640</volume> (<issue>8059</issue>), <fpage>647</fpage>&#x2013;<lpage>653</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-025-08744-2</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Harris</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Millman</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>van der Walt</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Gommers</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Virtanen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cournapeau</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Array programming with NumPy</article-title>. <source>Nature</source> <volume>585</volume>, <fpage>357</fpage>&#x2013;<lpage>362</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heather</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Chain</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>The sequence of sequencers: the history of sequencing DNA</article-title>. <source>Genomics</source> <volume>107</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1016/j.ygeno.2015.11.003</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jamialahmadi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Khalili-Tanha</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Nazari</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Rezaei-Tavirani</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Artificial intelligence and bioinformatics: a journey from traditional techniques to smart approaches</article-title>. <source>Gastroenterology Hepatology Bed Bench</source> <volume>17</volume> (<issue>3</issue>), <fpage>241</fpage>&#x2013;<lpage>252</lpage>. <pub-id pub-id-type="doi">10.22037/ghfbb.v17i3.2977</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ji</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>MetaSort untangles metagenome assembly by reducing microbial community complexity</article-title>. <source>Nat. Commun.</source> <volume>8</volume> (<issue>1</issue>), <fpage>14306</fpage>. <pub-id pub-id-type="doi">10.1038/ncomms14306</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karami</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alizadehsani</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jahanian</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Argha</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dehzangi</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Alinejad-Rokny</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Revolutionizing genomics with reinforcement learning techniques</article-title>. <source>arXiv</source>. <pub-id pub-id-type="doi">10.48550/ARXIV.2302.13268</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Konar</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2005</year>). <source>Evolutionary computing algorithms</source>. <publisher-loc>Berlin, Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>, <fpage>323</fpage>&#x2013;<lpage>351</lpage>. <pub-id pub-id-type="doi">10.1007/3-540-27335-2_12</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>LeCun</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>1.1 deep learning hardware: past, present, and future</article-title>,&#x201d; in <conf-name>2019 IEEE International Solid-State Circuits Conference (ISSCC)</conf-name>, <fpage>12</fpage>&#x2013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1109/isscc.2019.8662396</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Comparison of the two major classes of assembly algorithms: overlap&#x2013;layout&#x2013;consensus and de-bruijn-graph</article-title>. <source>Briefings Funct. Genomics</source> <volume>11</volume> (<issue>1</issue>), <fpage>25</fpage>&#x2013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1093/bfgp/elr035</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Medvedev</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Georgiou</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Myers</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Brudno</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Computability of models for sequence assembly</article-title>. In: <conf-name>Lecture Notes in Computer Science</conf-name>, <publisher-name>Springer</publisher-name>. <publisher-loc>Berlin</publisher-loc>. <fpage>289</fpage>&#x2013;<lpage>301</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-540-74126-8_27</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mnih</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kavukcuoglu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Veness</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bellemare</surname>
<given-names>M. G.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Human-level control through deep reinforcement learning</article-title>. <source>Nature</source> <volume>518</volume>, <fpage>529</fpage>&#x2013;<lpage>533</lpage>. <pub-id pub-id-type="doi">10.1038/nature14236</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Osborne</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2022</year>). <source>Applying reinforcement learning on real-world data with practical examples in Python</source>. <publisher-name>Springer International Publishing</publisher-name>. <pub-id pub-id-type="doi">10.1007/978-3-031-79167-3</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Padovani</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Alves</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Using reinforcement learning in genome assembly: in-depth analysis of a Q-learning assembler</source>. <publisher-name>OSF</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://osf.io/tp4zj/?view_only=18dbb6c733d84939b62ee79d740fd3b2">https://osf.io/tp4zj/?view_only&#x3d;18dbb6c733d84939b62ee79d740fd3b2</ext-link>.</comment>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peterson</surname>
<given-names>E. J.</given-names>
</name>
<name>
<surname>Verstynen</surname>
<given-names>T. D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A way around the exploration-exploitation dilemma</article-title>. <source>bioRxiv</source>. <pub-id pub-id-type="doi">10.1101/671362</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ponsen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Tuyls</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Abstraction and generalization in reinforcement learning: a summary and framework</article-title>,&#x201d; in <source>Adaptive and learning agents</source>, <fpage>1</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-642-11814-2_1</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Portin</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wilkins</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The evolving definition of the term &#x201c;gene&#x201d;</article-title>. <source>Genetics</source> <volume>205</volume> (<issue>4</issue>), <fpage>1353</fpage>&#x2013;<lpage>1364</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.116.196956</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Roughgarden</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Algorithms illuminated (Part 4): algorithms for NP-hard problems</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>Soundlikeyourself Publishing</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://books.google.com.br/books?id=FlmuzQEACAAJ">https://books.google.com.br/books?id&#x3d;FlmuzQEACAAJ</ext-link>.</comment>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shakya</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Pillai</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chakrabarty</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Reinforcement learning algorithms: a brief survey</article-title>. <source>Expert Syst. Appl.</source> <volume>231</volume>, <fpage>120495</fpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2023.120495</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silva</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Padovani</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>G&#xf3;es</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Alves</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>geneRFinder: gene finding in distinct metagenomic data complexities</article-title>. <source>BMC Bioinforma.</source> <volume>22</volume>, <fpage>87</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-021-03997-w</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smith</surname>
<given-names>T. F.</given-names>
</name>
<name>
<surname>Waterman</surname>
<given-names>M. S.</given-names>
</name>
</person-group> (<year>1981</year>). <article-title>Identification of common molecular subsequences</article-title>. <source>J. Mol. Biol.</source> <volume>147</volume> (<issue>1</issue>), <fpage>195</fpage>&#x2013;<lpage>197</lpage>. <pub-id pub-id-type="doi">10.1016/0022-2836(81)90087-5</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Souza</surname>
<given-names>K. P.</given-names>
</name>
<name>
<surname>Setubal</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Carvalho</surname>
<given-names>A. C. P.</given-names>
</name>
<name>
<surname>Oliveira</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chateau</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Alves</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Machine learning meets genome assembly</article-title>. <source>Brief. Bioinforma.</source> <volume>20</volume>, <fpage>2116</fpage>&#x2013;<lpage>2129</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bby072</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sutton</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Barto</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Reinforcement learning: an introduction</source>. <publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>A Bradford Book</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://incompleteideas.net/book/RLbook2020.pdf">http://incompleteideas.net/book/RLbook2020.pdf</ext-link>.</comment>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taylor</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Transfer learning for reinforcement learning domains: a survey</article-title>. <source>J. Mach. Learn. Res.</source> <volume>10</volume>, <fpage>1633</fpage>&#x2013;<lpage>1685</lpage>. <pub-id pub-id-type="doi">10.5555/1577069.1755839</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Trott</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Socher</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Keeping your distance: solving sparse reward tasks using self-balancing shaped rewards</article-title>,&#x201d; in <source>Advances in neural information processing systems 32: annual conference on neural information processing systems 2019</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Wallach</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>d&#x2019;Alch&#xe9;-Buc</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>E. B.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>BC, Canada</publisher-loc>: <publisher-name>NeurIPS</publisher-name>), <volume>2019</volume>, <fpage>10376</fpage>&#x2013;<lpage>10386</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1911.01417</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2025a</year>). <article-title>Discovery of antimicrobial peptides with notable antibacterial potency by an LLM-based foundation model</article-title>. <source>Sci. Adv.</source> <volume>11</volume>, <fpage>eads8932</fpage>. <pub-id pub-id-type="doi">10.1126/sciadv.ads8932</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2025b</year>). <article-title>3DSMILES-GPT: 3D molecular pocket-based generation with token-only large language model</article-title>. <source>Chem. Sci.</source> <volume>16</volume>, <fpage>637</fpage>&#x2013;<lpage>648</lpage>. <pub-id pub-id-type="doi">10.1039/D4SC06864E</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wong</surname>
<given-names>H. L.</given-names>
</name>
<name>
<surname>MacLeod</surname>
<given-names>F. I.</given-names>
</name>
<name>
<surname>White</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Visscher</surname>
<given-names>P. T.</given-names>
</name>
<name>
<surname>Burns</surname>
<given-names>B. P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Microbial dark matter filling the niche in hypersaline microbial mats</article-title>. <source>Microbiome</source> <volume>8</volume> (<issue>1</issue>), <fpage>135</fpage>. <pub-id pub-id-type="doi">10.1186/s40168-020-00910-0</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Xavier</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Souza</surname>
<given-names>K. P.</given-names>
</name>
<name>
<surname>Chateau</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Alves</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Genome assembly using reinforcement learning</article-title>,&#x201d; in <source>Advances in bioinformatics and computational biology</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Kowada</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Oliveira</surname>
<given-names>D.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>16</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-46417-2_2</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>S. J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Transfer learning in reinforcement learning</article-title>,&#x201d; in <source>Transfer learning</source> (<publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>), <fpage>105</fpage>&#x2013;<lpage>125</lpage>. <pub-id pub-id-type="doi">10.1017/9781139061773.010</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yassine</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Riffi</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A review on machine-learning and nature-inspired algorithms for genome assembly</article-title>. <source>Int. J. Adv. Comput. Sci. Appl.</source> <volume>14</volume> (<issue>7</issue>), <fpage>898</fpage>. <pub-id pub-id-type="doi">10.14569/ijacsa.2023.0140798</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Towards sample efficient reinforcement learning</article-title>. <source>Proc. 27th Int. Jt. Conf. Artif. Intell.</source> <volume>18</volume>, <fpage>5739</fpage>&#x2013;<lpage>5743</lpage>. <pub-id pub-id-type="doi">10.24963/ijcai.2018/820</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A geometric deep learning framework for drug repositioning over heterogeneous information networks</article-title>. <source>Brief. Bioinforma.</source> <volume>23</volume> (<issue>6</issue>), <fpage>bbac384</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbac384</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Regulation-aware graph learning for drug repositioning over heterogeneous biological network</article-title>. <source>Inf. Sci.</source> <volume>686</volume>, <fpage>121360</fpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2024.121360</pub-id>
</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Transfer learning in deep reinforcement learning: a survey</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>45</volume> (<issue>11</issue>), <fpage>13344</fpage>&#x2013;<lpage>13362</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2023.3292075</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>