<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">756041</article-id>
<article-id pub-id-type="doi">10.3389/fdata.2021.756041</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>
<italic>HPTMT</italic> Parallel Operators for High Performance Data Science and Data Engineering</article-title>
<alt-title alt-title-type="left-running-head">Abeykoon et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">HPTMT Parallel Data Science Operators</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Abeykoon</surname>
<given-names>Vibhatha</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1477988/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kamburugamuve</surname>
<given-names>Supun</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1523907/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Widanage</surname>
<given-names>Chathura</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1609975/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Perera</surname>
<given-names>Niranda</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1532856/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Uyar</surname>
<given-names>Ahmet</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1610303/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kanewala</surname>
<given-names>Thejaka Amila</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1610960/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>von Laszewski</surname>
<given-names>Gregor</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1577612/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fox</surname>
<given-names>Geoffrey</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1337205/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Indiana University Alumni</institution>, <addr-line>Bloomington</addr-line>, <addr-line>IN</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Luddy School of Informatics, Computing and Engineering</institution>, <addr-line>Bloomington</addr-line>, <addr-line>IN</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Biocomplexity Institute and Initiative, University of Virginia</institution>, <addr-line>Charlottesville</addr-line>, <addr-line>VA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Computer Science Department, University of Virginia</institution>, <addr-line>Charlottesville</addr-line>, <addr-line>VA</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1008713/overview">Domenico Talia</ext-link>, University of Calabria, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1400752/overview">Giovanni Ponti</ext-link>, Italian National Agency for New Technologies, Energy and Sustainable Economic Development (ENEA), Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1453858/overview">Loris Belcastro</ext-link>, University of Calabria, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1471598/overview">Patrizio Dazzi</ext-link>, National Research Council (CNR), Italy</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Vibhatha Abeykoon, <email>vibhatha@gmail.com</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Data Mining and Management, a section of the journal Frontiers in Big&#x20;Data</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>02</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>4</volume>
<elocation-id>756041</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>08</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>11</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Abeykoon, Kamburugamuve, Widanage, Perera, Uyar, Kanewala, von Laszewski and Fox.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Abeykoon, Kamburugamuve, Widanage, Perera, Uyar, Kanewala, von Laszewski and Fox</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Data-intensive applications are becoming commonplace in all science disciplines. They are comprised of a rich set of sub-domains such as data engineering, deep learning, and machine learning. These applications are built around efficient data abstractions and operators that suit the applications of different domains. Often, the lack of a clear definition of data structures and operators in the field has led to other implementations that do not work well together. The <italic>HPTMT</italic> architecture that we proposed recently identifies a set of data structures, operators, and an execution model for creating rich data applications that links all aspects of data engineering and data science together efficiently. This paper elaborates and illustrates this architecture using an end-to-end application with deep learning and data engineering parts working together. Our analysis shows that the proposed system architecture is better suited for high performance computing environments compared to the current big data processing systems. Furthermore, our proposed system emphasizes the importance of efficient compact data structures such as the Apache Arrow tabular data representation defined for high performance. Thus, the system integration we proposed scales a sequential computation to a distributed computation, retaining optimum performance along with a highly usable application programming interface.</p>
</abstract>
<kwd-group>
<kwd>exascale and HPC systems</kwd>
<kwd>cylon</kwd>
<kwd>parallel computation</kwd>
<kwd>deep learning</kwd>
<kwd>Apache software foundation</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Data engineering and data science are two major branches of data-intensive applications. Data engineering deals with collecting, storing, and transforming data. Data science tasks (deep learning, machine learning, data engineering) comprise several disciplines, of which machine learning and deep learning are significant. This is the place where we use data to learn and gain insights. These two components, illustrated in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>, are designed on top of data structures and operators around them. The data engineering component primarily works with table data abstractions, while the machine learning and deep learning components mainly use tensors and matrices.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Data science workflow with Jupyter Notebook interface and Data Engineering around Deep Learning.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g001.tif"/>
</fig>
<p>To run applications using multiple computers, we can partition the data and apply distributed operators. Current systems use several different strategies to provide distributed application programming interfaces (APIs) for data-intensive applications. An API for data-intensive applications is a combination of data structures, operators, and an execution model. There are thousands of operators defined around data structures such as vectors and tables by different frameworks. The current data systems use asynchronous and loosely synchronous execution models for running programs at scale. Asynchronous execution is popular in systems such as Spark (<xref ref-type="bibr" rid="B39">Zaharia et&#x20;al., 2010</xref>), Dask (<xref ref-type="bibr" rid="B30">Rocklin, 2015</xref>) and Modin (<xref ref-type="bibr" rid="B29">Petersohn et&#x20;al., 2020</xref>). Loosely synchronous distributed execution is used in systems such as PyTorch (<xref ref-type="bibr" rid="B28">Paszke et&#x20;al., 2019</xref>), Cylon (<xref ref-type="bibr" rid="B35">Widanage et&#x20;al., 2020</xref>) and Twister2 (<xref ref-type="bibr" rid="B19">Fox, 2017</xref>).</p>
<p>In a previous paper (<xref ref-type="bibr" rid="B25">Kamburugamuve et&#x20;al., 2021</xref>), the authors proposed the <italic>HPTMT</italic> (High-Performance Tensors, Matrices, and Tables), an operator-based architecture for data-intensive applications as a scalable and interoperable way for designing rich data-intensive applications. With <italic>HPTMT</italic> we focus, as depicted in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>, on the interoperability of distributed operators and how one can build large-scale applications using different data abstractions. <xref ref-type="fig" rid="F2">Figure&#x20;2</xref> contains two aspects of data analytics. One is the data processing, which includes data loading, data cleaning, feature engineering, etc., which are the main steps followed in obtaining a final dataset which is ready for mathematical evaluations. The big data systems such as Apache Spark, Apache Flink, and Apache Storm are written in the Java language. The other aspect is the mathematical computations required to process the data further. The majority of these data structures are fully or partially represented in terms of dataframes (set of arrays: Dask, Pandas) and separate arrays (Numpy, Tensors). Frameworks like Numpy, Dask, and Pandas are written in Python to provide much easier access for data scientists to write analytical programs without being concerned about the underlying computation models. The likes of Dask and Cylon further enhance the ability to run such computations in parallel to support computation intensive jobs. On top of these computations, systems such as PyTorch and Tensorflow allow running complex mathematical models based on machine learning or deep learning algorithms.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The goal of HPTMT to achieve High Performance in each ecosystem and high-performance integration between ecosystems.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g002.tif"/>
</fig>
<p>This paper will showcase the importance of <italic>HPTMT</italic> architecture through an application that uses various data abstractions in a single distributed environment to compose a rich application. It highlights the scalability of the architecture and its applicability to high-performance computing systems.</p>
<p>The rest of the paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> gives an overview of the <italic>HPTMT</italic> architecture. <xref ref-type="sec" rid="s3">Section 3</xref> describes the distributed execution of various frameworks and how they can work together according to the <italic>HPTMT</italic>. <xref ref-type="sec" rid="s4">Section 4</xref> describes an end-to-end application while <xref ref-type="sec" rid="s5">section 5</xref> highlights the performance. In <xref ref-type="sec" rid="s6">section 6</xref> we describe related work and conclude in <xref ref-type="sec" rid="s8">section&#x20;8</xref>.</p>
</sec>
<sec id="s2">
<title>2&#x20;<italic>HPTMT</italic> Architecture</title>
<p>
<italic>HPTMT</italic> architecture defines an operator model along with an execution model for scaling data-intensive applications. The primary goal of <italic>HPTMT</italic> is the efficient composability of distributed operators around different data structures to define complex data engineering applications. We see this architecture as a good candidate for exascale software environments. Its simple premise&#x2014;<italic>put the parallelism into interoperable libraries</italic>&#x2014;seems practical to implement well on heterogeneous collections of accelerators and CPUs. Note that one of the most widely adopted approaches to parallel computing is the use of runtime libraries of well-implemented parallel operations. This was a key tenet of frameworks such as High-Performance Fortran HPF (<xref ref-type="bibr" rid="B16">Dongarra et&#x20;al., 2003</xref>) and related parallel environments [HPJava (<xref ref-type="bibr" rid="B11">Carpenter et&#x20;al., 1998</xref>), HPC&#x2b;&#x2b; (Johnson and Gannon, 1997), Chapel (<xref ref-type="bibr" rid="B12">Chamberlain et&#x20;al., 2007</xref>), Fortress (<xref ref-type="bibr" rid="B3">Allen et&#x20;al., 2005</xref>), X10 (<xref ref-type="bibr" rid="B13">Charles et&#x20;al., 2005</xref>), Habanero-Java (<xref ref-type="bibr" rid="B24">Imam and Sarkar, 2014</xref>)]. Even though these frameworks became popular in parallel computing/HPC, they had limited success in data engineering. We believe that a major reason behind this is the lack of well-defined data engineering operators. Historically, such systems were used in sophisticated computational science simulations with large linear algebra operators (e.g., BLAS routines). <italic>HPTMT</italic> attempts to bridge this gap between HPC and data-intensive applications by providing a set of well-defined data engineering operators with a highly scalable execution&#x20;model.</p>
<sec id="s2-1">
<title>2.1 Principles</title>
<p>
<italic>HPTMT</italic> architecture defines several core principles for a framework to be compatible with it. These are summarized below and more details can be found in the paper (<xref ref-type="bibr" rid="B25">Kamburugamuve et&#x20;al., 2021</xref>).<list list-type="simple">
<list-item>
<p>&#x2022; Use of multiple data abstractions (Tensors, Matrices, Tables) and operators around them that are suitable for each class of applications.</p>
</list-item>
<list-item>
<p>&#x2022; Loosely Synchronous Execution&#x2014;In an asynchronous framework, operators and the scheduler are coupled, making it harder to work across different systems.</p>
</list-item>
<list-item>
<p>&#x2022; Operators should be independent of the parallel execution environment&#x2014;A parallel environment manages the processes and various resources required by operators, such as the network. If the implementation of operators is coupled to the execution environment, we can only use the operators specifically designed for&#x20;it.</p>
</list-item>
<list-item>
<p>&#x2022; Same operator on different hardware&#x2014;The same operator can be implemented on GPUs, CPUs or FPGA (Field Programmable Gate Arrays). Also, they should be able to use different networking technologies such as Ethernet and InfiniBand.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2-2">
<title>2.2 Operators</title>
<p>An application domain such as deep learning or data engineering comprises a combination of operators to build the total job. Based on the data distribution, these operators can be categorized into two groups, namely, local operators (single machine) and distributed operators (across multiple machines). Some operators are purely local or purely distributed, and some can be either. A local operator only works with a single piece of data inside the memory of a single node in a cluster. They give rise to what is called embarrassingly or pleasingly parallel models for distributed execution. Operator based methods are not just used to support parallelism but have several other valuable capabilities.<list list-type="simple">
<list-item>
<p>&#x2022; Allow interpreted languages to be efficient as overhead is amortized over the execution of a (typically large) operation</p>
</list-item>
<list-item>
<p>&#x2022; Support mixed language environments where invoking language (e.g., Python) is distinct from the language that implements the operator (e.g., C&#x2b;&#x2b;)</p>
</list-item>
<list-item>
<p>&#x2022; Support proxy models where user programs in an environment that runs not just in a different language but also on a different computing system from the executing operators. This includes the important case where the execution system includes GPUs and other accelerators.</p>
</list-item>
<list-item>
<p>&#x2022; Support excellent performance even in non-parallel environments. This is the case for Numpy and Pandas operators.</p>
</list-item>
</list>
</p>
<p>Recent frameworks such as Apache Arrow (<xref ref-type="bibr" rid="B4">Apache Arrow, 2021</xref>, Apache Software Foundation, Accessed 2021/Aug), and Parquet (<xref ref-type="bibr" rid="B5">Apache Parquet, 2021</xref>, Apache Software Foundation, Accessed 2021/Aug) provide essential tools which are crucial to our approach to <italic>HPTMT</italic>, and they (or equivalent technologies) are vital for any high-performance multi-language multi-operator class system. They provide efficient language-agnostic column storage for Tables and Tensors that allows vectorization for efficiency and performance. Note that distributed parallel computing performance is typically achieved by decomposing the rows of a table across multiple processors. Then within a processor, columns can be vectorized. This, of course, requires a large amount of data so that each processor has a big enough workflow to process efficiently. It is a well-established principle that the problem needs to be large enough for the success of parallel computing (<xref ref-type="bibr" rid="B21">Fox et&#x20;al., 1994</xref>), which the latest Big Data trends also follow. Note that the most compelling parallel algorithms use block (i.e.,&#x20;row and column) decompositions in scientific computing to minimize communication/compute ratios. Such block decompositions can be used in Big Data (<xref ref-type="bibr" rid="B23">Huai et&#x20;al., 2014</xref>) (i.e. table data structures), but could be less natural due to the heterogeneous data within&#x20;it.</p>
<p>For Big Data problems, individual operators are sufficiently computationally intensive to consider the basic job components as parallel operator invocations. Any given problem typically involves the composition of multiple operators into an analytics pipeline or more complex topology. Each node of the workflow may run in parallel. This can be efficiently and elegantly implemented using workflow systems such as Parsl (<xref ref-type="bibr" rid="B7">Babuji et&#x20;al., 2019</xref>), Swift (<xref ref-type="bibr" rid="B36">Wilde et&#x20;al., 2011</xref>), Pegasus (<xref ref-type="bibr" rid="B15">Deelman et&#x20;al., 2015</xref>), Argo (<xref ref-type="bibr" rid="B6">Argo Home Page, 2021</xref> <ext-link ext-link-type="uri" xlink:href="https://argoproj.github.io/argo-workflows/">https://argoproj.github.io/argo-workflows/</ext-link>, Accessed 2021/Aug), Kubeflow (<xref ref-type="bibr" rid="B26">Kubeflow, 2021</xref> home page <ext-link ext-link-type="uri" xlink:href="https://www.kubeflow.org/">https://www.kubeflow.org/</ext-link>, 2021), Kubernetes (<xref ref-type="bibr" rid="B9">Burns et&#x20;al., 2016</xref>) or dataflow systems (Spark, Flink, Twister2), preserving the parallelism of <italic>HPTMT</italic>.</p>
<sec id="s2-2-1">
<title>2.2.1 Categorizing Operators</title>
<p>There are thousands of operators defined for arrays, tensors, tables, and matrices. Note that tensors are similar to arrays but have an important deep learning utility. Matrices are similar to arrays and tensors but typically two dimensional. Tables (and dataframes) are characterized by entries of heterogeneous types. This is widely used in databases where the different columns can have strings to dates to numbers. <xref ref-type="table" rid="T1">Table&#x20;1</xref> shows some common operator categories for tensors as defined by PyTorch, Tensorflow or Keras. These deep learning frameworks define over 700 operators on tensors. Numpy lists 1085 array operations. <xref ref-type="table" rid="T2">Table&#x20;2</xref> shows some of the popular operations on tables, where the Python Pandas library has around 224 dataframe operators out of a listed total of 4782. Also, optimized linear algebra operators are used internally in most widely used math and tensor compute libraries. <xref ref-type="table" rid="T3">Table&#x20;3</xref> contains a classification of BLAS operators, which are local or distributed. The (old but standard) library SCALAPACK has 320 functions (operators) at a given precision and a total of over one thousand.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Sample set of tensor operations as specified by PyTorch.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Operation class</th>
<th align="center">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Create</td>
<td align="left">Create tensors from files, in-memory data or other data structures such as NumPy</td>
</tr>
<tr>
<td align="left">Math</td>
<td align="left">Multiplication, addition</td>
</tr>
<tr>
<td align="left">Statistics</td>
<td align="left">Statistical function such as mean, median, std</td>
</tr>
<tr>
<td align="left">Indexing</td>
<td align="left">Different methods to access values of tensors</td>
</tr>
<tr>
<td align="left">Conversion</td>
<td align="left">Convert a tensor to another format such as NumPy or change the shape of a tensor</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Operators on tables.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Operator</th>
<th align="center">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Select</td>
<td align="left">Filters out some records based on the value of one or more columns</td>
</tr>
<tr>
<td align="left">Project</td>
<td align="left">Creates a different view of the table by dropping some of the columns</td>
</tr>
<tr>
<td align="left">Union</td>
<td align="left">Applicable on two tables having similar schemas to keep all the records from both tables and remove the duplicates</td>
</tr>
<tr>
<td align="left">Cartesian Product</td>
<td align="left">Applicable on two tables (which need not share a schema) to produce the set of all possible record pairs, combining each record of the first table with each record of the second table</td>
</tr>
<tr>
<td align="left">Difference</td>
<td align="left">Retains all the records of the first table, while removing the matching records present in the second table</td>
</tr>
<tr>
<td align="left">Intersect</td>
<td align="left">Applicable on two tables having similar schemas to keep only the records that are present in both tables</td>
</tr>
<tr>
<td align="left">Join</td>
<td align="left">Combines two tables based on the values of columns. Includes variations Left, Right, Full, Outer, and Inner joins</td>
</tr>
<tr>
<td align="left">OrderBy</td>
<td align="left">Sorts the records of the table based on a specified column</td>
</tr>
<tr>
<td align="left">Aggregate</td>
<td align="left">Performs a calculation on a set of values (records) and outputs a single value (record). Common aggregations include summation and multiplication</td>
</tr>
<tr>
<td align="left">GroupBy</td>
<td align="left">Groups the data using the given columns; GroupBy is usually followed by aggregate operations</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Operations as specified by BLAS.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Operation</th>
<th align="center">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Level 1</td>
<td align="left">Operations on vectors, e.g.,&#x20;adding two vectors</td>
</tr>
<tr>
<td align="left">Level 2</td>
<td align="left">Operations for combination of vectors and matrices, e.g.,&#x20;matrix and vector multiplication</td>
</tr>
<tr>
<td align="left">Level 3</td>
<td align="left">Matrix operations, e.g.,&#x20;matrix and matrix multiplication</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Distributed Operators</title>
<p>A distributed operator works across data in multiple processes in many nodes of a cluster. A distributed operator needs communication options and local operators. Compared to the number of local operators defined on a data structure, there is a limited set of communication operators for a given data structure, and some of them are listed in <xref ref-type="table" rid="T4">Table&#x20;4</xref>, where 720 MPI operators support classic parallel computing. Higher-level distributed operations are built by combining these communication operations with local operations, as shown in <xref ref-type="table" rid="T5">Table&#x20;5</xref>. These include the famous MapReduce (<xref ref-type="bibr" rid="B14">Dean and Ghemawat, 2008</xref>), whose abstraction clearly showed the similarity between distributed operators in the technical and database computing domains. MapReduce and its implementation in Hadoop enabled parallel databases as in Apache Hive. They added Group-By and key-value pairs to the Reduce operation common in the previous HPF family simulation applications. The powerful yet straightforward MapReduce operation was expanded in Big Data systems, primarily through the operators of Databases (union, join, etc.), Pandas, and the Spark, Flink, Twister2 family of systems.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Communication operations for data structures.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Data structure</th>
<th align="center">Operations</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Arrays</td>
<td align="left">Reduce, AllReduce, Gather, AllGather, Scatter, AllToAll, Broadcast, Point-to-Point</td>
</tr>
<tr>
<td align="left">Tables</td>
<td align="left">Shuffle (Similar to AllToAll but specifically designed for Tables), Broadcast, AllGather</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Higher level distributed operations.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Distributed operation</th>
<th align="center">Implementation</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Sorting tables</td>
<td align="left">Shuffle followed by a local sorting operation</td>
</tr>
<tr>
<td align="left">Join tables</td>
<td align="left">Partitioning of records, shuffle and local join operation</td>
</tr>
<tr>
<td align="left">Matrix multiplication</td>
<td align="left">Point to point communication and local multiplication</td>
</tr>
<tr>
<td align="left">Vector addition</td>
<td align="left">AllReduce with SUM</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Distributed Execution</title>
<p>There are two main distributed execution methods used in current systems. They are fully asynchronous execution and loosely synchronous execution (<xref ref-type="bibr" rid="B20">Fox, 1989</xref>; <xref ref-type="bibr" rid="B33">Valiant, 1990</xref>). In an asynchronous system, the parallel task instances can execute independently using task queues to decouple them in time. This is seen in systems like Spark, Dask, and Hadoop. In a loosely synchronous system, the parallel tasks assume they can directly send messages to other similar jobs. It is called loosely synchronous because synchronization only happens when they need to communicate with each other. Otherwise, parallel task instances can work independently. This makes loosely synchronous applications highly scalable and more performant.</p>
<p>The asynchronous execution requires the system to be tightly integrated with a central coordinator and a scheduler. It may also employ &#x201c;mail-boxes&#x201d; or shared storage to fully decouple each task in the execution (this is important because there may be in-flight messages and the corresponding receiver task would consume them at a later stage). While this model allows features such as fault-tolerance, dynamic resource allocation, and effective usage of compute resources, it is susceptible to scheduler overheads, message passing delays, etc. Thereby, the async model incurs a performance penalty, and also makes it harder to develop distributed operators independently and make them work together.</p>
<p>We observe that the current technology and hardware advancements provide more reliable, highly available compute resources with faster networks. We believe that these trends enable loosely synchronous execution in modern computing environments, and thereby the development of high performance and highly scalable data engineering applications. We are seeing this trend being employed successfully in similar complementary domains, such as distributed data parallel deep learning (PyTorch, Tensorflow, Horovod, etc.). Therefore, the <italic>HPTMT</italic> architecture embraces this execution model, and attempts to broaden the horizons of data engineering and data science in terms of performance and scalability.</p>
</sec>
</sec>
<sec id="s3">
<title>3&#x20;<italic>HPTMT</italic> Frameworks</title>
<p>Now let us look at Cylon and Deep Learning frameworks and see how they can work together according to the <italic>HPTMT</italic> architecture. First, we describe how Cylon is designed to support distributed data engineering on a dataframe abstraction. Then we discuss how Cylon can be coupled with state-of-the-art deep learning frameworks to organise end-to-end data analytics workloads.</p>
<sec id="s3-1">
<title>3.1 Cylon</title>
<p>Cylon (<xref ref-type="bibr" rid="B2">Abeykoon et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B35">Widanage et&#x20;al., 2020</xref>) provides a distributed memory DataFrame API on Python for processing data using a tabular format. Cylon provides a Python API around high-performance compute kernels in C&#x2b;&#x2b;. These kernels are written on top of the Apache Arrow based efficient in-memory table representation. It can be deployed with MPI for distributed memory computations processing large datasets in HPC clusters. Operators in Cylon are based on relational algebra and closely resemble the operators in Pandas DataFrame to provide a consistent experience. The user can program with a global view of data by applying operations on them. Also, they can convert the data to local parallel processes and do in-memory operations as well. Cylon can be thought of as a framework that can work across different frameworks, data formats to connect various applications, as shown in <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Cylon for data engineering. Reproduced with permission from <xref ref-type="bibr" rid="B35">Widanage et al. (2020)</xref>.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g003.tif"/>
</fig>
<p>Cylon is different from other table abstractions such as Modin (<xref ref-type="bibr" rid="B29">Petersohn et&#x20;al., 2020</xref>), Dask (<xref ref-type="bibr" rid="B30">Rocklin, 2015</xref>) and Spark (<xref ref-type="bibr" rid="B39">Zaharia et&#x20;al., 2010</xref>) because it supports an efficient loosely synchronous execution model. These other frameworks use the asynchronous execution model, which relies on a central scheduler and a coordinator and does not conform to the <italic>HPTMT</italic> architecture. <xref ref-type="fig" rid="F4">Figure&#x20;4</xref> shows how the Cylon Join operator performs compared to other frameworks. This experiment used 200M records per relation (for both left and right tables in a join) and scaled up to 128 processes. Random data were generated by considering the uniqueness of data to be 10% such that the join performs under higher stress on hash functions and hash-based shuffles. In the parallel experiments, each process loads an equal amount of data such that the total amount is limited to 200M records. The results from <xref ref-type="fig" rid="F4">Figure&#x20;4</xref> show that our distributed join implementation is faster than the Dask and Modin implementations. Also, the scalability in Dask and Modin is not very strong compared to the scaling provided by PyCylon. In addition, Modin could not be scaled up beyond a single machine and failed in the execution.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Distributed join performance.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g004.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Deep Learning Frameworks</title>
<p>Deep learning workloads are compute-intensive. Most of the existing deep learning frameworks can run codes in a distributed manner. Here, the widely used approach is the distributed data-parallel model. The distributed data-parallel model deals with the distributed memory architecture and has the loosely synchronous execution capability.</p>
<p>PyTorch offers a distributed data-parallel (DDP) model, which allows the user to train large models using many GPUs. It can use distributed frameworks such as MPI, NCCL, or GLOO for the necessary communication operations for deep learning training with multiple GPUs. Tensorflow does loosely synchronous distributed execution <italic>via</italic> frameworks like Horovod. Due to these reasons, we can think of these systems as <italic>HPTMT</italic> when running data-parallel training using the loosely synchronous execution&#x20;model.</p>
</sec>
<sec id="s3-3">
<title>3.3 Deep Learning and Data Engineering</title>
<p>Because the distributed execution of Cylon and deep learning systems such as PyTorch and Tensorflow conform to the <italic>HPTMT</italic> architecture, they can work together in a single parallel program. This improves productivity and usability in dealing with end-to-end analytical problems. In a data analytics-aware data engineering workload, three main factors govern usability and performance.<list list-type="simple">
<list-item>
<p>&#x2022; Single source, including data engineering and data analytics</p>
</list-item>
<list-item>
<p>&#x2022; Simple execution mode for sequential and distributed computing</p>
</list-item>
<list-item>
<p>&#x2022; Support for CPUs and GPUs for distributed execution</p>
</list-item>
</list>
</p>
<p>The single source refers to writing the data engineering and analytics code in a single script and executing with a single command. This is a beneficial and efficient method to do data exploration based data analytics. For such workloads, feature engineering and data engineering components are extensively modified to see how the data analytics workload performs for different settings. In such cases, the data scientist must have room to write the usual Python script and run the data analytics workload efficiently, not only in a single node but also across multiple nodes. Simple execution mode refers to running the workload with a simple method to spawn the processes to run in parallel.</p>
<p>Data analytics frameworks provide different methods to spawn parallel jobs. For instance, Dask requires that the user start the workers and schedulers on each node and provide host information for distributed communication. MPI allows for a single execution command <italic>mpirun</italic> to spawn all the processes. Such factors are essential in providing a unified interface to do deep learning easily. Also, the execution mode on various accelerators for deep learning is a vital component. The majority of the frameworks support both CPU and GPU execution, so it is essential to provide the means to seamlessly integrate with these execution models to support data analytics workloads. <xref ref-type="fig" rid="F5">Figure&#x20;5</xref> highlights the high-level component overlay of a data analytics-aware data engineering workload. We have partitioned the workflow into four stages.<list list-type="simple">
<list-item>
<p>&#x2022; Stage 1: In the first stage, the processes must be spawned depending on the parallelism. A unified process spawning mechanism identifies worker information, such as host IP addresses for each machine or network information, at this&#x20;stage.</p>
</list-item>
<list-item>
<p>&#x2022; Stage 2: Worker information is extracted, and data engineering operators will run in distributed mode on top of the data engineering platform, which depends on the worker initialization component. Here the operations can be distributed or pleasingly parallel.</p>
</list-item>
<list-item>
<p>&#x2022; Stage 3: For data analytics workloads, the worker information, network information, chosen accelerator, and data must be provided from the corresponding data engineering process. This mapping is 1:1 for data engineering workers to data analytics workers. But this can also be a many-to-many relationship.</p>
</list-item>
<list-item>
<p>&#x2022; Stage 4: The worker information, network information and data will be used to execute the data analytics workload in distributed or pleasingly parallel&#x20;mode.</p>
</list-item>
</list>
</p>
<p>Considering this generic overview on deploying deep learning workloads with data engineering workloads, we have integrated PyCylon with distributed data-parallel models for PyTorch, Horovod-PyTorch, and Horovod-Tensorflow. Horovod is a distributed deep learning framework that supports a unified API for handling distributed deep learning on multiple frameworks. Horovod supports PyTorch, Tensorflow, and MXNet. In our research, we paid close attention to PyTorch and Tensorflow. Horovod internally uses mpirun to spawn the processes, and this model fits very well with PyCylon internals as we relied on mpirun to spawn the processes. This makes PyCylon uniquely qualified as a supportive data engineering framework for Horovod.</p>
<p>The first step is to initialize the runtime. Here either PyTorch distributed initialization, or PyCylon distributed initialization can be called. But especially on CPUs, the PyTorch initialization must be called since PyTorch internally does not handle the MPI initialization check. But if we use NCCL as the back-end, this constraint does not exist. This is one of the bugs we discovered from our previous research. For the PyTorch DDP, the master address and port must be provided because the NCCL back-end needs to identify which worker will be designated as the master-worker to coordinate the communication. In addition, the initialization method has to be set. After the distributed initialization in PyTorch, the PyCylon context must be initialized and set to distributed mode. After this stage, we complete the requirements for stage 1 and partial requirements for stage 3 (network information is also passed along with data in stage 3, which is initialized in this step). Listing&#x20;1 is a sample code snippet related to the initialization step.<list list-type="simple">
<list-item>
<p>
<bold>Listing 1.</bold> Stage 1: Initialization for PyTorch With PyCylon</p>
</list-item>
</list>
</p>
<p>
<monospace>os.environ[&#x27;MASTER_ADDR&#x27;] &#x3d; master_address</monospace>
</p>
<p>
<monospace>os.environ[&#x27;MASTER_PORT&#x27;] &#x3d; port</monospace>
</p>
<p>
<monospace>os.environ[&#x22;LOCAL_RANK&#x22;] &#x3d; <bold>str</bold>(rank)</monospace>
</p>
<p>
<monospace>os.environ[&#x22;RANK&#x22;] &#x3d; <bold>str</bold>(rank)</monospace>
</p>
<p>
<monospace>os.environ[&#x22;WORLD_SIZE&#x22;] &#x3d; <bold>str</bold>(world_size)</monospace>
</p>
<p>
<monospace>dist.init_process_group(backend&#x3d;backend, init_method&#x3d;&#x22;env://&#x22;)</monospace>
</p>
<p>
<monospace>mpi_config &#x3d; MPIConfig()</monospace>
</p>
<p>
<monospace>env &#x3d; CylonEnv(config&#x3d;mpi_config, distributed&#x3d;True)</monospace>
</p>
<p>The data engineering workload is done in PyCylon, assuming the distributed mode initialization. We first join two tables and use the join result for a deep learning workload. The distributed join is called by providing the initialized context information to the join function. At the end of this stage, we create the resultant dataframe, and later on, in Stage 3, this dataframe can be used to generate the Numpy array required for deep learning. This stage is typical for any framework, including PyTorch, Tensorflow, etc. Listing&#x20;2 details a sample data engineering workload for a data analytics problem.<list list-type="simple">
<list-item>
<p>
<bold>Listing 2.</bold> Stage 2: PyCylon Data Engineering Workload</p>
</list-item>
</list>
</p>
<p>
<monospace>df1 &#x3d; DataFrame(read_csv(&#x22;...&#x22;))</monospace>
</p>
<p>
<monospace>df2 &#x3d; DataFrame(read_csv(&#x22;...&#x22;))</monospace>
</p>
<p>
<monospace>join_df &#x3d; df1.merge(right&#x3d;df2, left_on&#x3d;[0], right_on&#x3d;[3], algorithm&#x3d;&#x27;hash&#x27;)</monospace>
</p>
<p>In Stage 3, the output of Stage 2 is used to create the tensors required for the deep learning stage. We also perform the data partitioning for training and testing. This stage is different from framework to framework since the tensor creation and data partitioning steps can have various internal utils. We do not use data loaders or data samplers but note that these tools can be used to generate both. Listing&#x20;3 is a sample code snippet for data movement from data engineering workload to data analytics workload.<list list-type="simple">
<list-item>
<p>
<bold>Listing 3.</bold> Stage 3: Moving Data from Data Engineering Workload to Data Analytics Workload</p>
</list-item>
</list>
</p>
<p>
<monospace>data_ar: np.ndarray &#x3d; feature_df.to_numpy()</monospace>
</p>
<p>
<monospace>df_ftrs: np.ndarray &#x3d; data_ar[:, 0:3]</monospace>
</p>
<p>
<monospace>df_lrnr: np.ndarray &#x3d; data_ar[:, 3:4]</monospace>
</p>
<p>
<monospace>x_train, y_train &#x3d; df_ftrs[0:100], df_lrnr[0:100]</monospace>
</p>
<p>
<monospace>x_test, y_test &#x3d; df_ftrs[100:], df_lrnr[100:]</monospace>
</p>
<p>
<monospace>&#x2026;</monospace>
</p>
<p>
<monospace>x_train &#x3d; torch.from_numpy(x_train).to(device)</monospace>
</p>
<p>
<monospace>y_train &#x3d; torch.from_numpy(y_train).to(device)</monospace>
</p>
<p>
<monospace>x_test &#x3d; torch.from_numpy(x_test).to(device)</monospace>
</p>
<p>
<monospace>y_test &#x3d; torch.from_numpy(y_test).to(device)</monospace>
</p>
<p>In Stage 4, we initialize the deep learning model and the DDP model using the sequential model. We pass along device information such that tensors and models are copied to the corresponding devices (if accelerators are involved) for training and testing. This initialization part varies from framework to framework depending on the requirements and APIs. Listing&#x20;4 highlights the initialization of a DDP model with PyTorch.<list list-type="simple">
<list-item>
<p>
<bold>Listing 4.</bold> Stage 4: PyTorch Distributed Data Analytics Workload</p>
</list-item>
</list>
</p>
<p>
<monospace>model &#x3d; Network().to(device)</monospace>
</p>
<p>
<monospace>ddp_model &#x3d; DDP(model, device_ids&#x3d;[device])</monospace>
</p>
<p>
<monospace>loss_fn &#x3d; nn.MSELoss()</monospace>
</p>
<p>
<monospace>optimizer &#x3d; optim.SGD(ddp_model.parameters(), lr&#x3d;0.01)</monospace>
</p>
<p>
<monospace>optimizer.zero_grad()</monospace>
</p>
<p>
<monospace>
<bold>for</bold> t <bold>in range</bold>(epochs)<bold>:</bold>
</monospace>
</p>
<p>
<monospace>
<bold>for</bold> x_batch, y_batch <bold>in zip</bold>(x_train, y_train):</monospace>
</p>
<p>
<monospace>prediction &#x3d; ddp_model(x_batch)</monospace>
</p>
<p>
<monospace>loss &#x3d; loss_fn(prediction, y_batch)</monospace>
</p>
<p>
<monospace>loss.backward()</monospace>
</p>
<p>
<monospace>optimizer.step()</monospace>
</p>
<p>
<monospace>optimizer.zero_grad()</monospace>
</p>
<sec id="s3-3-1">
<title>3.3.1 Horovod With PyTorch</title>
<p>Horovod PyTorch provides the ability to scale on both GPUs and CPUs with a unified API. This is significant because PyTorch does not need to be compiled from the source to get MPI capability. Horovod has already offloaded the distributed trainer, optimizer, and allreduce communication packages. The internal DDP mechanism that does this in PyTorch is offloaded.</p>
<p>In Stage 1, the Horovod init method must be called to initialize the environment. After that, the Cylon context can be initialized with distributed runtime true. If GPUs are used, the correct device must be set to PyTorch CUDA configs. To obtain the device IDs, we can either use the rank from Horovod initialization or PyCylon initialization. Still, at the moment, Horovod supports local rank as well, and it is more suitable in terms of effortlessly integrating with the distributed runtime for Horovod-PyTorch. Listing&#x20;5 shows a sample code snippet demonstrating how this is accomplished.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Integrating data engineering workload with data analytics workload.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g005.tif"/>
</fig>
<p>
<list list-type="simple">
<list-item>
<p>
<bold>Listing 5.</bold> Stage 1: Initialization for Horovod-PyTorch With PyCylon</p>
</list-item>
</list>
</p>
<p>
<monospace>hvd.init()</monospace>
</p>
<p>
<monospace>mpi_config &#x3d; MPIConfig()</monospace>
</p>
<p>
<monospace>env &#x3d; CylonEnv(config&#x3d;mpi_config, distributed&#x3d;True)</monospace>
</p>
<p>
<monospace>rank &#x3d; env.rank</monospace>
</p>
<p>
<monospace>cuda_available &#x3d; torch.cuda.is_available()</monospace>
</p>
<p>
<monospace>device &#x3d; &#x27;cuda:&#x27; &#x2b; <bold>str</bold>(rank) <bold>if</bold> cuda_available <bold>else</bold> &#x27;cpu&#x27;</monospace>
</p>
<p>
<monospace>
<bold>if</bold> cuda_available:</monospace>
</p>
<p>
<monospace>&#x23; <italic>Horovod: pin GPU to local rank.</italic>
</monospace>
</p>
<p>
<monospace>torch.cuda.set_device(hvd.local_rank())</monospace>
</p>
<p>
<monospace>torch.cuda.manual_seed(42)</monospace>
</p>
<p>Another essential thing to note is that the data engineering code remains the same for any deep learning framework discussed in this context. Also, as with the PyTorch data engineering section, the output can be converted to a Numpy array using the endpoints from the PyCylon dataframe. Also, the tensors can be created by providing the device IDs obtained from the Horovod runtime, and data can be prepared for a deep learning workload.</p>
<p>In Stage 4, following the tensor creation step, the Horovod-related initialization must be done to prepare the optimizers, network and other utils for distributed training. In the PyTorch-Horovod integration, PyTorch&#x2019;s default neural network model, loss function, and optimizer can be used as input to the distributed computation-enabled Horovod components. First, the model parameters and optimizer must be broadcast using the Horovod broadcast method from 0<sup>
<italic>th</italic>
</sup> rank. There are two method calls designated for initial network values and optimizer values. Also, Horovod provides a compression algorithm to select whether compression is required for distributed communication. After these steps, the distributed optimizer must be set by passing the initialized values. Listing&#x20;6 includes a sample code snippet to initialize the Horovod components for distributed data-parallel deep learning with PyTorch.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Response block module.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g006.tif"/>
</fig>
<p>
<list list-type="simple">
<list-item>
<p>
<bold>Listing 6.</bold> Stage 4: Distributed Data Analytics PyTorch-Horovod Workload</p>
</list-item>
</list>
</p>
<p>
<monospace>optimizer &#x3d; optim.SGD(...)</monospace>
</p>
<p>
<monospace>hvd.broadcast_parameters(model.state_dict(), root_rank&#x3d;0)</monospace>
</p>
<p>
<monospace>hvd.broadcast_optimizer_state(optimizer, root_rank&#x3d;0)</monospace>
</p>
<p>
<monospace>compression &#x3d; hvd.Compression.fp16</monospace>
</p>
<p>
<monospace>model_ps &#x3d; model.named_parameters()</monospace>
</p>
<p>
<monospace>optimizer &#x3d; hvd.DistributedOptimizer(optimizer, named_parameters&#x3d;model_ps, compression&#x3d;compression, op&#x3d;hvd.Adasum, gradient_predivide_factor&#x3d;1.0)</monospace>
</p>
</sec>
<sec id="s3-3-2">
<title>3.3.2 Horovod With Tensorflow</title>
<p>Similar to PyTorch integration, Horovod also supports Tensorflow. Tensorflow has its own distributed training platform. It contains distributed mirrored strategy as the equivalent routine for distributed data-parallel training. To start this run, we initialize Horovod and PyCylon. As with PyTorch, we also need to decide how the device is selected depending on the accelerator. The Tensorflow config API provides a listing of GPUs, and this information is added to the Tensorflow configurations to make all the GPU devices available. Listing&#x20;7 is a code snippet for the aforementioned initialization.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Response network.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g007.tif"/>
</fig>
<p>
<list list-type="simple">
<list-item>
<p>
<bold>Listing 7.</bold> Stage 1: Initialization for Tensorflow With PyCylon</p>
</list-item>
</list>
</p>
<p>
<monospace>hvd.init()</monospace>
</p>
<p>
<monospace>assert hvd.mpi_threads_supported()</monospace>
</p>
<p>
<monospace>mpi_config &#x3d; MPIConfig()</monospace>
</p>
<p>
<monospace>env &#x3d; CylonEnv(config&#x3d;mpi_config, distributed&#x3d;True)</monospace>
</p>
<p>
<monospace>rank &#x3d; env.rank</monospace>
</p>
<p>
<monospace>world_size &#x3d; env.world_size</monospace>
</p>
<p>
<monospace>gpus &#x3d; tf.config.experimental.list_physical_devices(&#x27;GPU&#x27;)</monospace>
</p>
<p>
<monospace>
<bold>for</bold> gpu <bold>in</bold> gpus:</monospace>
</p>
<p>
<monospace>tf.config.experimental.set_memory_growth(gpu, True)</monospace>
</p>
<p>
<monospace>
<bold>if</bold> gpus:</monospace>
</p>
<p>
<monospace>tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], &#x27;GPU&#x27;)</monospace>
</p>
<p>Similar to prior experience, the data engineering component also remains unchanged for Horovod-Tensorflow integration. The data analytics data structure creation is different from framework to framework. Tensorflow has its own set of APIs to make these steps simpler and more structured. The Tensorflow dataset API can be used to create tensors from Numpy arrays, and this API can be used to shuffle and create mini-batches, as expected by the deep learning workload. Listing&#x20;8 contains a code snippet detailing this step.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Drug response data processing.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g008.tif"/>
</fig>
<p>
<list list-type="simple">
<list-item>
<p>
<bold>Listing 8.</bold> Stage 3: Moving Data from Data Engineering Workload to Data Analytics Workload</p>
</list-item>
</list>
</p>
<p>
<monospace>&#x2026;</monospace>
</p>
<p>
<monospace>train_dataset &#x3d; tf.data.Dataset.from_tensor_slices((x_train, y_train))</monospace>
</p>
<p>
<monospace>test_dataset &#x3d; tf.data.Dataset.from_tensor_slices((x_test, y_test))</monospace>
</p>
<p>
<monospace>BATCH_SIZE &#x3d; 64</monospace>
</p>
<p>
<monospace>SHUFFLE_BUFFER_SIZE &#x3d; 100</monospace>
</p>
<p>
<monospace>train_dataset &#x3d; train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)</monospace>
</p>
<p>
<monospace>test_dataset &#x3d; test_dataset.batch(BATCH_SIZE)</monospace>
</p>
<p>
<monospace>&#x2026;</monospace>
</p>
<p>Horovod-Tensorflow also requires a set of initialization steps to train a Tensorflow deep learning model. Like PyTorch, the Tensorflow loss function, optimization function and neural network model are compatible with Tensorflow-Horovod internals. The gradient tape from Tensorflow autograd can be used, and for this, Horovod provides a DistributedGradientTape operator, which takes the gradient tape instance as a parameter. In addition, before training, this DistributedGradientTape must be initialized with the model parameters and loss function, and the optimizer values must be set to initial values. Again, the model parameters and optimizer values must be broadcast using designated Horovod broadcast functions. Listing&#x20;9 illustrates this.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Drug feature data processing.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g009.tif"/>
</fig>
<p>
<list list-type="simple">
<list-item>
<p>
<bold>Listing 9.</bold> Stage 4: Distributed Data Analytics Horovod-Tensorflow Workload</p>
</list-item>
</list>
</p>
<p>
<monospace>model &#x3d; tf.keras.Sequential(&#x2026; )</monospace>
</p>
<p>
<monospace>loss &#x3d; tf.losses.MeanSquaredError()</monospace>
</p>
<p>
<monospace>opt &#x3d; tf.optimizers.Adam(0.001 &#x2a; hvd.size())</monospace>
</p>
<p>
<monospace>@tf.function</monospace>
</p>
<p>
<monospace>
<bold>def</bold> training_step(images, labels, first_batch):</monospace>
</p>
<p>
<monospace>with tf.GradientTape() as tape:</monospace>
</p>
<p>
<monospace>probs &#x3d; model(images, training&#x3d;True)</monospace>
</p>
<p>
<monospace>loss_value &#x3d; loss(labels, probs)</monospace>
</p>
<p>
<monospace>tape &#x3d; hvd.DistributedGradientTape(tape)</monospace>
</p>
<p>
<monospace>grads &#x3d; tape.gradient(loss_value, model.trainable_variables)</monospace>
</p>
<p>
<monospace>opt.apply_gradients(<bold>zip</bold>(grads, model.trainable_variables))</monospace>
</p>
<p>
<monospace>
<bold>if</bold> first_batch:</monospace>
</p>
<p>
<monospace>hvd.broadcast_variables(model.variables, root_rank&#x3d;0)</monospace>
</p>
<p>
<monospace>hvd.broadcast_variables(opt.variables(), root_rank&#x3d;0)</monospace>
</p>
<p>
<monospace>
<bold>return</bold> loss_value</monospace>
</p>
</sec>
</sec>
</sec>
<sec id="s4">
<title>4 UNOMT Application</title>
<p>To demonstrate an end-to-end <italic>HPTMT</italic> architecture, we implemented a scientific application with a workload containing data engineering and data science computations. Our objective is to showcase how a sequential workload can be designed in a distributed manner using PyCylon and run a deep learning workload seamlessly on only a single script with a unified runtime. For this, we selected an application that uses Pandas dataframe for data engineering and PyTorch for data analytics. The original application is sequentially executed, and we have implemented a parallel version of this application with PyCylon and distributed PyTorch.</p>
<sec id="s4-1">
<title>4.1 Background</title>
<p>The UNOMT application is part of the CANDLE (<xref ref-type="bibr" rid="B37">Wozniak et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B38">Xia et&#x20;al., 2021</xref>) research conducted by Argonne National Laboratory, focusing on automated detection of tumour cells using a deep learning approach. The uniqueness of this approach is the composition of a data engineering workload followed by a deep learning workload written in PyTorch. This provides an ideal scientific experiment to showcase multiple systems working together to facilitate an efficient data pipeline. The goal of the UNOMT application is to give a cross-comparison of cancer studies and integrate it into a unified drug response model. Cell RNA sequences, drug descriptors and drug fingerprints are used as such responses to train the&#x20;model.</p>
<p>In the deep learning component, multiple networks are involved working on small and large datasets in the training process. Our research focuses on the more extensive network designed to calculate the drug response based on the cell-line information.</p>
</sec>
<sec id="s4-2">
<title>4.2 Deep Learning Component</title>
<p>UNOMT refers to a unified deep learning model with multi-tasks to predict drug response as a function of tumour and drug features for personalized cancer treatment. Precision oncology focuses on providing medicines for specific characteristics of a patient&#x2019;s tumour. The drug sensitivity is quantified by drug dose-response values which measure the ratio of treated to untreated cells after treatment with a specific drug concentration. In this application, a set of drug data obtained from the NCI60 human tumour cell line database (<xref ref-type="bibr" rid="B32">Shoemaker, 2006</xref>) is used to predict the drug response by considering gene expression, protein and microRNA abundance. As per the contemplated scope, the UNOMT application we focus on in the study is conducted on single-drug response prediction using NCI60 and gCSI datasets. We used 1006 drugs from the NCI60 database for this evaluation and gCSI for the cross-validation. The original application runs sequentially, and our contribution is providing a parallelized runtime for data engineering and running the deep learning workload alongside&#x20;it.</p>
<p>The drug response model contains a dense input layer of shape 1537 to get the concatenated results of the gene network and the drug network response along with the concentration value. Within the drug response regression network, there is another residual block being used repeatedly. This layer is called the drug response block module, which contains two dense layers followed by a dropout layer and a ReLU activation layer. <xref ref-type="fig" rid="F6">Figure&#x20;6</xref> depicts the response block module.</p>
<p>Residual blocks are stacked, and a set of dense layers are as well. Finally, the regression layer contains a single output dense layer. The number of response blocks, and the number of dense layers that follow them, can be customized dynamically. All these parameters can be provided as hyper-parameters in the application configuration file. <xref ref-type="fig" rid="F7">Figure&#x20;7</xref> shows the drug response regression network.</p>
<p>This network is trained in a distributed data-parallel model since it contains a large dataset and a complex network compared to the other examples. The corresponding data engineering component is also distributed data-parallel, which is discussed in detail in <xref ref-type="sec" rid="s4-3">Section&#x20;4.3</xref>.</p>
</sec>
<sec id="s4-3">
<title>4.3 Data Engineering Component</title>
<p>UNOMT application uses 2.5 million samples of cancer data across six research centres. This model analyses the study bias across these samples to design a unified drug response model. Before building this model, the application consists of a data engineering workload written in Pandas. The application consists of a few data engineering operators: concat (inner-join), to_csv, rename, read_csv, astype, set_index, map, isnull, drop, filter, add_prefix, reset_index, drop_duplicates, not_null, isin and dropna.</p>
<p>The existing data engineering workload is written in Pandas and does not run in parallel. We re-engineered this application to a parallel data engineering workload. We designed a seamless integration between data analysis and data engineering workload consuming state-of-the-art high-performance computing resources. We also integrated a Modin-based implementation to showcase the performance comparison with our implementation. The data engineering workload is executed in CPU-based distributed memory, and the data analytical workload can be run on either CPU or GPU. We use PyTorch for the data analytics workload and extend it to PyTorch distributed data-parallel training. Our objective is to integrate an HPC-based full stack of data analytics-aware data engineering for scalability. PyCylon only supports this feature at the moment. Also, we stress the importance of designing a BSP-based model for deep learning workloads associated with data engineering components for better performance and scalability in HPC hardware.</p>
<p>The data analytics component requires a set of features to be engineered from the raw data. Here, three primary datasets are necessary to create the complete dataset for the drug response model. <xref ref-type="fig" rid="F8">Figure&#x20;8</xref> refers to the primary dataset, which contains the drug response. The raw dataset possesses additional features, so the data is loaded in the initial stage, and a column filtering operation selects the expected features. Then a map operation is performed to preprocess a drug ID column to remove symbols from the columns and create a consistent drug ID. Once the data are cleaned, they are scaled with the Scikit-learn preprocessing library for scaling numerical values. After this, the data are fully converted into a numeric type to provide numeric tensors for the deep learning workload. In the parallel mode, we partition this dataset with the set parallelism, upon which it is passed to the corresponding operators.</p>
<p>To formulate the global dataset, we require two other datasets which act as metadata to filter and process the primary drug response dataset. The first is the drug feature raw dataset, which contains drug features required to be located in the drug response data. Two sub-datasets contribute to formulating the drug feature dataset. We merge them by performing an inner join on the dataset based on the index formed on the drug IDs. After that, we cast the data into numeric types and output them as a numeric array which is later converted to a numeric tensor for deep learning. This data processing workflow is shown in <xref ref-type="fig" rid="F9">Figure&#x20;9</xref>.</p>
<p>The other dataset required is the RNA sequence dataset containing information about RNA sequences. Here the dataset is first processed to remove specific symbols by a map operation, and then duplicate records are dropped by a drop duplicate operator. Then an index is set for this dataset, and later on, scaling is done on the numeric data using the Scikit-Learn preprocessing library. Finally, the data is cast to a numeric type, and preprocessed RNA-sequence data are formulated as a Numpy array, which is later converted into a numeric tensor for the deep learning workload. This data processing pipeline can be found in <xref ref-type="fig" rid="F10">Figure&#x20;10</xref>.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>RNA sequence data processing.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g010.tif"/>
</fig>
<p>Once the drug response data, drug feature data and RNA-sequence data are preprocessed, the final data for the drug response model is engineered as shown in <xref ref-type="fig" rid="F11">Figure&#x20;11</xref>. The processed drug response data are further feature-selected, and a unique operation is applied. Then the RNA sequence data is filtered by checking whether specific drug-related RNA sequences are present. The same is done for the drug feature dataset. These two operations are done by the isin operator. Afterwards, the common drug set is selected by performing an and operation, and later these common drug-related drug response data filters are used to get the final drug response&#x20;data.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Drug response overall data processing.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g011.tif"/>
</fig>
<p>Among the operators applied, since we partitioned the data, each data engineering operator can work independently in a pleasingly parallel manner. But we can rely on the distributed unique operator to ensure no duplicate records are used for deep learning across all processes. Note that the data engineering component of this application is feature engineering metadata, and we use them to filter an extensive dataset converted to formulate the expected input for the drug response&#x20;model.</p>
</sec>
</sec>
<sec id="s5">
<title>5 Performance Evaluation</title>
<p>The original UNOMT application was a single-threaded application implemented on Pandas for data engineering and PyTorch for deep learning. Our first goal was to implement the sequential version of the application and improve the sequential performance. After the first stage, we conducted distributed experiments to see how we could scale our workload on CPUs for data engineering. We also extended the deep learning component of this application by integrating with PyTorch distributed execution framework on both CPUs and GPUs using MPI and NCCL, respectively. Our goal was to seamlessly incorporate a deep learning-aware data engineering workload using a single Python data engineering and deep learning script with a single runtime in this benchmark. Also, note that we used the drug response network-related more extensive data distribution for the application benchmark. At the same time, the smaller networks require a much shorter execution time than this larger&#x20;model.</p>
<sec id="s5-1">
<title>5.1 Setup</title>
<p>For the experiments, we had two sets of clusters for CPUs and GPUs. The Victor cluster of Future Systems was used with six nodes and 16 processes per node at the maximum parallelism for CPUs. Each node of this cluster contains an Intel(R) Xeon(R) Platinum 8160 CPU @ 2.10&#xa0;GHz. The GPU experiments used Tesla K80s with 8 GPU devices on Google Cloud Platform. For single-node single-process executions, we used the same Victor nodes. Pandas, PyCylon (single-core) and Modin (single-core) were deployed for the sequential performance comparisons. Finally, for the distributed performance comparisons, we used PyCylon and Modin on single node multi-core scaling. We selected Modin instead of Dask because it is closer to the data engineering stack proposed by PyCylon due to eager execution and the ability to convert an existing Pandas data engineering workload in a straightforward manner.</p>
</sec>
<sec id="s5-2">
<title>5.2 Sequential Execution Performance</title>
<p>We first conducted experiments to evaluate the proposed systems&#x2019; single process execution, PyCylon, Modin, and Pandas. Modin provides the ability to convert a Pandas data engineering workload utilizing a single line of code. In contrast, PyCylon offers a dynamic API allowing the user to dynamically decide the nature of sequential and parallel operators. We evaluated the data engineering performance for the drug response data preprocessing workload used for the drug response regression network. <xref ref-type="fig" rid="F12">Figure&#x20;12</xref> has the single-core performance for the aforementioned data engineering workload. We observe that the sequential performance of PyCylon and Pandas are very similar, while Modin is much slower. These measurements include data loading efficiency plus overall operator performance improvements. But in a general way, Pandas and PyCylon have almost similar performance in most operators except for data loading, duplicate handling, null handling and search operations involved in this application. Note that both PyCylon and Modin are evolving data engineering frameworks to support data engineering on tabular&#x20;data.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Sequential data engineering.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g012.tif"/>
</fig>
<p>This section of the application entails data normalization, parameterized data partition, and statistical data processing with third-party Python libraries like Scikit-Learn. These libraries integrate with Pandas dataframe seamlessly. Since PyCylon employs zero-copy conversion to and from Pandas, such third-party libraries can be easily integrated without a performance penalty. But for Modin, it cannot go back-and-forth between the Pandas data structure. This caused some of these operations to be relatively slower for Modin, compared to Pandas and PyCylon. This shows that we have to go beyond the dataframe construct and integrate with third-party libraries in implementing real-world applications. And to integrate with such libraries, data engineering frameworks must be very well designed with widely used data structures used by data scientists.</p>
</sec>
<sec id="s5-3">
<title>5.3 Distributed Execution Performance</title>
<p>UNOMT computation can be distributed in a data-parallel setting. For the distributed implementation, the sequential scripts were ported to distributed <italic>HPTMT</italic> (PyCylon) operators. We initially evaluated the performance for a single-node multi-core setup. <xref ref-type="fig" rid="F13">Figure&#x20;13</xref> shows the results for that application. These results show that PyCylon scales well compared to Modin in the distributed data-parallel setting.</p>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>Multi-core data parallel data engineering performance.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g013.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F14">Figure&#x20;14</xref> depicts the relative speed-up for each framework. We observe that PyCylon has a much higher relative speed-up compared to Modin. We observed similar scaling results when comparing the performance of the distributed join operation. A reasonable conclusion drawn from these results is that data engineering applications could greatly benefit from employing the <italic>HPTMT</italic> architecture.</p>
<fig id="F14" position="float">
<label>FIGURE 14</label>
<caption>
<p>Multi-core data parallel data engineering speed-up.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g014.tif"/>
</fig>
<p>We extended the distributed experiments further for multi-node multi-core. We observed that Modin failed to scale beyond a single node and failed in the cluster set-up. This could be due to a lack of documentation or an issue with the distributed framework Modin uses. Modin doesn&#x2019;t contain its own distributed runtime but relies on Ray or Dask. PyCylon, in contrast, is conveniently distributed in a multi-node BSP (e.g., MPI) execution environment. The distributed data engineering performance for PyCylon is shown in <xref ref-type="fig" rid="F15">Figure&#x20;15</xref>.</p>
<fig id="F15" position="float">
<label>FIGURE 15</label>
<caption>
<p>PyCylon multi-node multi-core distributed data parallel data engineering.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g015.tif"/>
</fig>
</sec>
<sec id="s5-4">
<title>5.4 Deep Learning Execution</title>
<p>The deep learning experiments extend the data engineering runs, using not only CPUs but both CPUs and GPUs. As indicated previously, the deep learning computation is also built on a distributed data-parallel (DDP) setting. For these experiments, we used the same number of processes for both data engineering and deep learning. But PyCylon can be further improved to run in many-to-many process mapping for more complex data-parallel executions.</p>
<p>We selected PyTorch distributed communication framework with MPI for CPUs and NCCL for GPUs for the data analytics scaling experiments. The single process experiment results are similar for PyCylon and Pandas, and both have the same PyTorch codebase. Furthermore, all the data were in memory before the deep learning workload, so there was no overhead in loading data to create mini-batches. The CPU-based DDP experiments scaled well across multi-nodes, but we observed a slight memory overhead, causing the application to scale below the ideal point. We completed more experiments to evaluate an overhead from the data engineering framework, but we observed no significant overheads hindering the scalability on CPUs. <xref ref-type="fig" rid="F16">Figure&#x20;16</xref> highlights the average computation and communication time spent per epoch as we add more resources to the setup. We used a locally built PyTorch binary with MPI execution. One significant factor is that PyTorch becomes an ideal distributed computation deep learning framework for PyCylon since PyCylon also supports an MPI backend for distributed computation.</p>
<fig id="F16" position="float">
<label>FIGURE 16</label>
<caption>
<p>Distributed data parallel deep learning on CPU.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g016.tif"/>
</fig>
<p>The GPU-based DDP experiments were handled with a single-node multi-GPU experiment setting to see how the data analytics workload could be scaled on the NCCL execution framework with PyTorch. <xref ref-type="fig" rid="F17">Figure&#x20;17</xref> displays the results for single GPU and multi-GPU experiments. We observed that the execution time was dominated by the communication time. With the increase of parallelism, the number of communications across devices increases, but the number of batches that have to be sent across devices decreases. This gives an advantage in scaling. When we considered the computation time, we saw that scaling happens closer to the ideal scaling point in all parallel settings. In addition, the computation is much faster in Parallelism 2 than in Parallelism 1, where the memory overhead is 50% less than the sequential execution. When considering CPU vs GPU performance for the deep learning workload, the speed-up from GPUs is 2x compared to CPUs in this network.</p>
<fig id="F17" position="float">
<label>FIGURE 17</label>
<caption>
<p>Distributed data parallel deep learning on GPU.</p>
</caption>
<graphic xlink:href="fdata-04-756041-g017.tif"/>
</fig>
</sec>
</sec>
<sec id="s6">
<title>6 Related Work</title>
<p>There are many efforts to build efficient distributed operators for data science and data engineering. Frameworks like Apache Spark (<xref ref-type="bibr" rid="B40">Zaharia et&#x20;al., 2016</xref>), Apache Flink (<xref ref-type="bibr" rid="B10">Carbone et&#x20;al., 2015</xref>) and Map Reduce (<xref ref-type="bibr" rid="B14">Dean and Ghemawat, 2008</xref>) are legacy systems created for data engineering. And many programming models have been developed on top of these big data systems to facilitate data analysis (<xref ref-type="bibr" rid="B8">Belcastro et&#x20;al., 2019</xref>). Later on, these systems adopted the data analytics domain under their umbrella of big data problems. But with the emerging requirement for high-performance computing for data science and data engineering, the existing parallel operators in these frameworks don&#x2019;t provide adequate performance or flexibility (<xref ref-type="bibr" rid="B18">Elshawi et&#x20;al., 2018</xref>). Frameworks like Pandas <xref ref-type="bibr" rid="B27">McKinney (2011)</xref> gained more popularity in the data science community because of their usability. Pandas only provides serial execution, and Dask (<xref ref-type="bibr" rid="B30">Rocklin, 2015</xref>) uses it internally (parallel Pandas) to provide parallel operators. Also, it was re-engineered as Modin (<xref ref-type="bibr" rid="B29">Petersohn et&#x20;al., 2020</xref>) to run the dataframe operators in parallel. But these efforts are mainly focused on a driver-based asynchronous execution model, a well-known bottleneck for distributed applications.</p>
<p>The majority of the data analytics workloads tend to use data-parallel execution or bulk synchronous parallel (loosely synchronous) mode. This idea originated in 1987 from Fox, G.C. in the article &#x201c;What Have We Learnt from Using Real Parallel Machines to Solve Real Problems&#x201d; <xref ref-type="bibr" rid="B20">Fox (1989)</xref>. Later, a similar idea was published in an article by Valiant, L <xref ref-type="bibr" rid="B33">Valiant (1990)</xref> in 1990 which introduced the term &#x201c;<italic>Bulk Synchronous Parallel</italic>&#x201d;. Frameworks like PyTorch (<xref ref-type="bibr" rid="B28">Paszke et&#x20;al., 2019</xref>) adopted this HPC philosophy, and distributed runtimes like Horovod (<xref ref-type="bibr" rid="B31">Sergeev and Del Balso, 2018</xref>) generalized this practice for most of the existing deep learning frameworks. Around the same time they were adopting this philosophy, HPC-driven big data systems like Twister2 (<xref ref-type="bibr" rid="B19">Fox, 2017</xref>; <xref ref-type="bibr" rid="B1">Abeykoon et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B34">Wickramasinghe et&#x20;al., 2019</xref>) were created to bridge the gap between data engineering and deep learning. But because of the language boundaries of Java (<xref ref-type="bibr" rid="B17">Ekanayake et&#x20;al., 2016</xref>) and usability concerns, native-C&#x2b;&#x2b; based Python implementations were favoured over JVM-based systems. PyCylon (<xref ref-type="bibr" rid="B2">Abeykoon et&#x20;al., 2020</xref>) dataframes for distributed CPU computation and Cudf (<xref ref-type="bibr" rid="B22">Hern&#xe1;ndez et&#x20;al., 2020</xref>) dataframes for distributed GPU computation were designed. The seamless integration of data engineering and deep learning became possible with such frameworks, and nowadays they are widely used in the data science and data engineering sphere to do rapid prototyping and design production-friendly applications.</p>
</sec>
<sec id="s7">
<title>7 Limitations and Future Work</title>
<p>As showcased in <xref ref-type="sec" rid="s5">Section 5</xref>, the <italic>HPTMT</italic> Model scales well in a distributed environment using BSP execution. This requires dedicated resource allocation. Thus it does not support dynamic auto-scaling, which may be an important aspect in a multi-tenant cloud environment. Most client-server (fully asynchronous) frameworks, such as Dask and Spark, provide the ability to allocate new resources without interrupting the current job. Even with a process-memory snapshot mechanism, the system comes to a pause and will be restarted with the new processes. Furthermore, fault-tolerance is another useful aspect in a cloud setup. Even though cloud hardware is becoming increasingly cheaper, more reliable, and widely available, we believe that dynamic scaling and fault tolerance would be important, and those would be incorporated with <italic>HPTMT</italic> and Cylon in the future.</p>
<p>Our next focus is to provide an enhanced set of collective communication operators on the tabular level to data science/engineering application developers. The main future objective is to provide a set of advanced APIs for the data science application developers to design complex data engineering applications with a high performance toolkit in hand with much better usability.</p>
<p>Furthermore, we believe that both BSP and fully asynchronous execution models are important for complex data engineering pipelines. We are currently integrating an asynchronous execution model into <italic>HPTMT</italic> and Cylon, using workflow management concepts. This would enable creating individual data engineering workflows that run on BSP, while each of these workflows is scheduled asynchronously. We believe such a system would optimize resource allocation without hindering the overall performance.</p>
</sec>
<sec sec-type="conclusion" id="s8">
<title>8 Conclusion</title>
<p>In this paper, we proposed the <italic>HPTMT</italic> architecture that defines an operator and execution model for scaling data-intensive applications. We showcased the applicability of this architecture in an end-to-end application using the Cylon framework, where data engineering and deep learning operators work together in a single distributed program. We believe that it is important to formulate and clearly define the core concepts used in developing Cylon, which could help in building highly scalable big-data applications in the future. <italic>HPTMT</italic> multi-process experiments&#x2019; results show how well the proposed system architecture can scale compared to the existing systems with the non-synchronous mode of computation. Also, the parallel performance gain ratio is 6:1 in favor of the proposed system. This highlighted the importance of <italic>HPTMT</italic> based distributed and local operators on a different data structure that can work together in a single program. Further, the <italic>HPTMT</italic> style operators are more efficient in executing at scale, due to their loosely synchronous nature and low scheduling/coordination overhead. With the future work proposed for the architecture, we believe that we can elevate Cylon to be a truly high-performance data engineering framework built for the future.</p>
</sec>
</body>
<back>
<sec id="s9">
<title>Data Availability Statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://github.com/cylondata/cylon">https://github.com/cylondata/cylon</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://github.com/ECP-CANDLE/Benchmarks/tree/master/Pilot1/UnoMT">https://github.com/ECP-CANDLE/Benchmarks/tree/master/Pilot1/UnoMT</ext-link>.</p>
</sec>
<sec id="s10">
<title>Author Contributions</title>
<p>All authors listed have made a substantial, direct, and intellectual contribution to the work and approved it for publication.</p>
</sec>
<sec id="s11">
<title>Funding</title>
<p>This work is partially supported by the National Science Foundation (NSF) through awards CIF21 DIBBS 1443054, SciDatBench 2038007, CINES 1835598 and Global Pervasive Computational Epidemiology 1918626.</p>
</sec>
<sec sec-type="COI-statement" id="s12">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s14">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdata.2021.756041/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdata.2021.756041/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Image1.TIFF" id="SM1" mimetype="application/TIFF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image2.JPEG" id="SM2" mimetype="application/JPEG" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Abeykoon</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kamburugamuve</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Govindrarajan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wickramasinghe</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Widanage</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Perera</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Streaming Machine Learning Algorithms with Big Data Systems</article-title>,&#x201d; in <conf-name>2019 IEEE International Conference on Big Data (Big Data)</conf-name>, <conf-loc>Los Angeles, CA, USA</conf-loc> (<publisher-name>IEEE</publisher-name>), <fpage>5661</fpage>&#x2013;<lpage>5666</lpage>. <pub-id pub-id-type="doi">10.1109/bigdata47090.2019.9006337</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Abeykoon</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Perera</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Widanage</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kamburugamuve</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kanewala</surname>
<given-names>T. A.</given-names>
</name>
<name>
<surname>Maithree</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Data Engineering for Hpc with python</article-title>,&#x201d; in <conf-name>2020 IEEE/ACM 9th Workshop on Python for High-Performance and Scientific Computing (PyHPC)</conf-name>, <conf-loc>Atlanta, GA, USA</conf-loc> (<publisher-name>IEEE</publisher-name>), <fpage>13</fpage>&#x2013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1109/pyhpc51966.2020.00007</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Allen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Chase</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hallett</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Luchangco</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Maessen</surname>
<given-names>J.-W.</given-names>
</name>
<name>
<surname>Ryu</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>The Fortress Language Specification</article-title>. <source>Sun Microsystems</source> <volume>139</volume>, <fpage>116</fpage>. </citation>
</ref>
<ref id="B4">
<citation citation-type="web">
<collab>Apache Arrow</collab> (<year>2021</year>). <article-title>Apache Software Foundation (Accessed 2021/Aug)</article-title>. <comment>Apache arrow</comment>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arrow.apache.org/">https://arrow.apache.org/</ext-link>
</comment>(<comment>Accessed Aug 08, 2021)</comment>. </citation>
</ref>
<ref id="B5">
<citation citation-type="web">
<collab>Apache Parquet</collab> (<year>2021</year>). <article-title>Apache Software Foundation (Accessed 2021/Aug)</article-title>. <comment>Apache parquet project</comment>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://parquet.apache.org/">https://parquet.apache.org/</ext-link>
</comment>(<comment>Accessed Aug 08, 2021)</comment>. </citation>
</ref>
<ref id="B6">
<citation citation-type="web">
<collab>Argo Home Page</collab> (<year>2021</year>). <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://argoproj.github.io/argo-workflows/">https://argoproj.github.io/argo-workflows/</ext-link>
</comment>(<comment>Accessed Aug 08, 2021)</comment>.</citation>
</ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Babuji</surname>
<given-names>Y. N.</given-names>
</name>
<name>
<surname>Chard</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Foster</surname>
<given-names>I. T.</given-names>
</name>
<name>
<surname>Katz</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Wilde</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Woodard</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Scalable Parallel Programming in Python with Parsl</article-title>,&#x201d; in <conf-name>Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning) (PEARC &#x2019;19)</conf-name>, <conf-loc>Chicago, IL, USA</conf-loc>, (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1145/3332186.3332231</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Belcastro</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Marozzo</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Talia</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Programming Models and Systems for Big Data Analysis</article-title>. <source>Int. J.&#x20;Parallel, Emergent Distributed Syst.</source> <volume>34</volume>, <fpage>632</fpage>&#x2013;<lpage>652</lpage>. <pub-id pub-id-type="doi">10.1080/17445760.2017.1422501</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Burns</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grant</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Oppenheimer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Brewer</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Wilkes</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Borg, omega, and Kubernetes</article-title>. <source>Queue</source> <volume>14</volume>, <fpage>70</fpage>. <pub-id pub-id-type="doi">10.1145/2898442.2898444</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carbone</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ewen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Haridi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Katsifodimos</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Markl</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Tzoumas</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Apache Flink: Stream and Batch Processing in a Single Engine</article-title>,&#x201d; in <conf-name>Bulletin of the Technical Committee on Data Engineering, IEEE Computer Society Special Issue on Next-Generation Stream Processing</conf-name>, <conf-date>December 2015</conf-date> <volume>38</volume> (<issue>4</issue>). <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://sites.computer.org/debull/A15dec/p28.pdf">http://sites.computer.org/debull/A15dec/p28.pdf</ext-link>
</comment>. </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Carpenter</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Hpjava: Data Parallel Extensions to Java</article-title>. <source>Concurrency: Pract. Exper.</source> <volume>10</volume>, <fpage>873</fpage>&#x2013;<lpage>877</lpage>. <pub-id pub-id-type="doi">10.1002/(sici)1096-9128(199809/11)10:11/13&#x3c;873:aid-cpe402&#x3e;3.0.co;2-q</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chamberlain</surname>
<given-names>B. L.</given-names>
</name>
<name>
<surname>Callahan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zima</surname>
<given-names>H. P.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Parallel Programmability and the Chapel Language</article-title>. <source>Int. J.&#x20;High Perform. Comput. Appl.</source> <volume>21</volume>, <fpage>291</fpage>&#x2013;<lpage>312</lpage>. <pub-id pub-id-type="doi">10.1177/1094342007078442</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Charles</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Grothoff</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Saraswat</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Donawa</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kielstra</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ebcioglu</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>X10: an Object-Oriented Approach to Non-uniform Cluster Computing</article-title>. <source>Acm Sigplan Notices</source> <volume>40</volume>, <fpage>519</fpage>&#x2013;<lpage>538</lpage>. <pub-id pub-id-type="doi">10.1145/1103845.1094852</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ghemawat</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>MapReduce</article-title>. <source>Commun. ACM</source> <volume>51</volume>, <fpage>107</fpage>&#x2013;<lpage>113</lpage>. <pub-id pub-id-type="doi">10.1145/1327452.1327492</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deelman</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Vahi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Juve</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rynge</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Callaghan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Maechling</surname>
<given-names>P. J.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Pegasus, a Workflow Management System for Science Automation</article-title>. <source>Future Generation Comput. Syst.</source> <volume>46</volume>, <fpage>17</fpage>&#x2013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1016/j.future.2014.10.008</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dongarra</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Foster</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gropp</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kennedy</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Torczon</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2003</year>). <source>Sourcebook of Parallel Computing, 3003</source>. <publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers</publisher-name>. </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ekanayake</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kamburugamuve</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wickramasinghe</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>G. C.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Java Thread and Process Performance for Parallel Machine Learning on Multicore Hpc Clusters</article-title>,&#x201d; in <conf-name>2016 IEEE international conference on big data (Big Data)</conf-name>, <conf-loc>Washington, DC, USA</conf-loc> (<publisher-name>IEEE</publisher-name>), <fpage>347</fpage>&#x2013;<lpage>354</lpage>. <pub-id pub-id-type="doi">10.1109/bigdata.2016.7840622</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Elshawi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Sakr</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Talia</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Trunfio</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Big Data Systems Meet Machine Learning Challenges: Towards Big Data Science as a Service</article-title>. <source>Big Data Res.</source> <volume>14</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1016/j.bdr.2018.04.004</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fox</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Components and Rationale of a Big Data Toolkit Spanning HPC, Grid, Edge and Cloud Computing</article-title>,&#x201d; in <conf-name>Proceedings of the 10th International Conference on Utility and Cloud Computing</conf-name>, <conf-loc>Austin, TX, USA</conf-loc> (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1</fpage>. <comment>UCC &#x2019;17</comment>. <pub-id pub-id-type="doi">10.1145/3147213.3155012</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fox</surname>
<given-names>G. C.</given-names>
</name>
</person-group> (<year>1989</year>). &#x201c;<article-title>What Have We Learnt from Using Real Parallel Machines to Solve Real Problems</article-title>,&#x201d; in <conf-name>Proceedings of the third conference on Hypercube concurrent computers and applications-Volume 2</conf-name>, <conf-loc>Pasadena, CA, USA</conf-loc>, <fpage>897</fpage>&#x2013;<lpage>955</lpage>. <pub-id pub-id-type="doi">10.1145/63047.63048</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fox</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>R. D.</given-names>
</name>
<name>
<surname>Messina</surname>
<given-names>P. C.</given-names>
</name>
</person-group> (<year>1994</year>). <source>Parallel Computing Works!</source>. <publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers</publisher-name>. </citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hern&#xe1;ndez</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Somnath</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Eaton</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Entschev</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Performance Evaluation of Python Based Data Analytics Frameworks in Summit: Early Experiences</article-title>,&#x201d; in <conf-name>Driving Scientific and Engineering Discoveries Through the Convergence of HPC, Big Data and AI</conf-name>, <conf-loc>Oak Ridge, TN</conf-loc>, <comment>SMC 2020. Communications in Computer and Information Science</comment>. Editors <person-group person-group-type="editor">
<name>
<surname>Nichols</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Verastegui</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Maccabe</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hernandez</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Parete-Koon</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ahearn</surname>
<given-names>T.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>) <volume>Vol. 1315</volume>. <pub-id pub-id-type="doi">10.1007/978-3-030-63393-6_24</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Huai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chauhan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gates</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hagleitner</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hanson</surname>
<given-names>E. N.</given-names>
</name>
<name>
<surname>O&#x2019;Malley</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). &#x201c;<article-title>Major Technical Advancements in Apache Hive</article-title>,&#x201d; in <conf-name>Proceedings of the 2014 ACM SIGMOD international conference on Management of data</conf-name>, <conf-loc>Snowbird, UT, USA</conf-loc>, <fpage>1235</fpage>&#x2013;<lpage>1246</lpage>. <pub-id pub-id-type="doi">10.1145/2588555.2595630</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Imam</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sarkar</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Habanero-Java Library: a Java 8 Framework for Multicore Programming</article-title>,&#x201d; in <conf-name>Proceedings of the 2014 International Conference on Principles and Practices of Programming on the Java platform: Virtual machines, Languages, and Tools</conf-name>, <conf-loc>Cracow, Poland</conf-loc>, <fpage>75</fpage>&#x2013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1145/2647508.2647514</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kamburugamuve</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Widanage</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Perera</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Abeykoon</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Uyar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kanewala</surname>
<given-names>T. A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>HPTMT: Operator-Based Architecture for Scalable High-Performance Data-Intensive Frameworks</article-title>,&#x201d; in <conf-name>2021 IEEE 14th International Conference on Cloud Computing (CLOUD)</conf-name>, <conf-loc>Chicago, IL, USA</conf-loc>. <pub-id pub-id-type="doi">10.1109/cloud53861.2021.00036</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="web">
<collab>Kubeflow</collab> (<year>2021</year>). <article-title>Kubeflow home page</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.kubeflow.org/">https://www.kubeflow.org/</ext-link>
</comment>. </citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>McKinney</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>Pandas: A Foundational Python Library for Data Analysis and Statistics</article-title>,&#x201d; in <conf-name>Workshop collocated with the 24th International Conference for High Performance Computing, Networking, Storage and Analysis (SC11)</conf-name>, <conf-loc>Seattle, WA, USA</conf-loc>, <conf-date>November 18, 2011</conf-date> <volume>14</volume>
<comment>. Available at: <ext-link ext-link-type="uri" xlink:href="https://www.dlr.de/sc/en/Portaldata/15/Resources/dokumente/pyhpc2011/submissions/pyhpc2011_submission_9.pdf">https://www.dlr.de/sc/en/Portaldata/15/Resources/dokumente/pyhpc2011/submissions/pyhpc2011_submission_9.pdf</ext-link>
</comment> (<comment>Accessed Dec 23, 2021</comment>). </citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>PyTorch: An Imperative Style, High-Performance Deep Learning Library</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems 32 (NeurIPS 2019)</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Wallach</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>d&#x27;Alch&#xe9;-Buc</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Vancouver, Canada</publisher-loc>), <fpage>8026</fpage>&#x2013;<lpage>8037</lpage>. </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Petersohn</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Macke</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mo</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Towards Scalable Dataframe Systems</article-title>. <comment>arXiv preprint arXiv:2001.00888</comment>. </citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rocklin</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Dask: Parallel Computation with Blocked Algorithms and Task Scheduling</article-title>,&#x201d; in <conf-name>Proceedings of the 14th Python in Science Conference (Citeseer)</conf-name>, <conf-loc>Austin, TX, USA</conf-loc>, <volume>130</volume>, <fpage>136</fpage>. <pub-id pub-id-type="doi">10.25080/majora-7b98e3ed-013</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sergeev</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Del Balso</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Horovod: Fast and Easy Distributed Deep Learning in TensorFlow</article-title>. <comment>arXiv preprint arXiv:1802.05799</comment>. </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shoemaker</surname>
<given-names>R. H.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>The NCI60 Human Tumour Cell Line Anticancer Drug Screen</article-title>. <source>Nat. Rev. Cancer</source> <volume>6</volume>, <fpage>813</fpage>&#x2013;<lpage>823</lpage>. <pub-id pub-id-type="doi">10.1038/nrc1951</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Valiant</surname>
<given-names>L. G.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>A Bridging Model for Parallel Computation</article-title>. <source>Commun. ACM</source> <volume>33</volume>, <fpage>103</fpage>&#x2013;<lpage>111</lpage>. <pub-id pub-id-type="doi">10.1145/79173.79181</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wickramasinghe</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kamburugamuve</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Govindarajan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Abeykoon</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Widanage</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Perera</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Twister2: TSet High-Performance Iterative Dataflow</article-title>,&#x201d; in <conf-name>2019 International Conference on High Performance Big Data and Intelligent Systems (HPBD&#x26;IS)</conf-name>, <conf-loc>Shenzhen, China</conf-loc> (<publisher-name>IEEE</publisher-name>), <fpage>55</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1109/hpbdis.2019.8735495</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Widanage</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Perera</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Abeykoon</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kamburugamuve</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kanewala</surname>
<given-names>T. A.</given-names>
</name>
<name>
<surname>Maithree</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>High Performance Data Engineering Everywhere</article-title>,&#x201d; in <conf-name>2020 IEEE International Conference on Smart Data Services (SMDS)</conf-name>, <conf-loc>Beijing, China</conf-loc> (<publisher-name>IEEE</publisher-name>), <fpage>122</fpage>&#x2013;<lpage>132</lpage>. <pub-id pub-id-type="doi">10.1109/smds49396.2020.00022</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wilde</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hategan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wozniak</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<name>
<surname>Clifford</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Katz</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Foster</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Swift: A Language for Distributed Parallel Scripting</article-title>. <source>Parallel Comput.</source> <volume>37</volume>, <fpage>633</fpage>&#x2013;<lpage>652</lpage>. <pub-id pub-id-type="doi">10.1016/j.parco.2011.05.005</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wozniak</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Mohd-Yusof</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Nicolae</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Collier</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ozik</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>High-bypass Learning: Automated Detection of Tumor Cells that Significantly Impact Drug Response</article-title>,&#x201d; in <conf-name>2020 IEEE/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC) and Workshop on Artificial Intelligence and Machine Learning for Scientific Applications (AI4S)</conf-name>, <conf-loc>Virtual location</conf-loc>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1109/MLHPCAI4S51975.2020.00012</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Allen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Balaprakash</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brettin</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Garcia-Cardona</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Clyde</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>A Cross-Study Analysis of Drug Response Prediction in Cancer Cell Lines</article-title>. <comment>arXiv preprint arXiv:2104.08961</comment>. <pub-id pub-id-type="doi">10.1093/bib/bbab356</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zaharia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chowdhury</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Franklin</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Shenker</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Stoica</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Spark: Cluster Computing with Working Sets</article-title>,&#x201d; in <conf-name>Proceedings of the 2nd USENIX Conference on Hot Topics in Cloud Computing</conf-name> (<publisher-loc>Berkeley, CA, USA</publisher-loc>: <publisher-name>USENIX Association</publisher-name>), <fpage>10</fpage>. <comment>HotCloud&#x2019;10</comment>. </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zaharia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Xin</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Wendell</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Das</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Armbrust</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dave</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Apache Spark: A Unified Engine for Big Data Processing</article-title>. <source>Commun. ACM</source> <volume>59</volume>, <fpage>56</fpage>&#x2013;<lpage>65</lpage>. <pub-id pub-id-type="doi">10.1145/2934664</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>