<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="methods-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdata.2024.1476506</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Constructing a metadata knowledge graph as an atlas for demystifying AI pipeline optimization</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Venkataramanan</surname> <given-names>Revathy</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2016847/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tripathy</surname> <given-names>Aalap</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kumar</surname> <given-names>Tarun</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2810259/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Serebryakov</surname> <given-names>Sergey</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Justine</surname> <given-names>Annmary</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Shah</surname> <given-names>Arpit</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Bhattacharya</surname> <given-names>Suparna</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Foltin</surname> <given-names>Martin</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Faraboschi</surname> <given-names>Paolo</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Roy</surname> <given-names>Kaushik</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sheth</surname> <given-names>Amit</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1477293/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>AI Institute, University of South Carolina</institution>, <addr-line>Columbia, SC</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Hewlett Packard Enterprise Labs</institution>, <addr-line>Houston, TX</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Suresh Kallam, Jain University, India</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Kamal Berahmand, Institute for Advanced Studies in Basic Sciences (IASBS), Iran</p>
<p>Cristobal Rodolfo Guerra-Tamez, University of Monterrey, Mexico</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Revathy Venkataramanan <email>revathy&#x00040;email.sc.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>7</volume>
<elocation-id>1476506</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>08</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>11</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Venkataramanan, Tripathy, Kumar, Serebryakov, Justine, Shah, Bhattacharya, Foltin, Faraboschi, Roy and Sheth.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Venkataramanan, Tripathy, Kumar, Serebryakov, Justine, Shah, Bhattacharya, Foltin, Faraboschi, Roy and Sheth</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>The emergence of advanced artificial intelligence (AI) models has driven the development of frameworks and approaches that focus on automating model training and hyperparameter tuning of end-to-end AI pipelines. However, other crucial stages of these pipelines such as dataset selection, feature engineering, and model optimization for deployment have received less attention. Improving the efficiency of end-to-end AI pipelines requires metadata of past executions of AI pipelines and all their stages. Regenerating metadata history by re-executing existing AI pipelines is computationally challenging and impractical. To address this issue, we propose to source AI pipeline metadata from open-source platforms such as Papers-with-Code, OpenML, and Hugging Face. However, integrating and unifying the varying terminologies and data formats from these diverse sources is a challenge. In this study, we present a solution by introducing the Common Metadata Ontology (CMO), which is used to construct an extensive AI Pipeline Metadata Knowledge Graph (AIMKG) consisting of 1.6 million pipelines. Through semantic enhancements, the pipeline metadata in AIMKG is also enriched for downstream tasks such as search and recommendation of AI pipelines. We perform quantitative and qualitative evaluations on AIMKG to search and recommend relevant pipelines for a user query. For quantitative evaluation, we propose a custom aggregation model that outperforms other baselines by achieving a retrieval accuracy (R&#x00040;1) of 76.3%. Our qualitative analysis shows that the AIMKG-based recommender retrieved relevant pipelines in 78% of test cases compared to the state-of-the-art MLSchema-based recommender which retrieved relevant responses in 51% of the cases. AIMKG serves as an atlas for navigating the evolving AI landscape, providing practitioners with a comprehensive factsheet for their applications. 
It guides AI pipeline optimization, offers insights and recommendations for improving AI pipelines, and serves as a foundation for data mining and analysis of evolving AI workflows.</p></abstract>
<kwd-group>
<kwd>AI pipeline metadata</kwd>
<kwd>graph learning</kwd>
<kwd>graph recommendation</kwd>
<kwd>AIMKG</kwd>
<kwd>metadata knowledge graphs</kwd>
<kwd>AI pipeline optimization</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="8"/>
<equation-count count="8"/>
<ref-count count="50"/>
<page-count count="18"/>
<word-count count="12268"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>The rapid evolution of artificial intelligence (AI) has led to significant advancements in techniques, necessitating continuous knowledge updates. The submission rates at conferences such as NeurIPS, which amount to several thousand papers, demonstrate the rapid growth and competitiveness in AI research. Furthermore, there are several versions of generative models being released for various tasks which are difficult to keep track of. The rate of invention of new AI methods also introduces a challenge of suitable model selection for a given task and dataset. The success of AI methods has also led to the development of larger and more complex models to tackle various challenges, due to which training AI models has become increasingly challenging as their complexity grows. To improve training efficiency, AutoML methods (He et al., <xref ref-type="bibr" rid="B18">2021</xref>; Karmaker et al., <xref ref-type="bibr" rid="B25">2021</xref>) have been introduced for optimizing models and hyperparameter tuning. MLFlow (Zaharia et al., <xref ref-type="bibr" rid="B49">2018</xref>) and MLSchema (Publio et al., <xref ref-type="bibr" rid="B38">2018</xref>) use a model-centric approach for metadata logging, requiring separate instances for each trained model for a given pipeline, say, entity extraction from health records. Openlineage (Hariharan et al., <xref ref-type="bibr" rid="B17">2024</xref>) tracks data lineage through its lifecycle. However, AI pipeline development also includes stages such as dataset selection, preprocessing, feature engineering, and deployment. For reproducibility, metadata logging must encompass all stages, not just model selection and tuning. A comprehensive framework is needed to track all stages, executions, models, and datasets to solve a given AI task.</p>
<p>The Common Metadata Framework (CMF; Koomthanam et al., <xref ref-type="bibr" rid="B27">2024</xref>) addresses this by serving as a pipeline-centric metadata logging system that captures metadata for all stages, executions, models, datasets, and metrics in an integrated manner, enabling the search for the optimal execution path. While CMF provides a holistic approach to metadata logging, a robust framework is required to facilitate AI pipeline optimization. Such optimization can be achieved by leveraging logged metadata to recommend past successful pipeline executions as a seed, reducing the overall experimentation runs. This recommendation requires detailed metadata of numerous pipelines executed, capturing the interactions and dependencies (e.g., input/output datasets, parameters, and configurations) of each stage. Generating such metadata by executing pipelines is not feasible as it demands time and computational resources. On the other hand, open-source platforms such as Papers-with-Code (<xref ref-type="bibr" rid="B36">2018</xref>), OpenML (Vanschoren et al., <xref ref-type="bibr" rid="B45">2014</xref>), Hugging Face (<xref ref-type="bibr" rid="B20">2016</xref>), and Kaggle (<xref ref-type="bibr" rid="B23">2010</xref>) expose metadata of already executed pipelines which can be leveraged. To enable interoperability of metadata from these diverse sources, the metadata must be integrated. However, this integration poses challenges such as differing nomenclature, data structure variations, and lack of component semantics to understand context and perform reasoning on the entities.</p>
<p>To address these challenges, we introduce the Common Metadata Ontology (CMO), built on the foundations of CMF&#x00027;s pipeline-centric approach. CMO is a unifying schema that integrates metadata from these diverse sources to construct the AI pipeline Metadata Knowledge Graph (AIMKG), which enables search and recommendation of relevant AI pipelines for optimization. Knowledge graphs (KG) provide a deeper understanding of relationships and enable context-aware recommendations by capturing both explicit and implicit connections. CMO supports such semantic and multimodal properties which are computed while constructing AIMKG. For example, AIMKG can identify the shared semantics between pipelines for <italic>object detection</italic> and <italic>3d instance segmentation</italic>, recognizing them as vision-based tasks even without explicit naming and facilitating reasoning. As a downstream application, we develop a search and recommender system that demonstrates the potential of AIMKG to recommend relevant pipelines for optimization. The recommender system provides explainable recommendations and ensures reproducibility by providing source information of AI pipelines. The specific contributions of the study are as follows:</p>
<list list-type="order">
<list-item><p>Proposing Common Metadata Ontology with a pipeline-centric view to integrate and aggregate the metadata mined from diverse sources. CMO supports semantic properties and multimodal properties such as text and embedding vectors.</p></list-item>
<list-item><p>Construction of the first of its kind AIMKG using CMO that serves as an atlas to navigate the ever-growing AI field.</p></list-item>
<list-item><p>Enriching the AIMKG with additional knowledge and by computing semantic properties.</p></list-item>
<list-item><p>Introducing custom heuristic ranking function to recommend relevant pipelines using task, dataset, or model.</p></list-item>
<list-item><p>Introducing a custom aggregation model to generate graph embeddings that enable AI pipeline recommendation for natural language queries.</p></list-item>
</list>
<p>Conventional knowledge graphs capture relationships among concepts or entities and their semantic properties. For example, Linked Open Data Cloud (Musto et al., <xref ref-type="bibr" rid="B33">2016</xref>) and DSKG (F&#x000E4;rber and Lamprecht, <xref ref-type="bibr" rid="B13">2021</xref>) capture semantic relationships among datasets. In contrast, AIMKG consists of process graphs that capture procedural interactions of entities in the context of training and execution, such as how datasets and models combine to produce performance metrics (e.g., dataset &#x0002B; model &#x02192; metrics, model weights). AIMKG follows the traditional semantics of entities and extends it further to process graphs. This procedural representation is a notable contribution not typically found in traditional knowledge graphs.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>With the growth of AI models (Menghani, <xref ref-type="bibr" rid="B30">2023</xref>; Mathew et al., <xref ref-type="bibr" rid="B29">2021</xref>; Shrestha and Mahmood, <xref ref-type="bibr" rid="B42">2019</xref>; Mohammadi et al., <xref ref-type="bibr" rid="B31">2024</xref>; Berahmand et al., <xref ref-type="bibr" rid="B6">2024</xref>), several frameworks have been proposed to enable the search and discoverability of these models and architectures. The DeepSciKG (Kannan et al., <xref ref-type="bibr" rid="B24">2020</xref>) project proposes a mechanism to create and query knowledge graphs to represent multimodal information from AI publication metadata, i.e., code, pseudocode, tables, images, and diagrams in addition to text/equations in publications. STM-KG (Brack et al., <xref ref-type="bibr" rid="B8">2021</xref>) proceeds along similar lines to demonstrate how science, technology, and medicine papers can be automatically mined to populate a scientific concepts knowledge graph and drive a &#x0201C;citation recommender.&#x0201D; ML Schema (Publio et al., <xref ref-type="bibr" rid="B38">2018</xref>) proposed a model-centric ontology to formalize only OpenML data. Humm and Zender (<xref ref-type="bibr" rid="B21">2021</xref>) developed an ontology to represent ML metadata to organize and store a limited number of tasks (15 as compared to 5 k in our study). AI-KG (Dess&#x000EC; et al., <xref ref-type="bibr" rid="B11">2020</xref>) generated an AI knowledge graph from published papers consisting of 330 k research publications with 14 M triples that describe five types of entities (tasks, methods, metrics, materials, and others). AIMKG consists of a combination of published papers (1 million) and also user-recorded metadata from OpenML and HuggingFace.</p>
<p>The extraction of knowledge from publicly available resources remains an active and dynamic area of research. Notably, the GraphGen4Code (Abdelaziz et al., <xref ref-type="bibr" rid="B1">2021</xref>) approach has emerged as a comprehensive toolkit for constructing knowledge graphs from program code, effectively facilitating subsequent endeavors to address the creation of AutoML pipelines utilizing such knowledge graphs (Helali et al., <xref ref-type="bibr" rid="B19">2022</xref>). These endeavors complement our own study, which leverages existing public repositories and published research to infer relations between AI pipeline entities. By constructing a knowledge graph, our approach aims to solve downstream tasks within the field.</p>
<p>In literature, there has been a consistent effort to recommend datasets for scientific problems, e.g., DataHunter (F&#x000E4;rber and Leisinger, <xref ref-type="bibr" rid="B14">2021</xref>) and DataFinder (Viswanathan et al., <xref ref-type="bibr" rid="B48">2023</xref>). Croissant is a high-level format for machine learning datasets that combines metadata, resource file descriptions, data structure, and default ML semantics into a single file for downstream tasks (Akhtar et al., <xref ref-type="bibr" rid="B4">2024</xref>). SIGMOD (Kumar et al., <xref ref-type="bibr" rid="B28">2023</xref>) recommends datasets, models, processing steps, etc., along with pipeline lineage. Similarly, other studies such as M&#x000FC;llner et al. (<xref ref-type="bibr" rid="B32">2022</xref>) use the history of AI pipelines to recommend datasets and models to solve new tasks allowing sharing of these artifacts among multiple pipelines. In our study, we integrate and aggregate multiple data sources instead of focusing on a particular data source. Specifically, our study focuses on pipeline optimization, knowledge discovery, search, and recommendation through mining metadata from diverse open sources. HuggingGPT (Shen et al., <xref ref-type="bibr" rid="B41">2023</xref>) is a collaborative system that consists of an LLM as the controller and numerous expert models as collaborative executors from HuggingFace. It uses an LLM-based chat interface to recommend models for tasks from different domains. We plan to incorporate an LLM interface similar to HuggingGPT in the future while focusing on recommending pipelines that include task, dataset, dataset preprocessing steps, model, metrics, and hyperparameters.</p>
<p>Several techniques such as CASH (Thornton et al., <xref ref-type="bibr" rid="B44">2013</xref>; Guo et al., <xref ref-type="bibr" rid="B16">2019</xref>) and NAS (Elsken et al., <xref ref-type="bibr" rid="B12">2019</xref>) have been proposed for model optimization and hyperparameter tuning. However, our study distinguishes itself by extending beyond the confines of solely addressing problem-dataset or model-hyperparameter relationships. Instead, it delves into capturing intricate associations among models, datasets, and tasks, encompassing their hierarchical connections.</p>
</sec>
<sec id="s3">
<title>3 AI pipeline Metadata Knowledge Graph construction</title>
<sec>
<title>3.1 Data sources</title>
<p>In this study, we collect AI pipeline metadata from Papers-with-Code, OpenML, and HuggingFace to construct AIMKG. The data availability of each source can be found in <xref ref-type="table" rid="T1">Table 1</xref>. In the future, we also plan to incorporate the metadata from Kaggle.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Availability of pipeline metadata from open-source platforms.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Entities</bold></th>
<th valign="top" align="left"><bold>PWC</bold></th>
<th valign="top" align="left"><bold>OpenML</bold></th>
<th valign="top" align="center"><bold>HF</bold></th>
<th valign="top" align="left"><bold>Kaggle</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">&#x00023; Pipelines</td>
<td valign="top" align="left">1 million &#x0002B;</td>
<td valign="top" align="left">10 million &#x0002B;</td>
<td valign="top" align="center">267,000</td>
<td valign="top" align="left">160 k</td>
</tr> <tr>
<td valign="top" align="left">&#x00023; Tasks</td>
<td valign="top" align="left">4 k</td>
<td valign="top" align="left">1.6 k</td>
<td valign="top" align="center">41</td>
<td valign="top" align="left">200&#x0002B;</td>
</tr> <tr>
<td valign="top" align="left">&#x00023; Datasets</td>
<td valign="top" align="left">12 k</td>
<td valign="top" align="left">3.4 k</td>
<td valign="top" align="center">56,000</td>
<td valign="top" align="left">173 k</td>
</tr>
<tr>
<td valign="top" align="left">&#x00023; Models</td>
<td valign="top" align="left">2 k</td>
<td valign="top" align="left">16 k</td>
<td valign="top" align="center">267,000</td>
<td valign="top" align="left">NA</td>
</tr></tbody>
</table>
</table-wrap>
<sec>
<title>3.1.1 Papers-with-Code</title>
<p>Papers-with-Code provides extensive metadata for research papers and associated code repositories, encompassing over 1 million entries at the time of this paper submission. The metadata covers various components and stages of AI pipelines described in the papers. Through their API, Papers-with-Code offers metadata including PDF URLs, GitHub repository links, task details, dataset information, methods employed, and evaluation metrics and results. While not all stages of metadata are available for every paper through the API, the information can still be obtained by referring to the research papers and their code repositories.</p>
</sec>
<sec>
<title>3.1.2 OpenML</title>
<p>OpenML provides metadata on ML pipelines logged by users, offering detailed information on tasks, datasets, flows, runs with parameter settings, and evaluations. OpenML encompasses eight major task types executed on various datasets, resulting in 1,600 unique tasks. For each task, the most recent 500 runs have been collected which amounts to a total of 330,000 runs.</p>
</sec>
<sec>
<title>3.1.3 HuggingFace</title>
<p>HuggingFace is a model hub that offers users access to numerous pretrained models. It covers a wide range of tasks, including domains such as computer vision, natural language processing, tabular data, reinforcement learning, and multimodal learning. HuggingFace provides model-centric information, along with datasets and evaluations, enabling the construction of complete pipelines. At the time of paper submission, &#x0007E;270,000&#x0002B; pipelines have been collected from HuggingFace.</p>
</sec>
</sec>
<sec>
<title>3.2 Common metadata ontology</title>
<p>The metadata from these sources follows different data structures and nomenclatures. For example, the concept &#x0201C;Model&#x0201D; is referred to as &#x0201C;Methods&#x0201D; in Papers-with-Code, &#x0201C;Flow&#x0201D; in OpenML, and &#x0201C;Model&#x0201D; in Hugging Face. We propose the Common Metadata Ontology (CMO), a unifying schema to integrate diverse data structures from Papers-with-Code, OpenML, and Hugging Face. Built on the Common Metadata Framework (CMF; Koomthanam et al., <xref ref-type="bibr" rid="B27">2024</xref>), CMO ensures interoperability of metadata, enabling knowledge discovery, search, and reasoning capabilities. The overview of CMO is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The novel features of CMO are as follows: (i) following a pipeline-centric approach, similar to CMF, to capture multiple experimentation runs for each stage (train, test, validation) with parameter settings, facilitating the identification of the best execution path; (ii) modularity that allows distributed experiments and parallel logging of pipeline metadata, enabling seamless metadata capture across different teams and machines; (iii) support for additional semantic and statistical properties that can be extracted, computed, or generated from entity names (e.g., identifying tasks as image-based, text-based, or audio-based); and (iv) support for multimodal properties, including text and vector embeddings of entity names, to enable keyword and approximate search. A detailed overview of CMO and its properties can be found in Venkataramanan (<xref ref-type="bibr" rid="B47">2024</xref>). MLSchema (Publio et al., <xref ref-type="bibr" rid="B38">2018</xref>) and MLFlow (Zaharia et al., <xref ref-type="bibr" rid="B49">2018</xref>) adopt a model-centric approach. 
When building a pipeline, say, entity extraction from semi-structured electronic health records and testing it with multiple models, MLSchema and MLFlow require creating several instances&#x02014;one for each model&#x02014;to record metadata. In contrast, CMO allows all models, variations, hyperparameters, metrics, and datasets to be documented as a single instance, facilitating a scalable and flexible metadata recording process by taking a holistic view of the entire pipeline. Hence, CMO builds upon the principles of CMF.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Overview of proposed Common Metadata Ontology. The detailed version of CMO with properties can be found at Venkataramanan (<xref ref-type="bibr" rid="B47">2024</xref>). The ontology consists of a pipeline node executing several stages such as data preprocessing, train, test, and validation. Each stage can have several Executions. Each Execution can use a Model and Dataset to produce Metrics and trained model weights. The Task captures a broader view of pipelines such as anomaly detection or demand forecasting. The code base is captured by Framework node and any published report or papers are present in Report node.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0001.tif"/>
</fig>
</sec>
<sec>
<title>3.3 Problem statement</title>
<p>The goal is to design a correspondence mapping function <italic>F</italic> that maps the Entity-Relationship-Attributes present in the relational databases of the three data sources to the Nodes-Relationships-Properties of CMO. Each data source is defined as <italic>D</italic> &#x0003D; {<italic>E, R, A</italic>}, where <italic>E</italic> &#x0003D; {<italic>e</italic><sub>1</sub>, <italic>e</italic><sub>2</sub>, &#x02026;, <italic>e</italic><sub><italic>i</italic></sub>} represents the set of entities, <italic>R</italic> &#x0003D; {<italic>r</italic><sub>1</sub>, <italic>r</italic><sub>2</sub>, &#x02026;, <italic>r</italic><sub><italic>j</italic></sub>} represents the set of relationships between entities <italic>e</italic><sub><italic>i</italic></sub> and <italic>e</italic><sub><italic>j</italic></sub>, and <italic>A</italic> &#x0003D; {<italic>a</italic><sub>1</sub>, <italic>a</italic><sub>2</sub>, &#x02026;, <italic>a</italic><sub><italic>n</italic></sub>} represents the set of attributes for any entity <italic>e</italic><sub><italic>i</italic></sub>. First, each data source <italic>D</italic> is mapped to a graph <italic>G</italic> using a mapping function <italic>f</italic> : <italic>D</italic> &#x02192; <italic>G</italic>. <italic>G</italic> &#x0003D; {<italic>V, M, K</italic>} where <italic>V</italic> &#x0003D; {<italic>v</italic><sub>1</sub>, <italic>v</italic><sub>2</sub>, &#x02026;, <italic>v</italic><sub><italic>i</italic></sub>} represents the set of vertices, <italic>M</italic> &#x0003D; {<italic>m</italic><sub>1</sub>, <italic>m</italic><sub>2</sub>, &#x02026;, <italic>m</italic><sub><italic>j</italic></sub>} represents the set of edges between vertices, and <italic>K</italic> &#x0003D; {<italic>k</italic><sub>1</sub>, <italic>k</italic><sub>2</sub>, &#x02026;, <italic>k</italic><sub><italic>n</italic></sub>} represents the set of properties of the vertices. 
The graph <italic>G</italic> of each data source consists of inherent entities, their associations, and properties present in relational database. Then, we compute a correspondence function <italic>F</italic> : <italic>G</italic> &#x02192; <italic>KG</italic> that maps the elements from graph <italic>G</italic> by computing, extracting, or generating necessary information. <italic>KG</italic> &#x0003D; {<italic>N, R, P</italic>} where <italic>N</italic> &#x0003D; {<italic>n</italic><sub>1</sub>, <italic>n</italic><sub>2</sub>, &#x02026;, <italic>n</italic><sub><italic>i</italic></sub>} are the set of nodes, <italic>R</italic> &#x0003D; {<italic>r</italic><sub>1</sub>, <italic>r</italic><sub>2</sub>, &#x02026;, <italic>r</italic><sub><italic>j</italic></sub>} are the set of relationships between nodes, and <italic>P</italic> &#x0003D; {<italic>p</italic><sub>1</sub>, <italic>p</italic><sub>2</sub>, &#x02026;, <italic>p</italic><sub><italic>n</italic></sub>} are the set of properties of the nodes. For each <italic>G</italic>, <italic>F</italic> : <italic>V</italic> &#x02192; <italic>N, F</italic> : <italic>M</italic> &#x02192; <italic>R</italic> and <italic>F</italic> : <italic>K</italic> &#x02192; <italic>P</italic>. Finally, the AIMKG is constructed as (<italic>KG</italic>1 &#x0222A; <italic>KG</italic>2 &#x0222A; <italic>KG</italic>3).</p>
</sec>
<sec>
<title>3.4 AIMKG construction and enrichment</title>
<sec>
<title>3.4.1 Construction</title>
<p>The algorithm for construction of AIMKG is described in <xref ref-type="table" rid="T9">Algorithm 1</xref>, and the system architecture is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. We collect metadata from Papers-with-Code, OpenML, and HuggingFace and represent the data using a relational database. The metadata, in the relational database format <italic>D</italic>, is then converted into graph data models <italic>G</italic> through a mapping function <italic>f</italic> : <italic>D</italic> &#x02192; <italic>G</italic> to analyze the inherent graph structure of each data source. To align the concepts of graph <italic>G</italic> to the concepts in the CMO, we implement a correspondence mapping function <italic>F</italic> : <italic>G</italic> &#x02192; <italic>KG</italic>. The mapping function <italic>F</italic> consists of a predefined set of mappings of concepts presented in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>. While specific nodes, relationships, and properties in <italic>G</italic> directly correspond to CMO, additional elements are computed, extracted, or generated by analyzing indirect associations among the entities in each data source. For example, while mapping OpenML data to CMO, the concept nodes Hyperparameters and Metrics need to be computed from attributes of Runs given by OpenML. Since these are computed nodes, the relationships need to be computed by studying the associations between tables in the relational database provided by OpenML. The mapping of entities from the data sources to CMO can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>. Currently, AIMKG exists as both Resource Description Framework (RDF) and Labeled Property Graph (LPG), and the results are presented in Section 6.</p>
<table-wrap position="float" id="T9">
<label>Algorithm 1</label>
<caption><p>Construction of AIMKG.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-i0001.tif"/>
</table-wrap>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Overview of AIMKG construction. The data collected from Papers-with-Code, OpenML, and HuggingFace are translated into their relational database formats and then to their graph data models G1, G2, and G3. Then, they are mapped onto CMO. The pipeline metadata undergoes semantic enhancements before being loaded as AIMKG.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0002.tif"/>
</fig>
</sec>
<sec>
<title>3.4.2 AIMKG enrichment</title>
<p>To enable advanced search and recommendation capabilities, we compute semantic properties for pipeline entities, specifically tasks and datasets. We aim to identify the required semantic properties for other nodes in future work. These semantic properties capture implicit knowledge about entities, providing valuable insights. For instance, the semantic property <italic>modality</italic> identifies the visual nature of tasks such as <italic>object detection</italic> and <italic>video instance segmentation</italic>, even if not explicitly stated. Similarly, capturing task <italic>categories</italic> such as segmentation, classification, or regression clarifies the nature of tasks and aids in organizing and categorizing pipelines by problem type.</p>
<sec>
<title>3.4.2.1 Rule-based entity extraction</title>
<p>To identify the modality and category of tasks, we utilize a rule-based entity extraction approach. An extensive vocabulary is curated that includes synonyms for each modality and category, such as dialogue, translation, and text for the text modality, and terms such as classification and summarization for categories. Task names and descriptions are analyzed to assign modality and category. The main modalities we consider are Image, Text, Audio, Video, and Multimodal. While this information is available in Papers-with-Code and Huggingface, OpenML presents a challenge as task names are generated by combining task type and dataset name, which obscures modality. To address this, we analyze dataset entities in OpenML, marked as nominal or numeric, to infer task modality. Although the method is straightforward and reliable, using a manually curated vocabulary may introduce biases and limit scalability. Future research will focus on developing more scalable, automated approaches that mitigate biases and enhance robustness.</p>
</sec>
<sec>
<title>3.4.2.2 Graph-based labeling of dataset modalities</title>
<p>For the datasets, we determine only <italic>modality</italic> because <italic>category</italic> varies for every pipeline as per the task. For example, the MS-COCO dataset can be used for detection, segmentation, or localization. The datasets from these data sources do not contain sufficient information, such as a description, to identify the modality. Therefore, we study the association between the task and dataset nodes to label <italic>modality</italic> for each dataset. The calculation of dataset modality is as follows:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0222A;</mml:mo><mml:mi>M</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0222A;</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>&#x0222A;</mml:mo><mml:mi>M</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>D</italic><sub>1</sub> is the dataset, <italic>T</italic><sub>1</sub> to <italic>T</italic><sub><italic>n</italic></sub> are the associated tasks, and <italic>Mod</italic>(<italic>T</italic><sub><italic>i</italic></sub>) represents the <italic>modality</italic> computed for task <italic>T</italic><sub><italic>i</italic></sub>.</p>
</sec>
<sec>
<title>3.4.2.3 External knowledge</title>
<p>AIMKG also incorporates additional knowledge from various sources to enhance its semantic properties. We crawl task hierarchy information from Papers-with-Code, comprising three levels. The first level represents task areas such as computer vision, natural language processing, or speech. The second level groups tasks into categories such as segmentation, classification, or detection. Finally, at the leaf nodes, we find the specific tasks provided by Papers-with-Code through their API. This hierarchical structure adds valuable knowledge to AIMKG, enabling a more comprehensive understanding of different task domains.</p>
</sec>
</sec>
<sec>
<title>3.4.3 Node embeddings</title>
<p>To facilitate recommendations or approximate searches, we also compute and store embeddings for the names of tasks, datasets, models, and pipelines. A sentence transformer, <italic>all-mpnet-base-v2</italic> (SBERT Documentation, <xref ref-type="bibr" rid="B40">2023</xref>), with default embedding size 768 was used to create embeddings. The computation of embeddings can be extended to other components of the pipeline as needed. These embeddings, along with the semantic properties, are used in the similarity metric calculation to rank relevant recommendations, as described in the following section. The embeddings are computed and added after standing up AIMKG, allowing flexibility with different models.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<title>4 AI pipeline search and recommendation</title>
<p>In certain cases, the exact pipeline the user is searching for can be found in AIMKG. However, it is not always the case. We propose two different recommender systems to search and recommend relevant pipelines to user input query that can seed the experimentation.</p>
<sec>
<title>4.1 Relevant pipeline recommendation using custom heuristics</title>
<p>In this section, we propose a recommender system that enables users to query relevant pipelines based on their entities, such as tasks, datasets, models, or combinations of them. Currently, we develop custom heuristic ranking metrics for tasks, datasets, and models to identify similar pipelines, as these three entities are most indicative of a pipeline.</p>
<sec>
<title>4.1.1 Problem formulation</title>
<p>For a given task <italic>t</italic><sub><italic>i</italic></sub>, dataset <italic>d</italic><sub><italic>i</italic></sub>, or model <italic>m</italic><sub><italic>i</italic></sub>, rank the tasks <italic>T</italic> &#x0003D; {<italic>t</italic><sub>1</sub>, <italic>t</italic><sub>2</sub>, ...<italic>t</italic><sub><italic>n</italic></sub>}, datasets <italic>D</italic> &#x0003D; {<italic>d</italic><sub>1</sub>, <italic>d</italic><sub>2</sub>, ...<italic>d</italic><sub><italic>n</italic></sub>}, or models <italic>M</italic> &#x0003D; {<italic>m</italic><sub>1</sub>, <italic>m</italic><sub>2</sub>, ...<italic>m</italic><sub><italic>n</italic></sub>} present in AIMKG, respectively, using custom heuristics defined below. Once the most similar entities are identified, identify the pipelines associated with top-ranked items by traversing through the graph. Presently, the pipelines recommended consist of coarse-level entities such as tasks, datasets, models, metrics, frameworks, reports, and code repositories. These custom heuristic functions can be used alone or in combination as required. These recommendations act as a seed, reducing the search space for ML practitioners and minimizing the number of experiments needed to achieve optimal solutions.</p>
</sec>
<sec>
<title>4.1.2 Heuristic functions</title>
<p><bold>Task similarity:</bold> Using the <italic>modality</italic> and <italic>category</italic> properties, we compute similarity for the task nodes in AIMKG as follows:</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi><mml:mtext>_</mml:mtext><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>t</italic><sub><italic>i</italic></sub>, <italic>t</italic><sub><italic>j</italic></sub> are any two task nodes, <italic>e</italic><sub><italic>i</italic></sub>, <italic>e</italic><sub><italic>j</italic></sub> are task name embeddings, <italic>name</italic><sub><italic>i</italic></sub>, <italic>name</italic><sub><italic>j</italic></sub> are the task name tokens, <italic>mod</italic><sub><italic>i</italic></sub>, <italic>mod</italic><sub><italic>j</italic></sub> are task modalities (image, text, audio, etc.), and <italic>cat</italic><sub><italic>i</italic></sub>, <italic>cat</italic><sub><italic>j</italic></sub> are task categories (detection, summarization, classification, etc.). <italic>J</italic> is the Jaccard similarity, and <italic>cos</italic> is the cosine similarity of embeddings.</p>
<p><bold>Dataset similarity:</bold> Dataset consists of <italic>modality</italic> calculated using <xref ref-type="disp-formula" rid="E1">Equation 1</xref>. They do not have <italic>category</italic> as a semantic property as a given dataset might be suitable for two task <italic>categories</italic> such as segmentation and detection. Therefore, dataset similarity is calculated as</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>d</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi><mml:mtext>_</mml:mtext><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>d</italic><sub><italic>i</italic></sub>, <italic>d</italic><sub><italic>j</italic></sub> are any two dataset nodes, <italic>e</italic><sub><italic>i</italic></sub>, <italic>e</italic><sub><italic>j</italic></sub> are dataset name embeddings, <italic>name</italic><sub><italic>i</italic></sub>, <italic>name</italic><sub><italic>j</italic></sub> are the dataset name tokens, <italic>mod</italic><sub><italic>i</italic></sub>, <italic>mod</italic><sub><italic>j</italic></sub> are dataset modalities (image, text, audio, etc.), <italic>url</italic><sub><italic>i</italic></sub>, <italic>url</italic><sub><italic>j</italic></sub> are dataset URLs, and <italic>U</italic> is a token-based URL similarity metric that quantifies the degree of resemblance between two URLs.</p>
<p><bold>Model similarity:</bold> Model similarity is computed using the given semantic property <italic>class</italic> such as CNN and GPT. The <italic>URL</italic> given by the sources is also used because, in some sources (e.g., HuggingFace), it aids in capturing the root of the model origin:</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mtext>_</mml:mtext><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>J</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>m</italic><sub><italic>i</italic></sub>, <italic>m</italic><sub><italic>j</italic></sub> are any two model nodes, <italic>e</italic><sub><italic>i</italic></sub>, <italic>e</italic><sub><italic>j</italic></sub> are model name embeddings, <italic>name</italic><sub><italic>i</italic></sub>, <italic>name</italic><sub><italic>j</italic></sub> are model name tokens, <italic>class</italic><sub><italic>i</italic></sub>, <italic>class</italic><sub><italic>j</italic></sub> are model classes (transformers, CNN, GRU, etc.), <italic>url</italic><sub><italic>i</italic></sub>, <italic>url</italic><sub><italic>j</italic></sub> are model URLs, and <italic>U</italic> is the token-based URL similarity metric used in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>.</p>
<p>We found through empirical experiments that a combination of embedding and keyword similarity offers the best results. For example, embedding similarity captures that &#x0201C;fault&#x0201D; and &#x0201C;anomaly&#x0201D; are synonyms. Simultaneously, in <xref ref-type="fig" rid="F3">Figure 3A</xref>, segmentation tasks must be closer than classification tasks. Similarly, in <xref ref-type="fig" rid="F3">Figure 3B</xref>, image-based tasks need to be closer than text-based tasks. These semantics are not captured by the embedding similarity but through keyword-based similarity of semantic properties computed for pipeline components. The ability to design and implement meta-similarity based on sets and the proximity of textual embeddings is a unique differentiator compared to existing methods such as Achille et al. (<xref ref-type="bibr" rid="B2">2019</xref>).</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Illustration to show the necessity of embedding and keyword similarity. <bold>(A)</bold> Task category semantics <bold>(B)</bold> Task modality semantics.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0003.tif"/>
</fig>
</sec>
</sec>
<sec>
<title>4.2 Relevant pipeline recommendation using graph learning</title>
<p>In contrast to the recommender discussed in the section above, this section proposes a custom graph embedding learning model that retrieves relevant pipelines to user input queries given in natural language.</p>
<sec>
<title>4.2.1 Problem formulation</title>
<p>The goal is to learn a common embedding space for the natural language query and its corresponding pipeline graph to retrieve relevant pipelines. AIMKG graph consists of several pipeline graphs {<italic>P</italic><sub>1</sub>, <italic>P</italic><sub>2</sub>, <italic>P</italic><sub>3</sub>, &#x02026;<italic>P</italic><sub><italic>n</italic></sub>}. Each <italic>P</italic><sub><italic>i</italic></sub> &#x0003D; {<italic>N, E</italic>}, where <italic>N</italic> is the set of nodes and <italic>E</italic> is the set of edges. The nodes <italic>N</italic> &#x0003D; {<italic>p, s, e, a, d, m, met, f, r, t</italic>} represent different elements: <italic>p</italic> is pipeline, <italic>s</italic> is stages, <italic>e</italic> is executions, <italic>a</italic> is artifacts, <italic>d</italic> is datasets, <italic>m</italic> is model, <italic>met</italic> is metrics, <italic>f</italic> is framework, <italic>r</italic> is reports, and <italic>t</italic> is tasks. For each pipeline, we have a set of queries <italic>Q</italic><sub><italic>i</italic></sub> &#x0003D; {<italic>q</italic><sub>1</sub>, <italic>q</italic><sub>2</sub>, &#x02026;, <italic>q</italic><sub><italic>n</italic></sub>}. The goal is to learn a common embedding space for graph embedding <italic>ge</italic><sub><italic>i</italic></sub> that takes <italic>P</italic><sub><italic>i</italic></sub> &#x0003D; {<italic>N, E</italic>} as input and query embedding <italic>qe</italic><sub><italic>i</italic></sub> that takes in one-sentence query <italic>q</italic><sub><italic>i</italic></sub> as input.</p>
</sec>
<sec>
<title>4.2.2 Query generation</title>
<p>Since there is no ground truth information, ChatGPT was used to generate a one-sentence query that describes the pipeline, which can simulate a user query to search for a pipeline. Similar to studies that involve the Retrieval Augmented Generation (RAG) approach (Jadon and Kumar, <xref ref-type="bibr" rid="B22">2023</xref>; Guo and Chen, <xref ref-type="bibr" rid="B15">2024</xref>), we utilized ChatGPT API to generate queries for a given pipeline based on the name and description of node entities such as pipeline, model, task, dataset, and metrics. These generated queries are different from the title of the paper from Papers-with-Code or title of the report or model cards from HuggingFace. The detailed analysis on queries generated can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>. The following prompt was used to generate one-sentence description for each pipeline:</p>
<preformat>
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;PROMPT:
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;Generate&#x000A0;a&#x000A0;vague&#x000A0;two-line&#x000A0;query&#x000A0;summarizing
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;the&#x000A0;pipeline
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;information&#x000A0;below,&#x000A0;utilizing&#x000A0;pipeline
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;description,&#x000A0;list
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;of&#x000A0;tasks,&#x000A0;list&#x000A0;of&#x000A0;datasets&#x000A0;and&#x000A0;list&#x000A0;of
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;methods.&#x000A0;Avoid&#x000A0;forming
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;the&#x000A0;query&#x000A0;as&#x000A0;a&#x000A0;question.&#x000A0;Generate&#x000A0;these
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;queries&#x000A0;as&#x000A0;if&#x000A0;a&#x000A0;user
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;is&#x000A0;searching&#x000A0;for&#x000A0;a&#x000A0;pipeline&#x000A0;based&#x000A0;on&#x000A0;the
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;following&#x000A0;pipeline
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;information.&#x000A0;Note,&#x000A0;these&#x000A0;queries&#x000A0;should&#x000A0;be
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;very&#x000A0;different
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;from&#x000A0;the&#x000A0;pipeline&#x000A0;name&#x000A0;given&#x000A0;below.
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;Return&#x000A0;the&#x000A0;query&#x000A0;as
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;bullets&#x000A0;numbered&#x000A0;as&#x000A0;1.,&#x000A0;2.,&#x000A0;and&#x000A0;3.
&#x000A0;&#x000A0;
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;Pipeline&#x000A0;Description:
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;{data[&#x00027;pipeline_description&#x00027;]}
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;Pipeline&#x000A0;Name:&#x000A0;{data[&#x00027;pipeline_name&#x00027;]}
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;List&#x000A0;of&#x000A0;Tasks:
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;{task_string}
&#x000A0;&#x000A0;
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;List&#x000A0;of&#x000A0;Datasets:
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;{dataset_string}
&#x000A0;&#x000A0;
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;List&#x000A0;of&#x000A0;Models:
&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;{model_string}
&#x000A0;&#x000A0;
&#x000A0;&#x000A0;&#x000A0;
</preformat>
</sec>
<sec>
<title>4.2.3 Dataset</title>
<p>For this evaluation, we randomly picked 5,000 pipelines each from Papers-with-Code and HuggingFace, totaling 10,000 pipelines. Only the pipelines with complete information such as model, dataset, task, and metrics were chosen. The pipelines from Papers-with-Code and HuggingFace are more descriptive, which is essential for query generation. For example, Papers-with-Code has abstract, dataset description, task description, and so on. Similarly, HuggingFace has model cards, dataset description, and so on. Such descriptive information was not found in OpenML pipelines, and so they are omitted for this evaluation. For each pipeline, an average of two queries was generated by ChatGPT using the prompt given in Section 4.2.2.</p>
</sec>
<sec>
<title>4.2.4 Model architecture</title>
<p>In this section, we propose a custom model described in <xref ref-type="table" rid="T10">Algorithm 2</xref> that utilizes self-attention based aggregation to learn an embedding for each pipeline graph, as described in <xref ref-type="fig" rid="F4">Figure 4</xref>. For each node in <italic>N</italic>, where <italic>N</italic> &#x0003D; {<italic>p, s, e, a, d, m, met, f, r, t</italic>}, the name and description present as text are converted to a 768-dimensional embedding using a sentence transformer. Using the semantic properties computed for each pipeline graph node (Section 3), we create a knowledge string. The knowledge string is then passed to a sentence transformer to create an embedding for the knowledge. Similarly, the generated queries are passed to the sentence transformer to generate respective embeddings. Through empirical analysis, we found that the sentence transformer embeddings perform better compared to a learnable embedding layer with one-hot embeddings.</p>
<table-wrap position="float" id="T10">
<label>Algorithm 2</label>
<caption><p>Model training.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-i0002.tif"/>
</table-wrap>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Architecture and workflow of the custom aggregation model utilized to learn the graph embedding.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0004.tif"/>
</fig>
<p>Then, these embeddings are given as an input to the self-attention block (<xref ref-type="table" rid="T11">Algorithm 3</xref>) to generate an intermediate graph embedding of 1024-dimensional vector. Similarly, the embeddings generated for knowledge vector are also transformed into 1024-dimensional vector using a learnable fully connected layer. The learnt embeddings of the nodes and the knowledge vector are combined using a weighted sum to generate the final graph embedding <italic>ge</italic><sub><italic>i</italic></sub>. We present the results of the model with and without knowledge embedding in <xref ref-type="table" rid="T5">Table 5</xref>. The embeddings generated for query vector using sentence transformer embedding are also transformed into 1024-dimensional vector to obtain <italic>qe</italic><sub><italic>i</italic></sub>. The objective function described in Section 4.2.5 trains the model such that <italic>ge</italic><sub><italic>i</italic></sub> and <italic>qe</italic><sub><italic>i</italic></sub> are closer in the embedding space.</p>
<table-wrap position="float" id="T11">
<label>Algorithm 3</label>
<caption><p>Self-attention based aggregation model (M).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-i0003.tif"/>
</table-wrap>
<p>In the case of AIMKG, 1.6 million pipeline graphs follow the graph structure described by CMO (Section 3.2). To add, the textual information present in the nodes holds the most information compared to the graph structure. While the connectivity between models, datasets, tasks, and other nodes of the pipeline is essential to learning an appropriate graph embedding, graph-based models such as graph convolutional neural networks or graph attention neural networks prioritize learning graph topology compared to node features (Section 6.4). For this reason, a custom aggregation model was proposed to learn embedding for each pipeline graph. <xref ref-type="table" rid="T5">Table 5</xref> shows the necessity of representing pipelines as a graph.</p>
</sec>
<sec>
<title>4.2.5 Objective function</title>
<p>To train the query embedding <italic>qe</italic><sub><italic>i</italic></sub> and the corresponding graph embedding <italic>ge</italic><sub><italic>i</italic></sub> to be closer in the embedding space, we use noise contrastive estimation (NCE) loss (Chen et al., <xref ref-type="bibr" rid="B10">2020</xref>). NCE loss has the ability to normalize large probability distributions making it effective for scalable training datasets. The equation for NCE loss is as follows:</p>
<disp-formula id="E8"><mml:math id="M17"><mml:mrow><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">NCE</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle><mml:mrow><mml:mo stretchy="true">(</mml:mo></mml:mrow></mml:mstyle><mml:mo class="qopname">log</mml:mo><mml:mfrac><mml:mrow><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" 
accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mo class="qopname">log</mml:mo><mml:mfrac><mml:mrow><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>g</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mi>q</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mstyle><mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
<p>where <italic>N</italic> is the batch size, <italic>ge</italic><sub><italic>i</italic></sub> and <italic>qe</italic><sub><italic>i</italic></sub> are the embeddings for the <italic>i</italic>-th instance in the batch, and <italic>k</italic> is the number of negative samples which is <italic>N</italic> &#x02212; 1 where <italic>i</italic> &#x02260; <italic>j</italic>.</p>
</sec>
</sec>
</sec>
<sec id="s5">
<title>5 Experimentation</title>
<p>In this section, we describe the evaluations performed to test the robustness of AIMKG and its recommendation ability.</p>
<sec>
<title>5.1 AIMKG robustness</title>
<p>To evaluate the reliability of our knowledge graph construction, we employ a statistical technique called bootstrapping (Anirudh and Thiagarajan, <xref ref-type="bibr" rid="B5">2019</xref>). We take a random sample of 75% of the data and utilize it to create a partial knowledge graph using our proposed approach (AIMKG). We repeat this process 10 times to generate 10 distinct knowledge graphs. A robust knowledge graph construction method should show low variance in node properties within partial graphs. We measure variance in node degrees and other distributional properties, comparing these with knowledge graphs built using the MLSchema ontology. The performance improvements are detailed in the result section.</p>
</sec>
<sec>
<title>5.2 Custom heuristics: qualitative analysis</title>
<p>Through user evaluation, we evaluate the ability of the custom heuristic function to rank similar tasks for a given unknown task and return its associated pipelines. Due to the absence of ground truth, we rely on domain experts to evaluate the relevance of results returned by the recommender. For comparison, a knowledge graph constructed using the same data but using the state-of-the-art MLSchema ontology is used. Using the custom heuristic function, the tasks in AIMKG and MLSchema-based KG are ranked and associated pipelines are returned to the domain experts for evaluation. The task nodes in AIMKG contain properties such as Name, Modality, Category, and Description. Modality and Category are computed using NLP techniques. In contrast, MLSchema-based task nodes only have properties such as Name, Description, and custom user-reported properties. Therefore, <italic>S</italic><sub><italic>mod</italic></sub> and <italic>S</italic><sub><italic>cat</italic></sub> from <xref ref-type="disp-formula" rid="E2">Equation 2</xref> are always 0 for the recommender that uses MLSchema-based KG whereas <italic>cos</italic>(<italic>e</italic><sub><italic>i</italic></sub>, <italic>e</italic><sub><italic>j</italic></sub>) and <italic>J</italic>(<italic>T</italic><sub><italic>i</italic></sub>, <italic>T</italic><sub><italic>j</italic></sub>) are calculated using the same procedure. We configure the recommender to return the top-k relevant tasks and n pipelines for each task, where k and n are set to 3. We randomly select query tasks from various AI fields from AIMKG (<xref ref-type="table" rid="T4">Table 4</xref>) and drop the query task node to simulate an unknown query task. This ensures the validity of the query task names. Eighteen domain experts aged between 24 and 50 participated in the evaluation study, each assigned 10 query tasks to determine the relevance of the recommendations provided.</p>
</sec>
<sec>
<title>5.3 Graph embedding learning</title>
<sec>
<title>5.3.1 Implementation details</title>
<p>We utilize the sentence transformer <italic>all-mpnet-base-v2</italic> to generate the text encodings for node features, semantic properties (knowledge), and queries generated by ChatGPT. The default embedding size of 768 was used. Each of these text encodings is transformed into a 1024-dimensional vector using a fully connected layer, one for the knowledge vector and another for the query encodings. We initialize the network with random weights for training. During training, for each pipeline, we randomly sample one query from the available generated queries. The batch size is set to 512. The Adam optimizer (Kingma and Ba, <xref ref-type="bibr" rid="B26">2014</xref>) was used with a learning rate of 10<sup>-4</sup> and weight decay set to 10<sup>-5</sup>. We employ early stopping to prevent the model from overfitting and train it for several epochs until it converges.</p>
</sec>
<sec>
<title>5.3.2 Evaluation protocols</title>
<p>The custom aggregation model learns a common embedding space to retrieve a process graph given a natural language query. These can be considered two modalities of data, namely, graph and text. Therefore, we evaluate the custom aggregation model described in Section 4.2.4 using retrieval metrics reported by Salvador et al. (<xref ref-type="bibr" rid="B39">2017</xref>). For a given query embedding <italic>qe</italic><sub><italic>i</italic></sub>, we retrieve the k closest graph embeddings <italic>ge</italic><sub>1&#x02026;<italic>k</italic></sub> using cosine similarity and present the results for k = 1, 3, and 5. We perform retrieval evaluation for 1,000 data samples and report results in <xref ref-type="table" rid="T5">Table 5</xref>. The definition of models reported is as follows:</p>
<list list-type="bullet">
<list-item><p><bold>GCN:</bold> A graph convolutional neural network that takes pipeline graph <italic>P</italic><sub><italic>i</italic></sub> with node encodings <italic>Ne</italic><sub><italic>i</italic></sub> to generate <italic>ge</italic><sub><italic>i</italic></sub> obtained using global mean pool of learned node embeddings.</p></list-item>
<list-item><p><bold>GAT:</bold> A graph attention neural network that takes pipeline graph <italic>P</italic><sub><italic>i</italic></sub> with node encodings <italic>Ne</italic><sub><italic>i</italic></sub> to generate <italic>ge</italic><sub><italic>i</italic></sub> obtained using global mean pool of learned node embeddings.</p></list-item>
<list-item><p><bold>Sent_Trans:</bold> Use a pretrained sentence transformer to generate <italic>ge</italic><sub><italic>i</italic></sub> using pipeline name and description. Use query text to generate <italic>qe</italic><sub><italic>i</italic></sub>. Both <italic>ge</italic><sub><italic>i</italic></sub> and <italic>qe</italic><sub><italic>i</italic></sub> are 768-dimensional vectors, as that is the default embedding size for sentence transformers.</p></list-item>
<list-item><p><bold>Sent_Trans_Finetune</bold>: Use <italic>qe</italic><sub><italic>i</italic></sub> and <italic>ge</italic><sub><italic>i</italic></sub> from <bold>Sent_Trans</bold> model and transform them into 1024-dimensional vector using a learnable fully connected layer.</p></list-item>
<list-item><p><bold>Custom_Agg:</bold> Model described in <xref ref-type="table" rid="T11">Algorithm 3</xref> that takes in node encodings <italic>Ne</italic><sub><italic>i</italic></sub> for each pipeline graph <italic>P</italic><sub><italic>i</italic></sub> to learn graph embedding <italic>he</italic><sub><italic>i</italic></sub> (equivalent of <italic>ge</italic><sub><italic>i</italic></sub> for this model).</p></list-item>
<list-item><p><bold>Custom_Agg_Knowledge</bold>: Model described in <xref ref-type="table" rid="T10">Algorithm 2</xref>. It takes the output from <bold>Custom_Agg</bold> <italic>he</italic><sub><italic>i</italic></sub> and transformed knowledge vector <italic>ke</italic><sub><italic>i</italic></sub> to learn <italic>ge</italic><sub><italic>i</italic></sub> &#x0003D; &#x003B1;.<italic>he</italic><sub><italic>i</italic></sub> &#x0002B; &#x003B2;.<italic>ke</italic><sub><italic>i</italic></sub> where &#x003B1; and &#x003B2; are learnable weights.</p></list-item>
</list>
</sec>
</sec>
</sec>
<sec id="s6">
<title>6 Result and discussion</title>
<sec>
<title>6.1 AIMKG overview</title>
<p>The statistical overview of AIMKG can be found in <xref ref-type="table" rid="T2">Table 2</xref>. The AIMKG consisting of knowledge graphs KG1, KG2, and KG3 contains 8 million nodes and 25 million relationships in label property graph (LPG) format. There are &#x0007E;78 million triples in RDF format which include the vector embeddings computed as properties. There are 11 types of nodes that represent each component of AI pipeline metadata and 13 types of relationships among those entities. Currently, the knowledge graph consists of 1.6 million AI pipelines executed for &#x0007E;10 k tasks with &#x0007E;53 k datasets and &#x0007E;280 k models. The knowledge graph is currently growing in size to include more pipelines and additional knowledge. A sample pipeline present in AIMKG is described in <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>. The details of system maintenance and performance are also included in <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Overall statistics of AIMKG.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Components</bold></th>
<th valign="top" align="left"><bold>Quantity</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">&#x00023; Nodes (LPG)</td>
<td valign="top" align="left">8 million</td>
</tr> <tr>
<td valign="top" align="left">&#x00023; Relationships (LPG)</td>
<td valign="top" align="left">25 million</td>
</tr> <tr>
<td valign="top" align="left">&#x00023; of triples (RDF)</td>
<td valign="top" align="left">78 million</td>
</tr> <tr>
<td valign="top" align="left">&#x00023; Types of nodes</td>
<td valign="top" align="left">14</td>
</tr> <tr>
<td valign="top" align="left">&#x00023; Types of relationships</td>
<td valign="top" align="left">15</td>
</tr>
<tr>
<td valign="top" align="left">&#x00023; AI pipelines</td>
<td valign="top" align="left">&#x0007E;1.6 million</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>6.2 AIMKG robustness</title>
<p>The first row in <xref ref-type="table" rid="T3">Table 3</xref> shows the results obtained from the knowledge graph constructed using the MLSchema ontology. It is evident that the variance in node degrees is higher compared to AIMKG. AIMKG demonstrates lower variance, confirming the robustness of the knowledge graph construction scheme. Furthermore, we observe that when using the MLSchema ontology, only 71% of the nodes are part of the largest connected component in the knowledge graph, while the remaining nodes are part of other disconnected components. In contrast, AIMKG includes 93% of the nodes in the largest connected component, indicating a more coherent graph structure for performing downstream tasks.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Comparative analysis of robustness of AIMKG and MLS-KG.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>KGs</bold></th>
<th valign="top" align="center"><bold>Mean variance</bold></th>
<th valign="top" align="center"><bold>Median degree</bold></th>
<th valign="top" align="center"><bold>Max degree</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">MLS-KG</td>
<td valign="top" align="center">3.465</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">103</td>
</tr>
<tr>
<td valign="top" align="left">AIMKG</td>
<td valign="top" align="center">2.383</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">389</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>6.3 Relevant pipeline recommendation using custom heuristics</title>
<p>The results of user agreement on the relevance of recommendations provided for query tasks are summarized in <xref ref-type="table" rid="T4">Table 4</xref>. The goal is to return relevant pipelines for a query about an unknown task. For each of the areas of computer vision, natural language processing, audio/speech, and video, 20 query tasks were evaluated. Due to the limited number of pipelines in AIMKG, 10 queries each were evaluated for multimodal and for other areas, which include graphs, reasoning, and game-related learning. In total, 100 queries were evaluated for each recommender. According to domain experts, the recommender utilizing AIMKG achieved relevant results for 78% of the queries, while the MLSchema-based recommender had a lower success rate of 51%. The Cohen&#x00027;s kappa score computed for the subset (25%) of the queries was found to be 0.657, which is considered a substantial agreement between the domain experts on the recommendation relevance. The computed semantic properties utilized by the custom heuristic function (<xref ref-type="disp-formula" rid="E2">Equation 2</xref>) played a significant role in understanding task nature and capturing synonyms. For example, a query for <italic>Dialogue Interpretation</italic> returned <italic>Dialogue Understanding</italic> as a relevant task, showcasing the recommender&#x00027;s ability to recognize synonyms.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>User evaluation study.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Areas</bold></th>
<th valign="top" align="center"><bold>AIMKG</bold></th>
<th valign="top" align="center"><bold>MLS-KG</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Computer vision</td>
<td valign="top" align="center">17/20</td>
<td valign="top" align="center">8/20</td>
</tr> <tr>
<td valign="top" align="left">Natural language processing</td>
<td valign="top" align="center">16/20</td>
<td valign="top" align="center">10/20</td>
</tr> <tr>
<td valign="top" align="left">Audio/speech</td>
<td valign="top" align="center">15/20</td>
<td valign="top" align="center">11/20</td>
</tr> <tr>
<td valign="top" align="left">Video</td>
<td valign="top" align="center">15/20</td>
<td valign="top" align="center">10/20</td>
</tr> <tr>
<td valign="top" align="left">Multimodal</td>
<td valign="top" align="center">6/10</td>
<td valign="top" align="center">6/10</td>
</tr> <tr>
<td valign="top" align="left">Other</td>
<td valign="top" align="center">9/10</td>
<td valign="top" align="center">6/10</td>
</tr>
<tr>
<td valign="top" align="left">Total</td>
<td valign="top" align="center">78/100</td>
<td valign="top" align="center">51/100</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Relevance of recommendations produced by AIMKG and MLS-KG recommenders.</p>
</table-wrap-foot>
</table-wrap>
<p>In computer vision queries, there was a notable difference in relevance scores due to a significant number of challenging queries that did not explicitly mention the word &#x0201C;image.&#x0201D; For example, <italic>3D object detection</italic> and <italic>3D human pose estimation</italic> do not have the word image in them, but they are image-based tasks. Similarly, NLP-based tasks also benefited from semantic enhancements present in AIMKG. Video-based tasks are extensions of computer vision-based tasks that include a temporal factor. Therefore, like computer vision-based queries, a significant number of video-based queries did not explicitly have the word &#x0201C;video&#x0201D; in them. Some examples include <italic>motion detection</italic> and <italic>human movement detection</italic>. The category other was challenging for both recommenders as the vocabulary curated for these areas is relatively small to identify modalities of these tasks.</p>
<p>When AIMKG is deployed as an open-source platform, it serves as a curated knowledge repository of open-source AI innovations that are searchable, discoverable, and executable. Users can search among 280 k models, 53 k datasets used for 10 k tasks at one place. It is an AI exploration and experimentation platform that hosts, serves, and refreshes state-of-the-art open-source AI innovations. This enables the reproduction of AI pipelines, including data preprocessing, pretraining, fine-tuning, and model deployment, which are impactful across various use cases. The broader practical impacts of AIMKG in fields such as healthcare, finance, and legal for pipeline optimization through relevant pipeline recommendation can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>.</p>
</sec>
<sec>
<title>6.4 Relevant pipeline recommendation using graph learning</title>
<p>From <xref ref-type="table" rid="T5">Table 5</xref>, it can be observed that our <bold>Custom_Agg_Knowledge</bold> model performed the best and <bold>Custom_Agg</bold> performed second best against other baseline approaches. The difference is that the former model utilizes semantic properties computed for various pipeline graph component entities as described in Section 3.4.2. Furthermore, the proposed method demonstrated statistically significant improvements over baseline methods, confirmed by both Friedman&#x00027;s test (<italic>p</italic> &#x0003C; 0.01) and pairwise Wilcoxon signed-rank tests (<italic>p</italic> &#x0003C; 0.01). The results of sensitivity analysis and ablation study can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>. <bold>GCN</bold> and <bold>GAT</bold> models weigh in more on learning the topological structure of the graphs compared to node features. In AIMKG, all pipeline graphs follow similar graph structure defined by CMO. To add, most information about the pipeline is present as text in the node features. Due to this, the text information gets diluted over graph structure in <bold>GCN</bold> and <bold>GAT</bold> models. As expected, these models have the least retrieval scores. The sentence transformer model was evaluated with and without fine-tuning to test whether pipeline descriptions (abstract, model-card) suffice for relevant retrieval. The fine-tuned model performed better for the HuggingFace and Combined datasets but not for Papers-with-Code, likely due to its detailed description on pipelines already present in abstract. Fine-tuning may have caused embedding instability for Papers-with-Code, while it improved accuracy for HuggingFace and the Combined dataset.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Retrieval results of models for 1,000 datapoints, reported in percentage.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Models</bold></th>
<th valign="top" align="center" colspan="3"><bold>Papers-with-Code</bold></th>
<th valign="top" align="center" colspan="3"><bold>Huggingface</bold></th>
<th valign="top" align="center" colspan="3"><bold>Combined</bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#919498;color:#ffffff">
<td/>
<td valign="top" align="center"><bold>R1</bold></td>
<td valign="top" align="center"><bold>R3</bold></td>
<td valign="top" align="center"><bold>R5</bold></td>
<td valign="top" align="center"><bold>R1</bold></td>
<td valign="top" align="center"><bold>R3</bold></td>
<td valign="top" align="center"><bold>R5</bold></td>
<td valign="top" align="center"><bold>R1</bold></td>
<td valign="top" align="center"><bold>R3</bold></td>
<td valign="top" align="center"><bold>R5</bold></td>
</tr> <tr>
<td valign="top" align="left">GAT</td>
<td valign="top" align="center">47.3</td>
<td valign="top" align="center">65.8</td>
<td valign="top" align="center">72.2</td>
<td valign="top" align="center">33</td>
<td valign="top" align="center">51.8</td>
<td valign="top" align="center">60.5</td>
<td valign="top" align="center">44.3</td>
<td valign="top" align="center">60</td>
<td valign="top" align="center">66.2</td>
</tr> <tr>
<td valign="top" align="left">GCN</td>
<td valign="top" align="center">52.3</td>
<td valign="top" align="center">66.1</td>
<td valign="top" align="center">72.5</td>
<td valign="top" align="center">39</td>
<td valign="top" align="center">56.9</td>
<td valign="top" align="center">64.7</td>
<td valign="top" align="center">48.4</td>
<td valign="top" align="center">62.8</td>
<td valign="top" align="center">69</td>
</tr> <tr>
<td valign="top" align="left">Sent_Trans</td>
<td valign="top" align="center">82.6</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">91.4</td>
<td valign="top" align="center">25.8</td>
<td valign="top" align="center">38.5</td>
<td valign="top" align="center">44.8</td>
<td valign="top" align="center">57.6</td>
<td valign="top" align="center">66.8</td>
<td valign="top" align="center">70.7</td>
</tr> <tr>
<td valign="top" align="left">Sent_Trans_Finetune</td>
<td valign="top" align="center">65.4</td>
<td valign="top" align="center">80</td>
<td valign="top" align="center">83.2</td>
<td valign="top" align="center">47.8</td>
<td valign="top" align="center">65.6</td>
<td valign="top" align="center">73.2</td>
<td valign="top" align="center">69.0</td>
<td valign="top" align="center">79.6</td>
<td valign="top" align="center">83.6</td>
</tr> <tr>
<td valign="top" align="left">Custom_Agg</td>
<td valign="top" align="center">85.9</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">92.2</td>
<td valign="top" align="center">55.9</td>
<td valign="top" align="center">69.2</td>
<td valign="top" align="center">73.9</td>
<td valign="top" align="center">74.8</td>
<td valign="top" align="center">82.7</td>
<td valign="top" align="center">85.8</td>
</tr>
<tr>
<td valign="top" align="left">Custom_Agg_Knowledge</td>
<td valign="top" align="center"><bold>87.1</bold></td>
<td valign="top" align="center"><bold>91.5</bold></td>
<td valign="top" align="center"><bold>94.1</bold></td>
<td valign="top" align="center"><bold>58</bold></td>
<td valign="top" align="center"><bold>71.1</bold></td>
<td valign="top" align="center"><bold>75.7</bold></td>
<td valign="top" align="center"><bold>76.3</bold></td>
<td valign="top" align="center"><bold>85.4</bold></td>
<td valign="top" align="center"><bold>87.7</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold indicates the best performing model.</p>
</table-wrap-foot>
</table-wrap>
<p>In summary, the <bold>Sent_Trans</bold> and <bold>Sent_Trans_Finetune</bold> results show that pipeline graphs are essential for effective retrieval, capturing relationships between datasets, models, tasks, and entities. Descriptions from Papers-with-Code and HuggingFace are limited. Traditional graph models such as <bold>GCN</bold> and <bold>GAT</bold> underperformed on AIMKG due to their focus on topology over node features. The proposed custom aggregation model, emphasizing node features, outperformed others with added knowledge-boosting results.</p>
</sec>
<sec>
<title>6.5 Pipeline optimization</title>
<p>In this section, we present the results on utilizing recommendations from AIMKG to seed the AI pipeline experimentations. Existing work by Pedretti et al. (<xref ref-type="bibr" rid="B37">2023</xref>) demonstrated the use of novel in-memory accelerator engines to speed-up the inference of tree-based machine learning models for heterogeneous (tabular) data, the most widely used type of data across various industries. We employed seven widely used real-world tabular datasets for binary/multi-class classification and regression problems from research papers. In this section, we demonstrate the improvements in executing the hyperparameter optimization AI pipelines for gradient-boosted trees (XGBoost; Chen and Guestrin, <xref ref-type="bibr" rid="B9">2016</xref>) on several binary and multi-class classification problems from that paper. Concretely, we collected results from hyperparameter optimization pipelines for four datasets (Eye Movement, Gas Concentration, Gesture Phase Segmentation, and Rossmann Stores Sales). We then imported the pipeline performance data into the AIMKG using one of the developed parsers and asked it to recommend pipeline configurations for new, previously unseen, similar problems&#x02014;churn modeling, telco customer churn, and forest cover type. We then used these recommendations from AIMKG to warm-up the Bayesian (TPE&#x02014;tree-structured Parzen Estimators; Bergstra et al., <xref ref-type="bibr" rid="B7">2011</xref>) hyperparameter optimization. We compared results with the reference results where no warm-up initialization was made. <xref ref-type="table" rid="T6">Table 6</xref> shows three datasets. For each dataset, we report observed speed-up (wall time) to optimize hyperparameters of respective models to same or lower loss. In addition, we observed that the final loss was lower compared to experiments without warm-up initialization.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Result of pipeline optimization achieved using AIMKG recommender.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left" colspan="2"><bold>Churn modeling</bold></th>
<th valign="top" align="center" colspan="2"><bold>TelcoCustomerChurn</bold></th>
<th valign="top" align="center" colspan="2"><bold>ForestCoverType</bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#919498;color:#ffffff">
<td valign="top" align="left"><bold>SpeedUp</bold></td>
<td valign="top" align="center"><bold>LossDiff</bold></td>
<td valign="top" align="center"><bold>SpeedUp</bold></td>
<td valign="top" align="center"><bold>LossDiff</bold></td>
<td valign="top" align="center"><bold>SpeedUp</bold></td>
<td valign="top" align="center"><bold>LossDiff</bold></td>
</tr>
<tr>
<td valign="top" align="left">8.60</td>
<td valign="top" align="center">&#x02013;0.11%</td>
<td valign="top" align="center">11.84</td>
<td valign="top" align="center">&#x02013;0.63%</td>
<td valign="top" align="center">1.47</td>
<td valign="top" align="center">&#x02013;0.02%</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>6.6 Additional attributes of AIMKG</title>
<sec>
<title>6.6.1 Search using semantic enrichment and graph traversal</title>
<p>We illustrate in <xref ref-type="fig" rid="F5">Figure 5</xref> the potential of AIMKG to perform complex queries that utilize a combination of the custom heuristic functions in Section 4.1 and graph traversal to return the desired results. We queried the recommender to return datasets and models used for the <italic>image detection</italic> task. Since this task does not exist in the repository, the recommender identifies <italic>2d object detection</italic> and <italic>3d object detection</italic> as similar tasks using the heuristic function in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>. Even though the task names do not explicitly mention the word &#x0201C;image,&#x0201D; they are identified as image-based tasks due to the semantic property <italic>modality</italic>. In addition, the recommender traverses the path from <italic>Task</italic> &#x02192; <italic>Pipeline</italic> &#x02192; <italic>Stage</italic> &#x02192; <italic>Execution</italic> &#x02192; <italic>Artifact</italic> &#x02192; <italic>Dataset</italic> and <italic>Task</italic> &#x02192; <italic>Pipeline</italic> &#x02192; <italic>Stage</italic> &#x02192; <italic>Execution</italic> &#x02192; <italic>Artifact</italic> &#x02192; <italic>Model</italic> to retrieve models and datasets. More sample queries and their results can be found at Venkataramanan (<xref ref-type="bibr" rid="B46">2023</xref>).</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Sample query: list all the image detection pipelines with datasets and evaluations. The tasks &#x0201C;3d object detection&#x0201D; and &#x0201C;2d object detection&#x0201D; are returned by AIMKG even though they contain no explicit mention of &#x0201C;image.&#x0201D; In addition, the graph is traversed from task to datasets and models to identify the models and datasets used for the image detection task. More sample queries of pipelines with hyperparameters can be found at Venkataramanan (<xref ref-type="bibr" rid="B46">2023</xref>).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0005.tif"/>
</fig>
</sec>
<sec>
<title>6.6.2 Relevant pipeline recommendation using graph traversal</title>
<p>In addition to <xref ref-type="disp-formula" rid="E3">Equation 3</xref>, similar datasets can also be obtained through graph traversal as shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. The query is to return datasets similar to <italic>Awesome-chatgpt-prompts</italic>. Using the inference that if the datasets are used for the same task, they can be similar in certain aspects, we performed a graph traversal query, and the resulting graph is shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. To perform the same query, that is, to return datasets similar to a given dataset, other kinds of inferences can be used, such as (i) if the datasets are used in the same pipeline, they can be considered similar; (ii) if the datasets are used in the same pipeline with the same model, they can be considered similar, and so on.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Query: identify datasets similar to <italic>Awesome-chatgpt-prompts</italic>. In this example, similar datasets were identified based on graph traversal. That is, if the datasets are used for the same task, they might be similar in certain aspects.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0006.tif"/>
</fig>
</sec>
<sec>
<title>6.6.3 AIMKG: dynamic AI pipeline knowledge repository</title>
<p>AIMKG is a constantly evolving graph that updates itself periodically by fetching data from Papers-with-Code, OpenML, and HuggingFace. We are also working toward including other metadata sources mentioned in <xref ref-type="table" rid="T1">Table 1</xref>. This iterative process of periodic updates involves continuous monitoring, ensuring that the graph remains current and reflective of the evolving information landscape. Given that the AI domain is ever-changing, with new models being introduced and manuscripts being published, it is imperative that AIMKG be live and dynamic. We demonstrate the importance of maintaining a live pipeline for AIMKG using the example described in <xref ref-type="table" rid="T7">Table 7</xref>.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Comparison of AIMKG at two different timestamps.</p></caption>
<table frame="box" rules="none">
<tbody>
<tr>
<td valign="top" align="left"><bold>Timestamp 1</bold></td>
</tr>
<tr>
<td valign="top" align="left"><bold>Input Query</bold>: Models and Pipelines for the task &#x0201C;Question Answering&#x0201D;</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Response:</bold></td>
</tr>
<tr>
<td valign="top" align="left">Result-1:</td>
</tr>
<tr>
<td valign="top" align="left">Model: (i) BERT, (ii) GELU,</td>
</tr>
<tr>
<td valign="top" align="left">Pipeline: Leveraging Commonsense Knowledge on Classifying False News and Determining Checkworthiness of Claims,</td>
</tr>
<tr>
<td valign="top" align="left">URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2108.03731v1.pdf">https://arxiv.org/pdf/2108.03731v1.pdf</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Code: none,</td>
</tr>
<tr>
<td valign="top" align="left">Year of publication: 2021</td>
</tr>
<tr>
<td valign="top" align="left">Result-2:</td>
</tr>
<tr>
<td valign="top" align="left">Model: (i) VisualBERT, (ii) Learning Cross-Modality Encoder Representations from Transformers</td>
</tr>
<tr>
<td valign="top" align="left">Pipeline: &#x0201C;A Comparison of Pre-trained Vision-and-Language Models for Multimodal Representation Learning across Medical Images and Reports,&#x0201D;</td>
</tr>
<tr>
<td valign="top" align="left">URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2009.01523v1.pdf">https://arxiv.org/pdf/2009.01523v1.pdf</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Code: <ext-link ext-link-type="uri" xlink:href="https://github.com/YIKUAN8/Transformers-VQA7">https://github.com/YIKUAN8/Transformers-VQA7</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Year of publication: 2020</td>
</tr>
<tr>
<td valign="top" align="left">...</td>
</tr>
<tr>
<td valign="top" align="left" style="border-top: thin solid #000000;"><bold>Timestamp 2</bold></td>
</tr>
<tr>
<td valign="top" align="left"><bold>Input Query:</bold> Models and Pipelines for the task &#x0201C;Question Answering&#x0201D;</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Response:</bold></td>
</tr>
<tr>
<td valign="top" align="left">Result-1:</td>
</tr>
<tr>
<td valign="top" align="left">Model: mulinski/bert-finetuned-squad,</td>
</tr>
<tr>
<td valign="top" align="left">Pipeline: Question Answering using bert-finetuned-squad,</td>
</tr>
<tr>
<td valign="top" align="left">URL: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/mulinski/bert-finetuned-squad">https://huggingface.co/mulinski/bert-finetuned-squad</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Code: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/mulinski/bert-finetuned-squad/tree/main">https://huggingface.co/mulinski/bert-finetuned-squad/tree/main</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Year of publication: 2023</td>
</tr>
<tr>
<td valign="top" align="left">Result-2:</td>
</tr>
<tr>
<td valign="top" align="left">Model: dantern/xlm-roberta-base-vn-dplat,</td>
</tr>
<tr>
<td valign="top" align="left">Pipeline: Question Answering using dantern/xlm-roberta-base-vn-dplat,</td>
</tr>
<tr>
<td valign="top" align="left">URL: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/dantern/xlm-roberta-base-vn-dplat">https://huggingface.co/dantern/xlm-roberta-base-vn-dplat</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Code: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/dantern/xlm-roberta-base-vn-dplat/tree/main">https://huggingface.co/dantern/xlm-roberta-base-vn-dplat/tree/main</ext-link>,</td>
</tr>
<tr>
<td valign="top" align="left">Year of publication: 2023</td>
</tr>
<tr>
<td valign="top" align="left">. . .</td>
</tr></tbody>
</table>
</table-wrap>
<p>We queried AIMKG to return pipelines and models for the task <italic>Question Answering</italic>. Before the integration of the most recent models from HuggingFace, AIMKG returned pipelines that were published in 2021 and 2020, respectively. Each of these pipelines used two models in their experimentation. When the same query was run at a different timestamp, after integrating the most recent models, it returned <italic>bert-finetuned-squad</italic> and <italic>xlm-roberta-base-vn-dplat</italic> as the models used for <italic>Question Answering</italic> along with their pipelines. These models were published in 2023. The result from AIMKG now contains the most recent models used for <italic>Question Answering</italic>. This self-updating mechanism not only enhances the graph&#x00027;s comprehensiveness but also ensures that it consistently serves as a reliable and up-to-date resource for users seeking the latest insights and connections within the represented domain. System maintenance and performance details are included in the <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>.</p>
</sec>
<sec>
<title>6.6.4 Integration of multiple data sources</title>
<p>As mentioned in Section 3, AIMKG consists of pipeline metadata obtained from multiple sources such as Papers-with-Code, OpenML, and HuggingFace. It is worth noting that Papers-with-Code and HuggingFace have information overlap to a certain degree. While the overlap has been identified and unified, it has also helped in pipeline completion in certain cases. For example, in <xref ref-type="fig" rid="F7">Figure 7</xref>, AIMKG had the pipeline name and report from Papers-with-Code. For these pipelines, the model and dataset information is not available via the Papers-with-Code API as they were not recorded by users explicitly. On the other hand, HuggingFace had the model and dataset information for these pipelines. By utilizing the paper&#x00027;s arXiv ID and title, our AIMKG construction pipeline identified that these two are from the same pipeline by mapping them to CMO. While there are several such examples, a few of them are included in <xref ref-type="fig" rid="F7">Figure 7</xref> to demonstrate the concept of integrating data sources to aid in the completion of pipeline metadata. Ontologies and knowledge graphs excel at the task of recognizing identical concepts present in various data sources. The ability of ontologies and knowledge graphs to discern shared meanings has enabled AIMKG to identify identical concepts from disparate data contexts.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Sample of pipeline completion done by AIMKG by identifying identical concepts from Papers-with-Code and HuggingFace.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1476506-g0007.tif"/>
</fig>
</sec>
</sec>
<sec>
<title>6.7 Comparison with ChatGPT</title>
<p>As Large Language Models (LLMs) have been known to perform several tasks, we compared our AIMKG recommender with ChatGPT-3.5 (OpenAI, <xref ref-type="bibr" rid="B34">2022</xref>) on the AI pipeline recommendation task. We queried both our AIMKG recommender and ChatGPT to return pipelines executed for the task <italic>Supervised Anomaly Detection</italic> and reported the results in <xref ref-type="table" rid="T8">Table 8</xref>. Anomaly detection is the task of detecting data instances that significantly deviate from the majority of the data points (Pang et al., <xref ref-type="bibr" rid="B35">2021</xref>). ChatGPT offers an interactive framework for querying, and it returns the models and their papers for common tasks such as 3D-aware image synthesis. The model seems to generate relevant information for familiar tasks from computer vision or natural language processing. However, the model seems to generate non-existent information for less familiar tasks such as anomaly detection or sequence-to-sequence speech generation. ChatGPT was prompted with a sample query and a sample expected response, as shown in <xref ref-type="table" rid="T8">Table 8</xref>. The model generated a response similar to the one given in the prompt. However, the pipelines returned by ChatGPT, in terms of papers, seem to be non-existent in the literature. That is, ChatGPT exhibited instances of hallucination, generating non-existent information in a convincingly realistic manner, which is a well-known issue (Zuccon et al., <xref ref-type="bibr" rid="B50">2023</xref>). In the response presented in <xref ref-type="table" rid="T8">Table 8</xref>, all of the GitHub links return a 404 error when accessed. For all three recommended papers, the arXiv link is the same, and it does not belong to any of the recommended papers. Paper 1, which was recommended with the title &#x0201C;DeepOne-Class: Outlier Detection with Deep Learning,&#x0201D; does not exist. This means the model and dataset generated in the response are also in question. For paper 2 and paper 3, the titles are partially correct, but papers with these exact titles do not exist. Similarly, the datasets mentioned for paper 2 and paper 3 are not the datasets used in the actual papers. It is evident that the recommendations from ChatGPT are not reliable enough to reproduce or train an AI pipeline. More example responses along with explanations can be found in the <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Comparison of AIMKG recommender with ChatGPT.</p></caption>
<table frame="box" rules="none">
<tbody>
<tr>
<td valign="top" align="left"><bold>AIMKG results</bold></td>
</tr>
<tr>
<td valign="top" align="left"><bold>Input query</bold>: Supervised Anomaly Detection</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Response</bold>: Since the exact match is not found, we recommend the following pipelines from relevant task:</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Recommendation 1:</bold></td>
</tr>
<tr>
<td valign="top" align="left">Similar Task Name: Weakly Supervised Defect Detection</td>
</tr>
<tr>
<td valign="top" align="left">Similarity: 0.4752</td>
</tr>
<tr>
<td valign="top" align="left">Pipline-1: S2D2Net: An Improved Approach For Robust Steel Surface Defects Diagnosis With Small Sample Learning, URL: <ext-link ext-link-type="uri" xlink:href="https://ieeexplore.ieee.org/document/9506405">https://ieeexplore.ieee.org/document/9506405</ext-link>, Git: <ext-link ext-link-type="uri" xlink:href="https://github.com/vikxoxo/S2D2Net">https://github.com/vikxoxo/S2D2Net</ext-link></td>
</tr>
<tr>
<td valign="top" align="left">Pipline-2: Mixed supervision for surface-defect detection: from weakly to fully supervised learning, URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2104.06064v3.pdf">https://arxiv.org/pdf/2104.06064v3.pdf</ext-link>, Git: <ext-link ext-link-type="uri" xlink:href="https://github.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision">https://github.com/aws-samples/amazon-sagemaker-edge-defect-detection-computer-vision</ext-link></td>
</tr>
<tr>
<td valign="top" align="left">...</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Recommendation 2:</bold></td>
</tr>
<tr>
<td valign="top" align="left">Similar Task Name: Semi-supervised Anomaly Detection</td>
</tr>
<tr>
<td valign="top" align="left">Similarity: 0.4643</td>
</tr>
<tr>
<td valign="top" align="left">Pipline-1: Deep Semi-Supervised Anomaly Detection, URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/1906.02694v2.pdf">https://arxiv.org/pdf/1906.02694v2.pdf</ext-link>, Git: <ext-link ext-link-type="uri" xlink:href="https://github.com/Fraunhofer-AISEC/R2-AD2">https://github.com/Fraunhofer-AISEC/R2-AD2</ext-link></td>
</tr>
<tr>
<td valign="top" align="left">Pipline-2: Real-world Anomaly Detection in Surveillance Videos, URL: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/pdf/1801.04264v3.pdf">http://arxiv.org/pdf/1801.04264v3.pdf</ext-link>, Git: <ext-link ext-link-type="uri" xlink:href="https://github.com/ShreyasArthur/AbnormalEventDetection">https://github.com/ShreyasArthur/AbnormalEventDetection</ext-link></td>
</tr>
<tr>
<td valign="top" align="left">...</td>
</tr>
<tr>
<td valign="top" align="left" style="border-top: thin solid #000000;"><bold>ChatGPT results</bold></td>
</tr>
<tr>
<td valign="top" align="left"><bold>Input prompt:</bold> Create a session where you are an AI pipeline recommender. I will request for a pipeline that has been executed in the past. You can use any source you want. For example, when I request to recommend a pipeline for &#x0201C;3d video object tracking&#x0201D;, I expect a response as follows:</td>
</tr>
<tr>
<td valign="top" align="left">Similar Task Name: 3D Object Tracking, Similarity: 0.6083, Piplines: Exploring Point-BEV Fusion for 3D Point Cloud Object Tracking with Transformer, URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2208.05216v1.pdf">https://arxiv.org/pdf/2208.05216v1.pdf</ext-link>, Model: Transformer, Dataset: KITTI, Github: <ext-link ext-link-type="uri" xlink:href="https://github.com/jasonkks/pttr">https://github.com/jasonkks/pttr</ext-link> Piplines: MOTSLAM: MOT-assisted monocular dynamic SLAM using single-view depth estimation, URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2210.02038v1.pdf">https://arxiv.org/pdf/2210.02038v1.pdf</ext-link>, Model: SLAM, Dataset: KITTI, Github: none</td>
</tr>
<tr>
<td valign="top" align="left">Now give me the pipelines executed for supervised anomaly detection.</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Response:</bold> Similar Task Name: Supervised Anomaly Detection, Similarity: 0.7254</td>
</tr>
<tr>
<td valign="top" align="left">1) Paper Title: &#x0201C;DeepOne-Class: Outlier Detection with Deep Learning,&#x0201D; URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1802.06360">https://arxiv.org/abs/1802.06360</ext-link>, Model: Deep Neural Networks, Dataset: Credit Card Fraud, NAB, Github: <ext-link ext-link-type="uri" xlink:href="https://github.com/hpzhao/Anomaly-Detection-DeepOneClass">https://github.com/hpzhao/Anomaly-Detection-DeepOneClass</ext-link></td>
</tr>
<tr>
<td valign="top" align="left">2) Paper Title: &#x0201C;Autoencoder-Based Anomaly Detection,&#x0201D; URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1802.06360">https://arxiv.org/abs/1802.06360</ext-link>, Model: Autoencoder, Dataset: MNIST, CIFAR-10, Github: <ext-link ext-link-type="uri" xlink:href="https://github.com/abnormal-1/anomaly-detection-resources">https://github.com/abnormal-1/anomaly-detection-resources</ext-link></td>
</tr>
<tr>
<td valign="top" align="left">3) Paper Title: &#x0201C;LSTM-based Encoder-Decoder for Anomaly Detection,&#x0201D; URL: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1802.06360">https://arxiv.org/abs/1802.06360</ext-link>, Model: Long Short-Term Memory (LSTM), Dataset: Time Series Anomaly Detection Benchmark, Github: <ext-link ext-link-type="uri" xlink:href="https://github.com/ChunFan-AST/LSTM_Anomaly_Detection">https://github.com/ChunFan-AST/LSTM_Anomaly_Detection</ext-link></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>AIMKG responses can include the model and dataset for each pipeline. Please refer to <xref ref-type="table" rid="T7">Table 7</xref> for model-related examples. Please refer to the <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref> for more experiments with ChatGPT.</p>
</table-wrap-foot>
</table-wrap>
<p>On the other hand, AIMKG returns a list of relevant pipelines for <italic>Supervised Anomaly Detection</italic> tasks as the exact match of the task is unavailable. For brevity, the responses currently include the paper and git repository from which the pipeline can be reproduced. However, AIMKG can also list associated datasets, models, and metrics for some of these pipelines. It is noteworthy that the first recommendation task, <italic>Weakly Supervised Defect Detection</italic>, did not explicitly mention the word <italic>anomaly</italic>. However, our recommender captured that <italic>defect detection</italic> is synonymous with <italic>anomaly detection</italic> in the domain of AI by just using a model pretrained for generic tasks. This also demonstrates the efficiency of embedding and semantic property-based ranking functions described in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>. To add, the AIMKG recommender is explainable by design and the results are explainable.</p>
<p>To summarize, AIMKG produces relevant explainable results and also ensures the reproducibility of the recommended pipelines. While ChatGPT may respond with relevant models for familiar tasks, it hallucinates for many other cases, making it unreliable. Though ChatGPT has access to the data sources AIMKG is constructed with, it cannot construct an AI pipeline from the information available to it. Therefore, the construction of AIMKG enhanced with semantic knowledge is essential to recommend relevant pipelines to users.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="s7">
<title>7 Conclusion</title>
<p>In this study, we proposed Common Metadata Ontology (CMO) to construct an Artificial Intelligence pipeline Metadata Knowledge Graph (AIMKG), a first-of-its-kind knowledge graph for AI pipelines. AI pipeline metadata from open sources such as Papers-with-Code, OpenML, and HuggingFace are integrated into AIMKG, resulting in 1.6 million pipelines with semantic enhancements. The semantic enhancements incorporated in AIMKG capture implicit knowledge (<xref ref-type="fig" rid="F5">Figure 5</xref>) and enhance reasoning capabilities. AIMKG can also store multimodal data types such as embeddings of task, dataset, model, and pipeline nodes, supporting text, numeric, and vector data types. Using the computed semantic properties and embeddings, we introduced a custom heuristic ranking metric to rank relevant pipelines for recommendations using task, dataset, or model. The custom heuristic ranking function captured the underlying semantics of the pipeline entities, resulting in more relevant recommendations than the MLSchema-based recommender. The semantic properties also enhance search, as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. To enable natural language queries for pipelines, we proposed a custom graph embedding aggregation model to retrieve and recommend relevant pipelines. We also demonstrated the potential of AIMKG in optimizing pipelines by seeding them with relevant recommendations. Therefore, AIMKG is an atlas for navigating the rapidly evolving artificial intelligence world.</p>
<p>Currently, not all tasks and datasets in AIMKG have computed semantic properties such as modalities and categories. To address this, we plan to leverage reports and manuscripts associated with pipelines to automatically compute these properties, reducing the biases associated with manually curated vocabularies. In addition, we intend to calculate further semantic and statistical properties for datasets and models, such as dataset image size, color scale, number of classes, data points per class, and model type. We aim to integrate metadata from other open-source repositories, such as Kaggle and the Common Metadata Framework (Koomthanam et al., <xref ref-type="bibr" rid="B27">2024</xref>), into AIMKG. To enrich recommendations and ensure completeness for all 1.6 million pipelines, we plan to utilize fine-tuned language models for extracting information from research papers. Although community-driven sources such as Papers-with-Code, OpenML, and Hugging Face are widely used, they may contain metadata inaccuracies. To improve accuracy and reliability, we will implement robust metadata validation techniques (Soedarmadji et al., <xref ref-type="bibr" rid="B43">2019</xref>; Aggour et al., <xref ref-type="bibr" rid="B3">2017</xref>). In the future, we also envision interfacing AIMKG with large language models (LLMs), enabling users to query pipeline lineage, models, datasets, tasks, and other components through an interactive interface.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The code repository for the AI pipeline Metadata Knowledge Graph can be found at <ext-link ext-link-type="uri" xlink:href="https://github.com/HewlettPackard/ai-metadata-knowledge-graph">https://github.com/HewlettPackard/ai-metadata-knowledge-graph</ext-link>. Further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>RV: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. AT: Conceptualization, Data curation, Funding acquisition, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing. TK: Conceptualization, Investigation, Methodology, Supervision, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SS: Formal analysis, Investigation, Methodology, Resources, Supervision, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. AJ: Data curation, Methodology, Project administration, Supervision, Writing &#x02013; review &#x00026; editing. ASha: Data curation, Resources, Validation, Writing &#x02013; review &#x00026; editing. SB: Conceptualization, Funding acquisition, Investigation, Methodology, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing. MF: Conceptualization, Funding acquisition, Project administration, Resources, Supervision, Validation, Writing &#x02013; review &#x00026; editing. PF: Funding acquisition, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing. KR: Conceptualization, Validation, Writing &#x02013; review &#x00026; editing. AShe: Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was partly supported by National Science Foundation Award 2119654 &#x0201C;RII Track 2 FEC: Enabling Factory to Factory (F2F) Networking for Future Manufacturing.&#x0201D;</p>
</sec>
<ack>
<p>We thank the cloud services team at Hewlett Packard Labs for their continued support in providing GPUs and CPUs, crucial in hosting our graph and running evaluations. We also thank Hong Yung Yip who shared his expertise on knowledge graph construction. The authors acknowledge the use of ChatGPT-4, an AI language model developed by OpenAI, to assist in rephrasing text, checking grammar, and improving the overall clarity of the manuscript. The authors affirm that no original content was generated by the AI, and all ideas and conclusions are solely those of the authors.</p>
</ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>RV, AT, TK, SS, AJ, AS, SB, MF, and PF were employed at Hewlett Packard Enterprise Labs. The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Author disclaimer</title>
<p>Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the National Science Foundation.</p>
</sec>
<sec sec-type="supplementary-material" id="s13">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdata.2024.1476506/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdata.2024.1476506/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.PDF" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Abdelaziz</surname> <given-names>I.</given-names></name> <name><surname>Dolby</surname> <given-names>J.</given-names></name> <name><surname>McCusker</surname> <given-names>J.</given-names></name> <name><surname>Srinivas</surname> <given-names>K.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;A toolkit for generating code knowledge graphs,&#x0201D;</article-title> in <source>Proceedings of the 11th on Knowledge Capture Conference</source>, 137&#x02013;144. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3460210.3493578">https://dl.acm.org/doi/abs/10.1145/3460210.3493578</ext-link></citation>
</ref>
<ref id="B2">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Achille</surname> <given-names>A.</given-names></name> <name><surname>Lam</surname> <given-names>M.</given-names></name> <name><surname>Tewari</surname> <given-names>R.</given-names></name> <name><surname>Ravichandran</surname> <given-names>A.</given-names></name> <name><surname>Maji</surname> <given-names>S.</given-names></name> <name><surname>Fowlkes</surname> <given-names>C. C.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;Task2Vec: Task embedding for meta-learning,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Seoul</publisher-loc>), <fpage>6430</fpage>&#x02013;<lpage>6439</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://openaccess.thecvf.com/content_ICCV_2019/html/Achille_Task2Vec_Task_Embedding_for_Meta-Learning_ICCV_2019_paper.html">https://openaccess.thecvf.com/content_ICCV_2019/html/Achille_Task2Vec_Task_Embedding_for_Meta-Learning_ICCV_2019_paper.html</ext-link></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aggour</surname> <given-names>K. S.</given-names></name> <name><surname>Williams</surname> <given-names>J. W.</given-names></name> <name><surname>McHugh</surname> <given-names>J.</given-names></name> <name><surname>Kumar</surname> <given-names>V. S.</given-names></name></person-group> (<year>2017</year>). <article-title>Colt: concept lineage tool for data flow metadata capture and analysis</article-title>. <source>Proc. VLDB Endow</source>. <volume>10</volume>, <fpage>1790</fpage>&#x02013;<lpage>1801</lpage>. <pub-id pub-id-type="doi">10.14778/3137765.3137783</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Akhtar</surname> <given-names>M.</given-names></name> <name><surname>Benjelloun</surname> <given-names>O.</given-names></name> <name><surname>Conforti</surname> <given-names>C.</given-names></name> <name><surname>Gijsbers</surname> <given-names>P.</given-names></name> <name><surname>Giner-Miguelez</surname> <given-names>J.</given-names></name> <name><surname>Jain</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>&#x0201C;Croissant: a metadata format for ML-ready datasets,&#x0201D;</article-title> in <source>Proceedings of the Eighth Workshop on Data Management for End-to-End Machine Learning</source> (<publisher-loc>Santiago</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3650203.3663326">https://dl.acm.org/doi/abs/10.1145/3650203.3663326</ext-link></citation>
</ref>
<ref id="B5">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Anirudh</surname> <given-names>R.</given-names></name> <name><surname>Thiagarajan</surname> <given-names>J. J.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Bootstrapping graph convolutional neural networks for autism spectrum disorder classification,&#x0201D;</article-title> in <source>ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source> (<publisher-loc>Brighton</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3197</fpage>&#x02013;<lpage>3201</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Berahmand</surname> <given-names>K.</given-names></name> <name><surname>Daneshfar</surname> <given-names>F.</given-names></name> <name><surname>Salehi</surname> <given-names>E. S.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Autoencoders and their applications in machine learning: a survey</article-title>. <source>Artif. Intell. Rev</source>. <volume>57</volume>:<fpage>28</fpage>. <pub-id pub-id-type="doi">10.1007/s10462-023-10662-6</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bergstra</surname> <given-names>J.</given-names></name> <name><surname>Bardenet</surname> <given-names>R.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>K&#x000E9;gl</surname> <given-names>B.</given-names></name></person-group> (<year>2011</year>). <article-title>Algorithms for hyper-parameter optimization</article-title>. <source>Adv. Neural Inform. Process. Syst</source>. <volume>2011</volume>:<fpage>24</fpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Brack</surname> <given-names>A.</given-names></name> <name><surname>Hoppe</surname> <given-names>A.</given-names></name> <name><surname>Ewerth</surname> <given-names>R.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Citation recommendation for research papers via knowledge graphs,&#x0201D;</article-title> in <source>Linking Theory and Practice of Digital Libraries: 25th International Conference on Theory and Practice of Digital Libraries, TPDL 2021, Virtual Event, September 13&#x02013;17, 2021, Proceedings 25</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>165</fpage>&#x02013;<lpage>174</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Guestrin</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;XGBoost: a scalable tree boosting system,&#x0201D;</article-title> in <source>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source> (<publisher-loc>San Francisco, CA</publisher-loc>), <fpage>785</fpage>&#x02013;<lpage>794</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/2939672.2939785">https://dl.acm.org/doi/abs/10.1145/2939672.2939785</ext-link></citation>
</ref>
<ref id="B10">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Kornblith</surname> <given-names>S.</given-names></name> <name><surname>Norouzi</surname> <given-names>M.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;A simple framework for contrastive learning of visual representations,&#x0201D;</article-title> in <source>International Conference on Machine Learning</source>, 1597&#x02013;1607. Available at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v119/chen20j.html">https://proceedings.mlr.press/v119/chen20j.html</ext-link></citation>
</ref>
<ref id="B11">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Dess&#x000EC;</surname> <given-names>D.</given-names></name> <name><surname>Osborne</surname> <given-names>F.</given-names></name> <name><surname>Reforgiato Recupero</surname> <given-names>D.</given-names></name> <name><surname>Buscaldi</surname> <given-names>D.</given-names></name> <name><surname>Motta</surname> <given-names>E.</given-names></name> <name><surname>Sack</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;AI-KG: an automatically generated knowledge graph of artificial intelligence,&#x0201D;</article-title> in <source>The Semantic Web&#x02013;ISWC 2020: 19th International Semantic Web Conference, Athens, Greece, November 2&#x02013;6, 2020, Proceedings, Part II 19</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>127</fpage>&#x02013;<lpage>143</lpage>.</citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Elsken</surname> <given-names>T.</given-names></name> <name><surname>Metzen</surname> <given-names>J. H.</given-names></name> <name><surname>Hutter</surname> <given-names>F.</given-names></name></person-group> (<year>2019</year>). <article-title>Neural architecture search: a survey</article-title>. <source>J. Machine Learn. Res</source>. <volume>20</volume>, <fpage>1997</fpage>&#x02013;<lpage>2017</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-05318-5_11</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>F&#x000E4;rber</surname> <given-names>M.</given-names></name> <name><surname>Lamprecht</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>The data set knowledge graph: creating a linked open data source for data sets</article-title>. <source>Quant. Sci. Stud</source>. <volume>2</volume>, <fpage>1324</fpage>&#x02013;<lpage>1355</lpage>. <pub-id pub-id-type="doi">10.1162/qss_a_00161</pub-id><pub-id pub-id-type="pmid">29378288</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>F&#x000E4;rber</surname> <given-names>M.</given-names></name> <name><surname>Leisinger</surname> <given-names>A.-K.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Datahunter: a system for finding datasets based on scientific problem descriptions,&#x0201D;</article-title> in <source>Proceedings of the 15th ACM Conference on Recommender Systems</source> (<publisher-loc>Amsterdam</publisher-loc>), <fpage>749</fpage>&#x02013;<lpage>752</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3460231.3478882?casa_token=gV35uoxEs_QAAAAA:vv7apDXE9B-qbS4ifhlfSPh6kDiERRFK70CDje2M_c6PW_LLWw_1BzCu9epm7gaotMNmcV3ac5jtiw">https://dl.acm.org/doi/abs/10.1145/3460231.3478882?casa_token=gV35uoxEs_QAAAAA:vv7apDXE9B-qbS4ifhlfSPh6kDiERRFK70CDje2M_c6PW_LLWw_1BzCu9epm7gaotMNmcV3ac5jtiw</ext-link></citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Generative AI for synthetic data generation: methods, challenges and the future</article-title>. <source>arXiv preprint arXiv:2403.04190</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2403.04190</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>X.</given-names></name> <name><surname>van Stein</surname> <given-names>B.</given-names></name> <name><surname>B&#x000E4;ck</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;A new approach towards the combined algorithm selection and hyper-parameter optimization problem,&#x0201D;</article-title> in <source>2019 IEEE Symposium Series on Computational Intelligence (SSCI)</source> (<publisher-loc>Xiamen</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2042</fpage>&#x02013;<lpage>2049</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Hariharan</surname> <given-names>A.</given-names></name> <name><surname>Zhang</surname> <given-names>T.</given-names></name> <name><surname>Motz</surname> <given-names>M.</given-names></name> <name><surname>Weinhardt</surname> <given-names>C.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Accessible data lineage: a scoping review on open-source data lineage platforms,&#x0201D;</article-title> in <source>ICIS 2024 Proceedings</source>, 5. Available at: <ext-link ext-link-type="uri" xlink:href="https://aisel.aisnet.org/icis2024/data_soc/data_soc/5">https://aisel.aisnet.org/icis2024/data_soc/data_soc/5</ext-link></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>K.</given-names></name> <name><surname>Chu</surname> <given-names>X.</given-names></name></person-group> (<year>2021</year>). <article-title>AutoML: a survey of the state-of-the-art</article-title>. <source>Knowl. Bas. Syst</source>. <volume>212</volume>:<fpage>106622</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2020.106622</pub-id><pub-id pub-id-type="pmid">22357568</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Helali</surname> <given-names>M.</given-names></name> <name><surname>Mansour</surname> <given-names>E.</given-names></name> <name><surname>Abdelaziz</surname> <given-names>I.</given-names></name> <name><surname>Dolby</surname> <given-names>J.</given-names></name> <name><surname>Srinivas</surname> <given-names>K.</given-names></name></person-group> (<year>2022</year>). <article-title>A scalable AutoML approach based on graph neural networks</article-title>. <source>Proc. VLDB Endow</source>. <volume>15</volume>, <fpage>2428</fpage>&#x02013;<lpage>2436</lpage>. <pub-id pub-id-type="doi">10.14778/3551793.3551804</pub-id><pub-id pub-id-type="pmid">27534393</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><collab>Hugging Face</collab></person-group> (<year>2016</year>). <source>Hugging Face</source>. Hugging Face.</citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Humm</surname> <given-names>B. G.</given-names></name> <name><surname>Zender</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;An ontology-based concept for meta AutoML,&#x0201D;</article-title> in <source>Artificial Intelligence Applications and Innovations: 17th IFIP WG 12.5 International Conference, AIAI 2021, Hersonissos, Crete, Greece, June 25&#x02013;27, 2021, Proceedings 17</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>117</fpage>&#x02013;<lpage>128</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Jadon</surname> <given-names>A.</given-names></name> <name><surname>Kumar</surname> <given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Leveraging generative AI models for synthetic data generation in healthcare: balancing research and privacy,&#x0201D;</article-title> in <source>2023 International Conference on Smart Applications, Communications and Networking (SmartNets)</source> (<publisher-loc>Istanbul</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>4</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://ieeexplore.ieee.org/abstract/document/10215825">https://ieeexplore.ieee.org/abstract/document/10215825</ext-link></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><collab>Kaggle</collab></person-group> (<year>2010</year>). <source>Kaggle, Datasets and Competitions</source>. Kaggle.</citation>
</ref>
<ref id="B24">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kannan</surname> <given-names>A. V.</given-names></name> <name><surname>Fradkin</surname> <given-names>D.</given-names></name> <name><surname>Akrotirianakis</surname> <given-names>I.</given-names></name> <name><surname>Kulahcioglu</surname> <given-names>T.</given-names></name> <name><surname>Canedo</surname> <given-names>A.</given-names></name> <name><surname>Roy</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Multimodal knowledge graph for deep learning papers and code,&#x0201D;</article-title> in <source>Proceedings of the 29th ACM International Conference on Information and Knowledge Management</source>, 3417&#x02013;3420. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3340531.3417439">https://dl.acm.org/doi/abs/10.1145/3340531.3417439</ext-link> <pub-id pub-id-type="pmid">34110991</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Karmaker</surname> <given-names>S. K.</given-names></name> <name><surname>Hassan</surname> <given-names>M. M.</given-names></name> <name><surname>Smith</surname> <given-names>M. J.</given-names></name> <name><surname>Xu</surname> <given-names>L.</given-names></name> <name><surname>Zhai</surname> <given-names>C.</given-names></name> <name><surname>Veeramachaneni</surname> <given-names>K.</given-names></name></person-group> (<year>2021</year>). <article-title>AutoML to date and beyond: challenges and opportunities</article-title>. <source>ACM Comput. Surv</source>. <volume>54</volume>, <fpage>1</fpage>&#x02013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1145/3470918</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <article-title>Adam: a method for stochastic optimization</article-title>. <source>arXiv preprint arXiv:1412.6980</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1412.6980</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Koomthanam</surname> <given-names>A. J.</given-names></name> <name><surname>Tripathy</surname> <given-names>A.</given-names></name> <name><surname>Serebryakov</surname> <given-names>S.</given-names></name> <name><surname>Nayak</surname> <given-names>G.</given-names></name> <name><surname>Foltin</surname> <given-names>M.</given-names></name> <name><surname>Bhattacharya</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>Common metadata framework: integrated framework for trustworthy artificial intelligence pipelines</article-title>. <source>IEEE Intern. Comput</source>. <volume>28</volume>, <fpage>37</fpage>&#x02013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1109/MIC.2024.3377170</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kumar</surname> <given-names>T.</given-names></name> <name><surname>Shah</surname> <given-names>A.</given-names></name> <name><surname>Mishra</surname> <given-names>A.</given-names></name> <name><surname>Bhattacharya</surname> <given-names>S.</given-names></name> <name><surname>Mahendran</surname> <given-names>A.</given-names></name> <name><surname>Dunning</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;From roots to fruits: exploring lineage for dataset recommendations,&#x0201D;</article-title> in <source>Proceedings of the Second ACM Data Economy Workshop</source> (<publisher-loc>Seattle, WA</publisher-loc>), <fpage>41</fpage>&#x02013;<lpage>47</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3600046.3600053?casa_token=9KFjuOA11hkAAAAA:LC3fQhjgI_-szqmMM6T9wivHN6s0ByEWJ9hKWWbZorp6CGqELY_aoz_ruBOiXErJqrvcRrRQ4HCKLg">https://dl.acm.org/doi/abs/10.1145/3600046.3600053?casa_token=9KFjuOA11hkAAAAA:LC3fQhjgI_-szqmMM6T9wivHN6s0ByEWJ9hKWWbZorp6CGqELY_aoz_ruBOiXErJqrvcRrRQ4HCKLg</ext-link></citation>
</ref>
<ref id="B29">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Mathew</surname> <given-names>A.</given-names></name> <name><surname>Amudha</surname> <given-names>P.</given-names></name> <name><surname>Sivakumari</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Deep learning techniques: an overview,&#x0201D;</article-title> in <source>Advanced Machine Learning Technologies and Applications: Proceedings of AMLTA 2020</source>, 599&#x02013;608. Available at: <ext-link ext-link-type="uri" xlink:href="https://link.springer.com/chapter/10.1007/978-981-15-3383-9_54">https://link.springer.com/chapter/10.1007/978-981-15-3383-9_54</ext-link></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Menghani</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Efficient deep learning: a survey on making deep learning models smaller, faster, and better</article-title>. <source>ACM Comput. Surv</source>. <volume>55</volume>, <fpage>1</fpage>&#x02013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1145/3578938</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mohammadi</surname> <given-names>M.</given-names></name> <name><surname>Tajik</surname> <given-names>E.</given-names></name> <name><surname>Martinez-Maldonado</surname> <given-names>R.</given-names></name> <name><surname>Sadiq</surname> <given-names>S.</given-names></name> <name><surname>Tomaszewski</surname> <given-names>W.</given-names></name> <name><surname>Khosravi</surname> <given-names>H.</given-names></name></person-group> (<year>2024</year>). <source>Artificial Intelligence in Multimodal Learning Analytics: A Systematic Literature Review</source>.</citation>
</ref>
<ref id="B32">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>M&#x000FC;llner</surname> <given-names>P.</given-names></name> <name><surname>Schmerda</surname> <given-names>S.</given-names></name> <name><surname>Theiler</surname> <given-names>D.</given-names></name> <name><surname>Lindstaedt</surname> <given-names>S.</given-names></name> <name><surname>Kowald</surname> <given-names>D.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Towards employing recommender systems for supporting data and algorithm sharing,&#x0201D;</article-title> in <source>Proceedings of the 1st International Workshop on Data Economy</source> (<publisher-loc>Rome</publisher-loc>), <fpage>8</fpage>&#x02013;<lpage>14</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3565011.3569055">https://dl.acm.org/doi/abs/10.1145/3565011.3569055</ext-link></citation>
</ref>
<ref id="B33">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Musto</surname> <given-names>C.</given-names></name> <name><surname>Narducci</surname> <given-names>F.</given-names></name> <name><surname>Lops</surname> <given-names>P.</given-names></name> <name><surname>De Gemmis</surname> <given-names>M.</given-names></name> <name><surname>Semeraro</surname> <given-names>G.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Explod: a framework for explaining recommendations based on the linked open data cloud,&#x0201D;</article-title> in <source>Proceedings of the 10th ACM Conference on Recommender Systems</source> (<publisher-loc>Boston, MA</publisher-loc>), <fpage>151</fpage>&#x02013;<lpage>154</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/2959100.2959173?casa_token=Xlgy_FvbnL8AAAAA:m_Eg7vQEsT2CXlGugie6lrjvGSzFqY3oq4CxlAsnsx1-Pr4Q-MMvt1hG0YzDLacw50IK9Pz1I1Pljg">https://dl.acm.org/doi/abs/10.1145/2959100.2959173?casa_token=Xlgy_FvbnL8AAAAA:m_Eg7vQEsT2CXlGugie6lrjvGSzFqY3oq4CxlAsnsx1-Pr4Q-MMvt1hG0YzDLacw50IK9Pz1I1Pljg</ext-link></citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><collab>OpenAI</collab></person-group> (<year>2022</year>). <source>ChatGPT-3.5: Language Model By OpenAI</source>. OpenAI.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pang</surname> <given-names>G.</given-names></name> <name><surname>Shen</surname> <given-names>C.</given-names></name> <name><surname>Cao</surname> <given-names>L.</given-names></name> <name><surname>Hengel</surname> <given-names>A. V. D.</given-names></name></person-group> (<year>2021</year>). <article-title>Deep learning for anomaly detection: a review</article-title>. <source>ACM Comput. Surv</source>. <volume>54</volume>, <fpage>1</fpage>&#x02013;<lpage>38</lpage>. <pub-id pub-id-type="doi">10.1145/3439950</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><collab>Papers-with-code</collab></person-group> (<year>2018</year>). <source>Papers with Code</source>. Papers-with-code.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pedretti</surname> <given-names>G.</given-names></name> <name><surname>Moon</surname> <given-names>J.</given-names></name> <name><surname>Bruel</surname> <given-names>P.</given-names></name> <name><surname>Serebryakov</surname> <given-names>S.</given-names></name> <name><surname>Roth</surname> <given-names>R. M.</given-names></name> <name><surname>Buonanno</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>X-time: an in-memory engine for accelerating machine learning on tabular data with CAMs</article-title>. <source>arXiv preprint arXiv:2304.01285</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2304.01285</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Publio</surname> <given-names>G. C.</given-names></name> <name><surname>Esteves</surname> <given-names>D.</given-names></name> <name><surname>&#x00141;awrynowicz</surname> <given-names>A.</given-names></name> <name><surname>Panov</surname> <given-names>P.</given-names></name> <name><surname>Soldatova</surname> <given-names>L.</given-names></name> <name><surname>Soru</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>ML-schema: exposing the semantics of machine learning with schemas and ontologies</article-title>. <source>arXiv preprint arXiv:1807.05351</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1807.05351</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Salvador</surname> <given-names>A.</given-names></name> <name><surname>Hynes</surname> <given-names>N.</given-names></name> <name><surname>Aytar</surname> <given-names>Y.</given-names></name> <name><surname>Marin</surname> <given-names>J.</given-names></name> <name><surname>Ofli</surname> <given-names>F.</given-names></name> <name><surname>Weber</surname> <given-names>I.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Learning cross-modal embeddings for cooking recipes and food images,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Honolulu, HI</publisher-loc>), <fpage>3020</fpage>&#x02013;<lpage>3028</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://openaccess.thecvf.com/content_cvpr_2017/html/Salvador_Learning_Cross-Modal_Embeddings_CVPR_2017_paper.html">https://openaccess.thecvf.com/content_cvpr_2017/html/Salvador_Learning_Cross-Modal_Embeddings_CVPR_2017_paper.html</ext-link> <pub-id pub-id-type="pmid">31295105</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="web"><person-group person-group-type="author"><collab>SBERT Documentation</collab></person-group> (<year>2023</year>). <source>SBERT Pretrained Models Documentation</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://www.sbert.net/docs/pretrained_models.html">https://www.sbert.net/docs/pretrained_models.html</ext-link> (accessed September 15, 2023).</citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>Y.</given-names></name> <name><surname>Song</surname> <given-names>K.</given-names></name> <name><surname>Tan</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Lu</surname> <given-names>W.</given-names></name> <name><surname>Zhuang</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>HuggingGPT: solving AI tasks with ChatGPT and its friends in Hugging Face</article-title>. <source>arXiv preprint arXiv:2303.17580</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2303.17580</pub-id></citation>
</ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shrestha</surname> <given-names>A.</given-names></name> <name><surname>Mahmood</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>Review of deep learning algorithms and architectures</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>53040</fpage>&#x02013;<lpage>53065</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2912200</pub-id></citation>
</ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Soedarmadji</surname> <given-names>E.</given-names></name> <name><surname>Stein</surname> <given-names>H. S.</given-names></name> <name><surname>Suram</surname> <given-names>S. K.</given-names></name> <name><surname>Guevarra</surname> <given-names>D.</given-names></name> <name><surname>Gregoire</surname> <given-names>J. M.</given-names></name></person-group> (<year>2019</year>). <article-title>Tracking materials science data lineage to manage millions of materials experiments and analyses</article-title>. <source>NPJ Comput. Mater</source>. <volume>5</volume>:<fpage>79</fpage>. <pub-id pub-id-type="doi">10.1038/s41524-019-0216-x</pub-id></citation>
</ref>
<ref id="B44">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Thornton</surname> <given-names>C.</given-names></name> <name><surname>Hutter</surname> <given-names>F.</given-names></name> <name><surname>Hoos</surname> <given-names>H. H.</given-names></name> <name><surname>Leyton-Brown</surname> <given-names>K.</given-names></name></person-group> (<year>2013</year>). <article-title>&#x0201C;Auto-WEKA: combined selection and hyperparameter optimization of classification algorithms,&#x0201D;</article-title> in <source>Proceedings of the 19th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source> (<publisher-loc>Chicago, IL</publisher-loc>), <fpage>847</fpage>&#x02013;<lpage>855</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/2487575.2487629?casa_token=S3NSpj33lH8AAAAA:XOutkr8nKKkoiSIEQruB86E-ZRT2AlfaxWKcofLaDu_x9n9gTxEwRH1ZkXMfyLAroZBwwHfHJwFIoA">https://dl.acm.org/doi/abs/10.1145/2487575.2487629?casa_token=S3NSpj33lH8AAAAA:XOutkr8nKKkoiSIEQruB86E-ZRT2AlfaxWKcofLaDu_x9n9gTxEwRH1ZkXMfyLAroZBwwHfHJwFIoA</ext-link></citation>
</ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vanschoren</surname> <given-names>J.</given-names></name> <name><surname>Van Rijn</surname> <given-names>J. N.</given-names></name> <name><surname>Bischl</surname> <given-names>B.</given-names></name> <name><surname>Torgo</surname> <given-names>L.</given-names></name></person-group> (<year>2014</year>). <article-title>OpenML: networked science in machine learning</article-title>. <source>ACM SIGKDD Expl. Newsl</source>. <volume>15</volume>, <fpage>49</fpage>&#x02013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1145/2641190.2641198</pub-id></citation>
</ref>
<ref id="B46">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Venkataramanan</surname> <given-names>R.</given-names></name></person-group> (<year>2023</year>). <source>Construction of AIMKG</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/revathyramanan/ai-pipeline-recommender">https://github.com/revathyramanan/ai-pipeline-recommender</ext-link> (accessed December 9, 2024).</citation>
</ref>
<ref id="B47">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Venkataramanan</surname> <given-names>R.</given-names></name></person-group> (<year>2024</year>). <source>Common Metadata Framework&#x02014;Common Metadata Ontology</source>. Available at: <ext-link ext-link-type="uri" xlink:href="https://hewlettpackard.github.io/cmf/common-metadata-ontology/readme/">https://hewlettpackard.github.io/cmf/common-metadata-ontology/readme/</ext-link> (accessed November 13, 2024).</citation>
</ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Viswanathan</surname> <given-names>V.</given-names></name> <name><surname>Gao</surname> <given-names>L.</given-names></name> <name><surname>Wu</surname> <given-names>T.</given-names></name> <name><surname>Liu</surname> <given-names>P.</given-names></name> <name><surname>Neubig</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>DataFinder: scientific dataset recommendation from natural language descriptions</article-title>. <source>arXiv preprint arXiv:2305.16636</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2305.16636</pub-id></citation>
</ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zaharia</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>A.</given-names></name> <name><surname>Davidson</surname> <given-names>A.</given-names></name> <name><surname>Ghodsi</surname> <given-names>A.</given-names></name> <name><surname>Hong</surname> <given-names>S. A.</given-names></name> <name><surname>Konwinski</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Accelerating the machine learning lifecycle with MLflow</article-title>. <source>IEEE Data Eng. Bull</source>. <volume>41</volume>, <fpage>39</fpage>&#x02013;<lpage>45</lpage>.</citation>
</ref>
<ref id="B50">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Zuccon</surname> <given-names>G.</given-names></name> <name><surname>Koopman</surname> <given-names>B.</given-names></name> <name><surname>Shaik</surname> <given-names>R.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;ChatGPT hallucinates when attributing answers,&#x0201D;</article-title> in <source>Proceedings of the Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region</source> (<publisher-loc>Beijing</publisher-loc>), <fpage>46</fpage>&#x02013;<lpage>51</lpage>. Available at: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.1145/3624918.3625329">https://dl.acm.org/doi/abs/10.1145/3624918.3625329</ext-link> <pub-id pub-id-type="pmid">38261378</pub-id></citation></ref>
</ref-list>
</back>
</article>