<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2025.1469396</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Marine Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Assisting human annotation of marine images with foundation models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Orenstein</surname>
<given-names>Eric C.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1430291/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Woodward</surname>
<given-names>Benjamin</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2699875/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lundsten</surname>
<given-names>Lonny</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/722722/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Barnard</surname>
<given-names>Kevin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2608382/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Schlining</surname>
<given-names>Brian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2913002/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Katija</surname>
<given-names>Kakani</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2172838/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Research and Development, Information and Technology Dissemination, Monterey Bay Aquarium Research Institute</institution>, <addr-line>Moss Landing, CA</addr-line>,&#xa0;<country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Research and Development, National Oceanography Centre</institution>, <addr-line>Southampton</addr-line>,&#xa0;<country>United Kingdom</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Research and Development, CVision AI</institution>, <addr-line>Medford, MA</addr-line>,&#xa0;<country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Matthias Obst, University of Gothenburg, Sweden</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Wang Minxiao, Chinese Academy of Sciences (CAS), China</p>
<p>Lukasz Janowski, Gdynia Maritime University, Poland</p>
<p>Carla Cherubini, Politecnico di Bari, Italy</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Eric C. Orenstein, <email xlink:href="mailto:Eric.Orenstein@noc.ac.uk">Eric.Orenstein@noc.ac.uk</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>24</day>
<month>07</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1469396</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>07</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>06</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Orenstein, Woodward, Lundsten, Barnard, Schlining and Katija.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Orenstein, Woodward, Lundsten, Barnard, Schlining and Katija</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p> This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Marine scientists have been leveraging supervised machine learning algorithms to analyze image and video data for nearly two decades. There have been many advances, but the cost of generating expert human annotations to train new models remains extremely high. There is broad recognition both in computer and domain sciences that generating training data remains the major bottleneck when developing ML models for targeted tasks. Increasingly, computer scientists are not attempting to produce highly-optimized models from general annotation frameworks, instead focusing on adaptation strategies to tackle new data challenges. Taking inspiration from large language models, computer vision researchers are now thinking in terms of &#x201c;foundation models&#x201d; that can yield reasonable zero- and few-shot detection and segmentation performance with human prompting. Here we consider the utility of this approach for ocean imagery, leveraging Meta&#x2019;s Segment Anything Model to enrich ocean image annotations based on existing labels. This workflow yields promising results, especially for modernizing existing data repositories. Moreover, it suggests that future human annotation efforts could use foundation models to speed progress toward a sufficient training set to address domain specific problems.</p>
</abstract>
<kwd-group>
<kwd>foundation model</kwd>
<kwd>marine imagery</kwd>
<kwd>segmentation</kwd>
<kwd>object detection</kwd>
<kwd>human-in-the-loop</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="1"/>
<equation-count count="2"/>
<ref-count count="41"/>
<page-count count="10"/>
<word-count count="5672"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Ocean Observation</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Ocean scientists have been capturing images and video to observe marine organisms for decades (<xref ref-type="bibr" rid="B16">Jaffe, 2014</xref>; <xref ref-type="bibr" rid="B28">Robison et&#xa0;al., 2017</xref>). The instruments that collect this visual data have become progressively more efficient with improved battery technology and the advent of digital sensors and storage. Researchers now regularly collect terabytes of images, perhaps representing 100s of thousands of observations, over a single field campaign (<xref ref-type="bibr" rid="B6">Bell et&#xa0;al., 2022</xref>). The sheer amount of raw data precludes fully manual annotation and has inspired marine scientists to invest time and effort into automating the process.</p>
<p>Marine scientists have leveraged recent advances in supervised machine learning (ML) models, training and deploying a neural network or vision transformer architecture. These tools learn a feature space directly from a set of annotated image or video data, obviating the need for hand-engineered features tailored to a particular data set. Scientists iteratively tune a model until it achieves acceptable performance on an independent validation set and then deploy it to process new data collected in the field. Crucially, this entire process relies on expertly annotated data to ensure that the model learns a robust mapping between the input images and the desired output concepts.</p>
<p>Creating a high-quality, taxonomically-correct set of labeled data for training ML models remains an extremely time-consuming task (<xref ref-type="bibr" rid="B36">Van Horn et&#xa0;al., 2015</xref>). Highly-trained annotators must spend 100s of hours examining images and footage to identify a sufficient number of animals to appropriately tune modern models (<xref ref-type="bibr" rid="B15">Hughes et&#xa0;al., 2018</xref>). The degree of difficulty, and hence number of necessary human hours, for these annotation tasks increases significantly when experts must also localize objects with bounding boxes, polygons, or segmentation masks (<xref ref-type="bibr" rid="B19">Katija et&#xa0;al., 2022</xref>). Unfortunately, this is typically an open-ended process: most supervised models fail when applied to data collected in new regions with different equipment, thus requiring continuing manual annotation efforts as scientists seek to work in new regions or with different tools (<xref ref-type="bibr" rid="B4">Beery et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B24">Orenstein et&#xa0;al., 2020</xref>).</p>
<p>These challenges are not unique to oceanographic or marine biological applications of ML. Computer scientists are increasingly looking to develop methods to adapt models to new data distributions, often with the explicit inclusion of humans in the workflow. This type of approach is common in Natural Language Processing where researchers have embraced the development of &#x201c;foundation models,&#x201d; any large model trained on a large corpus of annotated data that generalizes well to new tasks (<xref ref-type="bibr" rid="B7">Bommasani et&#xa0;al., 2021</xref>). The strong performance in few- and zero-shot scenarios&#x2014;situations where limited or no training data is available&#x2014;is often achieved via prompt engineering, where humans interact with the model to generate a valid response to the task at hand. Computer vision researchers are now developing foundation models for image annotation tasks like object detection and segmentation. The goal is to produce trained models that are general enough to yield useful output based on a simpler, human-generated prompt (e.g. a point, bounding box, or text description) that specifies what to localize in an image. The resulting localized data can then be used to fine-tune a model for a downstream, domain-specific task.</p>
<p>The utility of foundation models for ocean imaging is manifest; they could be leveraged to speed a human annotator&#x2019;s first pass through a dataset or to enrich existing annotations. In this paper, we present and analyze the output of Meta AI&#x2019;s Segment Anything Model (SAM; <xref ref-type="bibr" rid="B21">Kirillov et&#xa0;al., 2023</xref>) on four ocean-specific dataset enrichment tasks: (1) selecting regions from point annotations on images collected from a tow sled running transects around Antarctica (<xref ref-type="bibr" rid="B18">Jansen et&#xa0;al., 2023</xref>); (2) converting point annotations to bounding boxes on images from the Station M abyssal monitoring station (<xref ref-type="bibr" rid="B34">Smith and Druffel, 1998</xref>); (3) returning segmentation masks from a set of bounding boxes of fish in images collected at a cabled observatory 4&#xa0;km off the coast of Spain (<xref ref-type="bibr" rid="B11">Francescangeli et&#xa0;al., 2023</xref>); and (4) creating segmentation masks from bounding boxes around the inner filters of larvaceans in images drawn from FathomNet (<xref ref-type="bibr" rid="B19">Katija et&#xa0;al., 2022</xref>). These datasets represent diverse habitats, sampling methodologies, and target organisms. In all cases, SAM is not attempting to output a taxonomic label. Instead, the model attempts to create a localization&#x2014;drawing either a box or outline&#x2014;around an object based on a point selected by a human. While we are programmatically feeding SAM pre-existing annotations, the prompts could equivalently be interactively supplied by a human annotator.</p>
<p>We underscore that these results are an early exploration of an extremely powerful new tool. Our work should be viewed as an engineering test, an attempt to understand how to apply foundation models to assist human annotators in a principled and clear-eyed manner. These results should be taken as illustrative rather than conclusive; there are many remaining avenues to explore and challenges to address. With that caveat, we believe our results are compelling enough to suggest that foundation models should quickly become a standard part of expert human annotation workflows for marine visual data.</p>
<sec id="s1_1">
<label>1.1</label>
<title>Related work</title>
<p>Foundation models can be thought of as a human-AI system that attempts to alleviate some of the annotation burden for human experts. The body of work is akin to human-in-the-loop systems used to speed manual classification of imagery by leveraging model output to presort data or ask annotators guiding questions. In the question setting, the computer selects maximally informative questions to ask users based on the image itself and the annotators&#x2019; previous responses (<xref ref-type="bibr" rid="B8">Branson et&#xa0;al., 2010</xref>). This technique has been used effectively for fine-grained classification, especially of birds (<xref ref-type="bibr" rid="B37">Wah et&#xa0;al., 2011</xref>). More recently, representation models have been trained to presort image data for ecological studies. The MegaDetector is a general purpose terrestrial object detector that finds animals in camera trap data but does not ascribe a label, effectively removing empty frames from a raw dataset (<xref ref-type="bibr" rid="B23">Norouzzadeh et&#xa0;al., 2021</xref>). The MAIA method was developed specifically for marine imagery and uses a series of unsupervised and semi-supervised steps to bootstrap annotations in a new dataset (<xref ref-type="bibr" rid="B40">Zurowietz et&#xa0;al., 2018</xref>). The goal of these systems is to pre-filter data automatically for human verification and, eventually, training of a dataset-specific model (<xref ref-type="bibr" rid="B29">Russakovsky et&#xa0;al., 2015</xref>).</p>
<p>Foundation models can be used in an identical manner, generating region proposals automatically for expert review, but have the additional capability of operating directly with a human-in-the-loop. Instead of asking users questions or prefiltering data, foundation models ingest direct human input to parse a generic feature representation of a given image. While technologically different, the approaches are conceptually similar: get a human to help the machine interpret an image with minimal effort. SAM in particular has shown promise for domain science applications like drone-based remote sensing for detecting aquaculture infrastructure (<xref ref-type="bibr" rid="B26">Ren et&#xa0;al., 2023</xref>). <xref ref-type="bibr" rid="B17">Janowski and Wr&#xf3;blewski (2024)</xref> applied SAM to analyze a diversity of seabed data collected by several sampling systems. The pipeline they articulate uses SAM without human prompting and is targeting bathymetric features along the seabed. We believe our work is the first to measure SAM&#x2019;s performance on marine biological images based on human prompts.</p>
<p>The methods described in this paper belong to the broad field of computer vision. We rely on technical terms throughout the paper and have attempted to describe them as concisely as possible in the main paper. We have provided a glossary in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S1</bold>
</xref> as a quick reference for terms that show up throughout the text. We also point readers to excellent primers on computer vision for animal ecology by <xref ref-type="bibr" rid="B38">Weinstein (2018)</xref> and a more specific treatment for marine biology in <xref ref-type="bibr" rid="B5">Belcher et&#xa0;al. (2023)</xref>.</p>
</sec>
</sec>
<sec id="s2">
<label>2</label>
<title>Methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Segment Anything Model</title>
<p>The Segment Anything Model (SAM) is an image segmentation foundation model, a system that is trained on a broad corpus of annotated images so it can easily generalize to new scenarios (<xref ref-type="bibr" rid="B21">Kirillov et&#xa0;al., 2023</xref>). Importantly, SAM was designed to be applied to a range of downstream tasks using prompt engineering, explicitly meant to function in zero- and few-shot environments with human input. SAM was trained on SA-1B, an enormous dataset of 1 billion segmentation masks drawn from 11 million images. The scale of SA-1B dwarfs previous libraries of segmentation masks, with approximately 400x more masks than previously released datasets (<xref ref-type="bibr" rid="B21">Kirillov et&#xa0;al., 2023</xref>). The dataset is composed of high resolution images of everyday objects, collected around the world, taken with a variety of cameras, and licensed from a third party photo provider. There are some images of marine organisms in this data, but none collected underwater nor drawn from scientific datasets.</p>
<p>The SAM framework consists of a Vision Transformer model used as an image encoder, a prompt encoder that maps inputs to an embedding space, and a decoder that combines image and prompt embeddings to generate output masks. The developers tested SAM&#x2019;s zero-shot performance with point prompts on 23 datasets covering a range of domains, including two underwater image datasets: Northumberland Dolphin Dataset (NDD20), a set of DSLR and GoPro images, and TrashCan, a subset of the JAMSTEC Deep-sea Debris Database of ROV video data (<xref ref-type="bibr" rid="B35">Trotter et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B13">Hong et&#xa0;al., 2020</xref>). NDD20 segmentation masks are annotated both at the coarse level of &#x2018;dolphin&#x2019; and identified as individuals. TrashCan images are labeled with 7 morphotaxonomic classes and 8 types of human detritus found on the sea floor. SAM produced reasonable masks on both of these datasets as measured by both Intersection over Union and a qualitative survey distributed to annotators (<xref ref-type="bibr" rid="B21">Kirillov et&#xa0;al., 2023</xref>).</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Deployment</title>
<p>We deployed SAM with the ViT-L backbone trained on the SA-1B dataset<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>, the first release of the model (<xref ref-type="bibr" rid="B21">Kirillov et&#xa0;al., 2023</xref>). We used the model as-is, with no hyperparameter tweaking or fine tuning. We wrote a wrapper function to feed SAM prompts in a standard format and retrieve region proposals to simulate a human annotator interacting with SAM. This workflow was devised to make use of existing human annotations and assess what SAM&#x2019;s output might be should those annotations have been done with a foundation model.</p>
<p>We deployed SAM on a server-based NVIDIA RTX A6000. All wrapper code written for this project to feed images and prompts into the model and analyze results is available on GitHub<xref ref-type="fn" rid="fn2">
<sup>2</sup>
</xref>. We note we are not able to make the endpoint publicly available to run new images through the model. Users will need access to their own GPU to run SAM.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Datasets</title>
<p>We drew data from four datasets containing a diversity of images from around the world, collected in different habitats with different approaches to target different organisms (<xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>). In all cases, we took a small subset from each repository, selected by a random number generator. Our team did not create any new human annotations for prompting; we adhere to the taxonomies and labeling schemes used by the original data annotators.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Summary of datasets considered.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Dataset</th>
<th valign="top" align="center">Region</th>
<th valign="top" align="center">Habitat</th>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">Classes</th>
<th valign="top" align="center">Prompts</th>
<th valign="top" align="center">Type</th>
<th valign="top" align="center">Evaluation metrics</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">AS-AID</td>
<td valign="top" align="center">Antarctic</td>
<td valign="top" align="center">benthic</td>
<td valign="top" align="center">towed array</td>
<td valign="top" align="center">26</td>
<td valign="top" align="center">165</td>
<td valign="top" align="center">points</td>
<td valign="top" align="center">dist.</td>
</tr>
<tr>
<td valign="top" align="left">Station M</td>
<td valign="top" align="center">CA Current</td>
<td valign="top" align="center">benthic</td>
<td valign="top" align="center">camera trap</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">355</td>
<td valign="top" align="center">points</td>
<td valign="top" align="center">dist., IoU</td>
</tr>
<tr>
<td valign="top" align="left">OBSEA</td>
<td valign="top" align="center">Medit.</td>
<td valign="top" align="center">benthic</td>
<td valign="top" align="center">camera trap</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">117</td>
<td valign="top" align="center">boxes</td>
<td valign="top" align="center">dist., IoU</td>
</tr>
<tr>
<td valign="top" align="left">FathomNet</td>
<td valign="top" align="center">CA Current</td>
<td valign="top" align="center">midwater</td>
<td valign="top" align="center">ROV</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">60</td>
<td valign="top" align="center">boxes</td>
<td valign="top" align="center">dist., IoU</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Twenty-five randomly selected images were drawn from each dataset and existing human annotations were used to prompt the Segment Anything Model. Since different types of annotations were available for each dataset, the output was measured with a suite of evaluation metrics. A qualitative survey of output quality was distributed to a domain expert for each set of images.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>The Antarctic Seafloor Annotated Imagery Database</title>
<p>The Antarctic Seafloor Annotated Imagery Database (AS-AID) is a set of images collected from a downward-facing camera on a tow sled in the waters around Antarctica between 1985 and 2019 (<xref ref-type="bibr" rid="B18">Jansen et&#xa0;al., 2023</xref>). The dataset represents 80 classes of animals and substrate collected in diverse high latitude habitats. Domain experts used a point-grid annotation scheme to estimate percent coverage of substrates and organisms. Grid sampling is an approach for making relatively unbiased population estimates in a spatial region within some uncertainty bound. For image-based sampling, the dimensions of the sample grid are determined by the image size and the desired sampling statistics (<xref ref-type="bibr" rid="B25">Perkins et&#xa0;al., 2016</xref>). The AS-AID annotators overlaid a 9 x 12 grid of points and identified the substrate or organism found at that single pixel for a total of 108 point annotations of animals and objects in each frame. The team used the CoralNet web-based GUI and its label suggestion function to do their manual labeling (<xref ref-type="bibr" rid="B9">Chen et&#xa0;al., 2021</xref>). All organism labels were based on the Collaborative and Annotation Tools for Analysis of Marine Imagery (CATAMI) hierarchical classification scheme (<xref ref-type="bibr" rid="B1">Althaus et&#xa0;al., 2015</xref>). In the current work, we ignore substrate labels and focus only on organism point annotations for prompting SAM.</p>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>Station M benthic camera trap</title>
<p>Station M is an abyssal monitoring station established in 1989 off the coast of Central California to study seafloor processes over time and better quantify energetic relationships between the surface and the benthos (<xref ref-type="bibr" rid="B34">Smith and Druffel, 1998</xref>; <xref ref-type="bibr" rid="B32">Sherman and Smith, 2009</xref>). Images are taken every hour, and a subsample has been point annotated in MBARI&#x2019;s Video Annotation and Reference System (VARS) for 15 classes by a team of experts in the regional benthic fauna (<xref ref-type="bibr" rid="B30">Schlining and Stout, 2006</xref>). The taxonomic names adhere to the World Register of Marine Species (WoRMS) knowledge database (<xref ref-type="bibr" rid="B14">Horton et&#xa0;al., 2021</xref>). We focused on the jellyfish <italic>Benthocodon</italic> spp., one of the most common organisms at Station M, for the purposes of the current work. The point annotations for <italic>Benthocodon</italic> spp. in each subsampled image were used to prompt SAM. For the purpose of evaluation for the current work, a human expert from the MBARI Video Lab made bounding box annotations around the previously identified points.</p>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>OBSEA Image Dataset</title>
<p>The Seafloor Observatory (OBSEA) Image Dataset is an annotated subsample of image data collected by a cabled video-platform deployed in a marine protected area 4 km off the coast of Barcelona, Spain (<xref ref-type="bibr" rid="B11">Francescangeli et&#xa0;al., 2023</xref>). Images were collected every 30 minutes over a two-year period from 2013 to 2015 with a camera observing an artificial reef structure at about 20 m depth. Two different cameras were used over the course of the deployment and images were JPEG compressed for storage. The OBSEA research team built a custom Python-based annotation tool to draw bounding boxes oriented along the major axis of target fish (<xref ref-type="bibr" rid="B22">Marini, 2022</xref>). The classification adhered to the FishBase hierarchy and included an &#x201c;unknown&#x201d; category for out of focus targets (<xref ref-type="bibr" rid="B12">Froese and Pauly, 2000</xref>). Since the OBSEA data was originally annotated with bounding boxes, we use those localizations rather than points to prompt SAM to generate segmentation masks. The boxes were rotated to align with the image axes for the purpose of these experiments.</p>
</sec>
<sec id="s2_3_4">
<label>2.3.4</label>
<title>FathomNet imagery</title>
<p>FathomNet is a global image database that hosts human-verified annotated marine images, collected in all marine habitats, from the surface to the benthos and the coast to the open ocean (<xref ref-type="bibr" rid="B19">Katija et&#xa0;al., 2022</xref>). We selected annotated images of <italic>Bathochordaeus mcnutti</italic>, a species of filter feeding larvacean found in the midwater, from the broader FathomNet repository (<xref ref-type="bibr" rid="B31">Sherlock et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B20">Katija et&#xa0;al., 2017</xref>). These images often had multiple bounding box localizations per individual highlighting the animal itself, the inner filter, and outer filter where present. The midwater habitat and the shapes of the filters are very unlike anything originally used to train SAM. The annotations were made with the VARS-Localize interface according to WoRMS accepted taxonomic designations (<xref ref-type="bibr" rid="B3">Barnard, 2020</xref>; <xref ref-type="bibr" rid="B14">Horton et&#xa0;al., 2021</xref>). For the purposes of our tests, we selected images collected from 2018 to 2024 and prompted SAM with just localizations of the inner filter.</p>
</sec>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Evaluation</title>
<p>Bounding boxes and masks suggested by SAM were evaluated with three metrics. Intersection over Union (IoU) and Euclidean distance measurements are fairly standard in image processing and computer vision. A qualitative survey, akin to the one distributed by the SAM authors, was given to our expert human annotators.</p>
<sec id="s2_4_1">
<label>2.4.1</label>
<title>Intersection over Union</title>
<p>Intersection over Union (IoU) is the ratio of the area shared by the proposal and the ground truth to the total area covered by the two regions combined:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>A</italic> is the region proposed by SAM and <italic>B</italic> is the ground truth localization. <italic>A</italic> &#x2229; <italic>B</italic> is the number of pixels shared between the regions and <italic>A</italic> &#x222a; <italic>B</italic> is the total number of pixels covered by either region. <italic>IoU</italic> is bounded between 0 and 1, with <italic>IoU</italic> = 0 indicating the regions are entirely disjoint and <italic>IoU</italic> = 1 indicating they are perfectly aligned (<xref ref-type="bibr" rid="B27">Rezatofighi et&#xa0;al., 2019</xref>). <italic>IoU</italic> was computed between each proposal and ground truth annotation. These values were subsequently averaged to yield a score for a collection of images.</p>
<p>
<italic>IoU</italic> was used to evaluate the output on the Station M, OBSEA, and FathomNet datasets (<xref ref-type="disp-formula" rid="eq1">Equation 1</xref>). The images from AS-AID only have point annotations and thus <italic>IoU</italic> cannot be computed for those proposals.</p>
</sec>
<sec id="s2_4_2">
<label>2.4.2</label>
<title>Distance</title>
<p>The Euclidean distance was computed between the ground truth point annotation or center of the human localized bounding box and the center of the bounding box output by SAM:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>=</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where the point (<italic>x</italic>
<sub>1</sub>
<italic>,y</italic>
<sub>1</sub>) represents the ground truth point annotation or center of the bounding box. (<italic>x</italic>
<sub>2</sub>
<italic>,y</italic>
<sub>2</sub>) is the center of the bounding box proposed by SAM. The SAM output for each dataset was evaluated with <italic>dist</italic> (<xref ref-type="disp-formula" rid="eq2">Equation 2</xref>).</p>
</sec>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>AS-AID</title>
<p>After removing substrate labels, SAM was prompted with 414 annotated points from 25 randomly selected images in the AS-AID dataset and returned bounding boxes and segmentation masks (<xref ref-type="fig" rid="f1">
<bold>Figures&#xa0;1a</bold>
</xref>, <xref ref-type="fig" rid="f2">
<bold>2a</bold>
</xref>). The foundation model output was filtered by an area threshold of one million pixels to remove localizations that were larger than a third of the full frame image, removing 121 proposals for a return rate of &#x223c;70%. After filtering by region area, the average distance between the SAM bounding box proposals and the original annotation point was 66.0 pixels (<xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1b</bold>
</xref>). The mean distance is larger than the other datasets considered in this work.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>SAM performance on AS-AID image data. The model was prompted with points annotated by grid sampling. For all plots, clearly incorrect region proposals have been removed based on the empirical area threshold. <bold>(a)</bold> SAM region proposals are plotted in orange with the output bounding boxes and their respective center points. Green circles are the original point annotations. <bold>(b)</bold> The distribution of Euclidean distance in pixels between the center point of the SAM region proposals and the original point annotations after removing obviously incorrect, large proposals.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1469396-g001.tif">
<alt-text content-type="machine-generated">Two-panel image. Panel (a) shows an underwater scene with identification markers, using orange boxes and green points to highlight specific areas. Panel (b) is a histogram displaying data on Euclidean distance in pixels, with counts on the vertical axis. Most counts fall within zero to one hundred pixels, decreasing significantly beyond this range.</alt-text>
</graphic>
</fig>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>SAM segmentation masks on all four datasets. The model was prompted with points or bounding boxes as described in Section 2.2. The output was filtered as described in Section 3. In all figures, the green boxes and points represent ground truth annotations while orange outlines are SAM segmentation mask proposals. <bold>(a)</bold> AS-AID. <bold>(b)</bold> Station-M. <bold>(c)</bold> OBSEA. <bold>(d)</bold> FathomNet.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1469396-g002.tif">
<alt-text content-type="machine-generated">Four underwater images showcase different scenes with outlined shapes. A) Dense seabed covered with marine life, outlined in various colors. B) Sparse seafloor with small outlined boxes. C) Submerged structure with fish, highlighted with rectangles. D) Floating transparent organism, outlined for emphasis.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Station M</title>
<p>SAM was prompted with 335 ground truth point annotations and returned bounding boxes and segmentation masks (<xref ref-type="fig" rid="f2">
<bold>Figures&#xa0;2b</bold>
</xref>, <xref ref-type="fig" rid="f3">
<bold>3</bold>
</xref>). The output was thresholded by area to retain region proposals with an area of less than one million pixels. The threshold was chosen based on the size of the original images (2256 &#xd7; 1504 pixels) and set to exclude any bounding boxes larger than a third of the entire image area. This threshold was chosen empirically by observing the outputs and filtering out localizations over a given size.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>SAM performance on image data collected at Station M. The SAM model was prompted with point annotations created by human expert annotators. For all plots, clearly incorrect region proposals have already been removed based on the empirical area threshold. <bold>(a)</bold> SAM region proposals are plotted in orange with both the output bounding boxes and center points. Green circles are the original point annotations. Blue boxes are the ground truth regions made by a human expert for this project. Note that there are instances where the human expert drew boxes where the original human annotator did not indicate a jellyfish <italic>Benthocodon</italic> spp. <bold>(b)</bold> The distribution of Euclidean distance in pixels between the center point of the SAM region proposals and the original point annotations. <bold>(c)</bold> The distribution of Intersection over Union scores between the human expert&#x2019;s bounding boxes and the SAM proposals. Note that no comparison was made for new regions identified by the human expert that lacked a corresponding point label.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1469396-g003.tif">
<alt-text content-type="machine-generated">Underwater scene with marked objects and two histograms. Image a shows a seabed with colored squares highlighting objects. Chart b displays counts of Euclidean distances, with most values near zero. Chart c displays counts for intersection over union values, peaking around 0.5.</alt-text>
</graphic>
</fig>
<p>After applying the area threshold, SAM returned 284 region proposals from 335 original prompts, a return rate of approximately 85%. SAM typically missed low contrast examples of <italic>Benthocodon</italic> sp. in the far-field (approximately the upper third of images) and sometimes returned localizations that included shadows cast by the camera&#x2019;s strobes (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3a</bold>
</xref>). The average Euclidean distance between the original point annotations and the centers of the SAM bounding box proposals was approximately 15 pixels (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3b</bold>
</xref>). The average IoU between the human generated bounding boxes and the SAM proposals was 0.42 (<xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3c</bold>
</xref>).</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>OBSEA</title>
<p>The 25 image subset of the OBSEA dataset contained 117 bounding box labels. SAM was prompted with the manually drawn boxes and returned both bounding boxes and segmentation masks (<xref ref-type="fig" rid="f2">
<bold>Figures&#xa0;2c</bold>
</xref>, <xref ref-type="fig" rid="f4">
<bold>4a</bold>
</xref>). No area threshold was applied since SAM did not return any obviously incorrect region proposals with an area close to the frame size. The average distance between the centers of the manually drawn bounding boxes and the centers of the SAM bounding box proposals was 6.3 pixels (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4b</bold>
</xref>). The average IoU between the ground truth and SAM proposals was 0.39 (<xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4c</bold>
</xref>). The relatively low IoU was a function of the rectified ground truth localizations; aligning the boxes to the x-axis rather than the major axis of an individual fish clipped off extremities.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>SAM performance on OBSEA imagery. The SAM model was prompted with bounding boxes created by human expert annotators. <bold>(a)</bold> SAM region proposals are plotted in orange with both the output bounding boxes and center points. Green boxes are the original human annotations and the green circles are the box centers. <bold>(b)</bold> The distribution of Euclidean distance in pixels between the center point of the SAM bounding box proposals and the center of the human made bounding boxes. <bold>(c)</bold> The distribution of Intersection over Union scores computed from the overlap between the human ground truth and SAM region proposals.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1469396-g004.tif">
<alt-text content-type="machine-generated">Underwater scene showing a sunken structure with multiple yellow boxes highlighting objects. Two bar graphs depict data: one shows Euclidean distance in pixels with varying counts, and the other shows Intersection over Union with a range of counts, both labeled with axes.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>FathomNet</title>
<p>The 60 image subset of FathomNet data contained 60 bounding box ground truth labels of the inner filter of <italic>Bathochordaeus mcnutti</italic>. SAM returned both bounding boxes and segmentation masks based on the manually drawn input localizations (<xref ref-type="fig" rid="f2">
<bold>Figures&#xa0;2d</bold>
</xref>, <xref ref-type="fig" rid="f5">
<bold>5a</bold>
</xref>). Again, SAM did not return any obviously incorrect region proposals and did not require filtering. The average distance between the centers of the region proposals and ground truth bounding boxes was 5.9 pixels (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5b</bold>
</xref>). The mean IoU between the proposals and human generated boxes was 0.88 (<xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5c</bold>
</xref>).</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>SAM performance on FathomNet imagery of <italic>Bathochordaeus mcnutti</italic>. The SAM model was prompted with bounding boxes drawn around the inner filter by human expert annotators. <bold>(a)</bold> The SAM region proposal is plotted in orange and the original human annotation is shown in green. The dots represent the centers of the bounding boxes. <bold>(b)</bold> The distribution of Euclidean distance in pixels between the center points of the SAM proposals and the ground truth bounding boxes. <bold>(c)</bold> The distribution of Intersection over Union scores between the human ground truth and SAM region proposals.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1469396-g005.tif">
<alt-text content-type="machine-generated">Image divided into three panels. Panel (a) shows an underwater scene with lines highlighting an object. Panels (b) and (c) display bar charts. Panel (b) depicts counts versus Euclidean distance in pixels, showing a high frequency at lower distances. Panel (c) shows counts versus intersection over union, with a concentration near the value one.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>In this paper, we experimented with Meta AI&#x2019;s Segment Anything Model on four ocean-specific dataset enrichment tasks, testing its ability to return localizations from minimal human prompts. While the model was not perfect, it often produced reasonable bounding box and segmentation proposals from a given point or bounding box prompt as compared to localizations created by human experts. SAM had several important, dataset-dependent failure modes. When it struggled, the errors frequently manifested as boxes or masks with areas close to that of the full frame image. Such obviously incorrect localizations are easy to filter with an empirical area threshold. There are, however, patterns to these errors that give clues as to how foundation models might be most effectively used for enrichment of historic datasets and future annotation assistance.</p>
<p>The AS-AID imagery was the most challenging for SAM to work with. These point annotations were collected from an overhead perspective that is not well represented in the SAM training data. Moreover, the habitat represented in AS-AID is biologically diverse, including animals with complex morphologies imaged over variable substrate. 30% of the prompts caused SAM to return erroneous segmentations over the empirical size threshold. Organisms that are closely grouped in space were often segmented as a single animal. Likewise, since the prompts were from a gridded human annotation workflow, the points were not always on or near the centroid of the target. The model would then sometimes return the negative space formed by a coral branch or between the limbs of an echinoderm. The most egregious errors are easy to filter out with an area threshold, but many failures are difficult to spot with high level metrics and must be corrected by a human operator. Given SAM&#x2019;s performance on the AS-AID data, the model might be most effective for annotation enrichment when used to target particular organisms like worms, sponges, echinoderms, and certain kinds of bryozoans that occur on their own rather than in colonies. Further experiments are needed to assess SAM&#x2019;s reliability when prompted manually with points closer to the center of each object.</p>
<p>SAM was able to reliably produce quality proposals from point annotations in the fore- and middle-ground of Station M camera trap images. The model struggled with small objects far from the camera trap and animals with heavy shadows from the strobes. In the first case, SAM would missegment and return a localization above the empirical area threshold. This seems to be a consistent issue with small, relatively low contrast objects &#x2013; they blend into the background, obscuring edges and making segmentation difficult. In these experiments, about 15% of all annotations returned incorrect localizations on the scale of the entire image. <italic>Benthocodon</italic> sp. nearer the camera often appeared with bold shadows on both sides, an effect of the strobes. Because Station M is on sandy substrate, these shadows manifest as dark regions on a light background, leading SAM to return horizontally elongated region proposals that include the animal and both shadows. The wide region proposals drove the lower IoU scores relative to the output from the other datasets. This type of error is not easy to filter by simple thresholding and suggests that size estimates from SAM-derived proposals might be suspect without further inspection.</p>
<p>SAM yielded qualitatively excellent results on the OBSEA dataset, returning bounding boxes that included the entire animal and segmentation masks that adhered tightly to the outline of the target fish. The high quality results from SAM are perhaps a function of fish-like objects being present in the original training set. Indeed, the SAM developers reported high zero-shot results on the NDD20 dolphin dataset. SAM occasionally missed heavily shadowed portions of targets in the OBSEA data, losing tails and fins. These are again subtle errors that are difficult to filter with simple thresholds. Fortunately, the errors are rare for the OBSEA data and would mostly affect size estimates from the SAM output.</p>
<p>The model did well with the <italic>Bathochordaeus mcnutti</italic> data drawn from FathomNet. The bounding boxes were typically tight with the inner filter and aligned well with the ground truth. Segmentation masks likewise were tight to object outline, though tended to include the larvacean itself and occasionally cut off small edges of the filter. While the morphology of the larvacean is quite distinct from the objects used to train SAM, the animal and filter present as light pixels on a dark background; SAM functioned as a very effective edge detector. These results indicate that a foundation model might help annotators quickly create masks and boxes with several clicks. The output would be sufficient for training certain types of AI models, but perhaps not immediately usable for filter volume estimates.</p>
<p>Across the four datasets, SAM was most successful on images in the midwater. The relatively simple, uniform background was easy for the model to parse even when the target morphology (like the larvacean) was different from most of SAM&#x2019;s training data. The model likewise did well with fish, a concept and morphology it is familiar with from its training data. Both of the tested benthic environments caused distinct issues for SAM. The complexities of the organisms, the variability of the substrate, and distinctive overhead angle in AS-AID caused missegmentations in many cases. The angle and illumination of the Station M camera system caused shadows and produced many small, far-field targets that resulted in erroneous localizations from SAM.</p>
<p>While there are many important caveats, SAM&#x2019;s output in these tests is promising. One should not assume it will work out-of-the-box in all cases, but can anticipate using it to help human annotators do their job more efficiently (<xref ref-type="bibr" rid="B2">Awais et&#xa0;al., 2025</xref>). With the right user interface, a user could feed SAM unannotated marine image data and prompts to generate region proposals (<xref ref-type="bibr" rid="B10">Crosby et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B41">Zurowietz et&#xa0;al., 2019</xref>). The annotator can then accept or adjust the proposals to get an accurate bounding box or segmentation mask. This workflow will initially be most effective in sparse environments, like midwater image data where object edges are easier to detect, or when asked to find targets like fish that the model is likely already familiar with. Eventually SAM, and other foundation models, will be better able to handle data from complex marine environments with additional human feedback and marine-specific annotations (<xref ref-type="bibr" rid="B39">Zhao et&#xa0;al., 2025</xref>).</p>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>The results on these four very different marine image datasets suggest that SAM, and other foundation models, have potential for assisting human experts when creating localized annotations. While SAM&#x2019;s zero-shot results are very impressive in this domain case study, they are not good enough to be trusted in a fully operational manner; human experts should be prepared to check any data enrichment outputs and manipulate region proposals generated with direct user input. Researchers should exercise caution when attempting to apply the system as-is and spend time determining which organisms, deployment strategies, or regions might effectively leverage the model&#x2019;s strengths.</p>
<p>There is considerable potential for further studies and calibration of foundation model performance for marine applications. We reiterate that our study is best viewed as preliminary. Larger, more comprehensive studies are needed to properly establish efficacy and efficiency, both for annotator assistance and fully automated deployment. Future work might include, for example: evaluating the potential of other available foundation models for segmentation; comparing foundation model output directly against a bespoke segmentation model trained directly on the target data distribution; determining foundation model efficacy in noisy environments and establishing abilities on old, lower resolution marine image data; and executing a human&#x2013;computer interaction study to measure the efficiency gains realized when an expert human works with a foundation model to generate new annotations (<xref ref-type="bibr" rid="B33">Siriborvornratanakul, 2024</xref>). We note that executing such projects requires access to enormous annotated datasets, larger than the ones we used for this study. Indeed, the most effective solution may eventually be training of a foundation model specifically for marine images (<xref ref-type="bibr" rid="B39">Zhao et&#xa0;al., 2025</xref>).</p>
<p>If properly applied, SAM and other foundation models could be very effective for enriching previously annotated image datasets like we did in this study. Such models could also be helpful in a fully interactive manner for new annotations. The output of foundation models will certainly improve with fine-tuning after an appropriately large volume of expert-annotated ocean imagery has been collected. Likewise, fully automated models, foundation or otherwise, will continue to require expert-annotated, domain-specific imagery. But existing generic foundation models can already help speed marine scientists toward such bespoke models.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Material</bold>
</xref>. Further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>EO: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Software, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. BW: Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. LL: Data curation, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. KB: Data curation, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. BS: Data curation, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. KK: Conceptualization, Funding acquisition, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by the National Science Foundation Convergence Accelerator Track E Phase I and II (ITE-2137977 and ITE-2230776). Additional support comes from the Monterey Bay Aquarium Research Institute through generous support from the David and Lucile Packard Foundation.</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>The authors gratefully acknowledge the Station M team at MBARI for providing the annotated benthic camera trap images used in this study.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>Author BW was employed by CVision AI.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmars.2025.1469396/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmars.2025.1469396/full#supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf"/>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>sam_vit_l_0b3195.pth available at: <ext-link ext-link-type="uri" xlink:href="https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth">https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth</ext-link>
</p>
</fn>
<fn id="fn2">
<label>2</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/bioinspirlab/deepsea-sam-experiments.git">https://github.com/bioinspirlab/deepsea-sam-experiments.git</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Althaus</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Hill</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Ferrari</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Edwards</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Przeslawski</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sch&#xf6;nberg</surname> <given-names>C. H.</given-names>
</name>
<etal/>
</person-group>. (<year>2015</year>). <article-title>A standardised vocabulary for identifying benthic biota and substrata from underwater imagery: the catami classification scheme</article-title>. <source>PloS One</source> <volume>10</volume>, <elocation-id>e0141039</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0141039</pub-id>, PMID: <pub-id pub-id-type="pmid">26509918</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Awais</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Naseer</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Khan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Anwer</surname> <given-names>R. M.</given-names>
</name>
<name>
<surname>Cholakkal</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Shah</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>Foundation models defining a new era in vision: a survey and outlook</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source> <volume>47</volume>, <page-range>2245&#x2013;2264</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2024.3506283</pub-id>, PMID: <pub-id pub-id-type="pmid">40030979</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Barnard</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <source>VARS-Localize</source>. Available online at: <uri xlink:href="https://github.com/mbari-org/vars-localize">https://github.com/mbari-org/vars-localize</uri> (Accessed <access-date>July 02, 2023</access-date>).</citation></ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Beery</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Van Horn</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Perona</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Recognition in Terra Incognita</article-title>. In <person-group person-group-type="editor">
<name>
<surname>Ferrari</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Hebert</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sminchisescu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Weiss</surname> <given-names>Y.</given-names>
</name>
</person-group> (Eds.), <source>Computer vision &#x2013; ECCV 2018</source> (Lecture Notes in Computer Science), Vol. <volume>11220</volume>, <page-range>456&#x2013;473</page-range>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-01270-0_28</pub-id>
</citation></ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Belcher</surname> <given-names>B. T.</given-names>
</name>
<name>
<surname>Bower</surname> <given-names>E. H.</given-names>
</name>
<name>
<surname>Burford</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Celis</surname> <given-names>M. R.</given-names>
</name>
<name>
<surname>Fahimipour</surname> <given-names>A. K.</given-names>
</name>
<name>
<surname>Guevara</surname> <given-names>I. L.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Demystifying image-based machine learning: a practical guide to automated analysis of field imagery using modern machine learning tools</article-title>. <source>Front. Mar. Sci.</source> <volume>10</volume>, <elocation-id>1157370</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fmars.2023.1157370</pub-id>
</citation></ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bell</surname> <given-names>K. L. C.</given-names>
</name>
<name>
<surname>Quinzin</surname> <given-names>M. C.</given-names>
</name>
<name>
<surname>Poulton</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hope</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Amon</surname> <given-names>D.</given-names>
</name>
</person-group> (Eds.) (<year>2022</year>). <source>The 2022 Global Deep-Sea Capacity Assessment</source>. <publisher-name>Ocean Discovery League</publisher-name>. <publisher-loc>Saunderstown, USA</publisher-loc>. doi:&#xa0;<pub-id pub-id-type="doi">10.21428/cbd17b20.48af7fcb</pub-id>
</citation></ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bommasani</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Hudson</surname> <given-names>D. A.</given-names>
</name>
<name>
<surname>Adeli</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Altman</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Arora</surname> <given-names>S.</given-names>
</name>
<name>
<surname>von Arx</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>On the opportunities and risks of foundation models</article-title>. <source>arXiv preprint arXiv:2108.07258</source>. <publisher-name>Stanford University Human-Centered Artificial Intelligence</publisher-name>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2108.07258</pub-id>. Retrieved from <uri xlink:href="https://crfm.stanford.edu/report.html">https://crfm.stanford.edu/report.html</uri>
</citation></ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Branson</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wah</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Schroff</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Babenko</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Welinder</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Perona</surname> <given-names>P.</given-names>
</name>
<etal/>
</person-group>. (<year>2010</year>). &#x201c;<article-title>Visual recognition with humans in the loop</article-title>,&#x201d; In <person-group person-group-type="editor">
<name>
<surname>Daniilidis</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Maragos</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Paragios</surname> <given-names>N.</given-names>
</name>
</person-group> (eds) <source>Computer Vision &#x2013; ECCV 2010. Lecture Notes in Computer Science</source>. <publisher-name>Springer</publisher-name>, <publisher-loc>Berlin, Heidelberg</publisher-loc>, <volume>6314</volume>, <fpage>438</fpage>&#x2013;<lpage>451</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-642-15561-1_32</pub-id>
</citation></ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Beijbom</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Chan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Bouwmeester</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Kriegman</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>A new deep learning engine for CoralNet</article-title>,&#x201d; in <source>2021 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)</source>. <publisher-loc>Montreal, QC, Canada</publisher-loc>, <page-range>3686&#x2013;3695</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCVW54120.2021.00412</pub-id>.</citation></ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Crosby</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Orenstein</surname> <given-names>E. C.</given-names>
</name>
<name>
<surname>Poulton</surname> <given-names>S. E.</given-names>
</name>
<name>
<surname>Bell</surname> <given-names>K. L.</given-names>
</name>
<name>
<surname>Woodward</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Ruhl</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Designing ocean vision AI: An investigation of community needs for imaging-based ocean conservation</article-title>,&#x201d; in <conf-name>Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (CHI &#x2019;23)</conf-name>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>, Article 535, <page-range>1&#x2013;16</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3544548.3580886</pub-id>.</citation></ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Francescangeli</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Marini</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Mart&#xed;nez</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Del R&#xed;o</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Toma</surname> <given-names>D. M.</given-names>
</name>
<name>
<surname>Nogueras</surname> <given-names>M.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Image dataset for benchmarking automated fish detection and classification algorithms</article-title>. <source>Sci. Data</source> <volume>10</volume>, <fpage>5</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41597-022-01906-1</pub-id>, PMID: <pub-id pub-id-type="pmid">36596792</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Froese</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Pauly</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2000</year>). <source>FishBase 2000: concepts, design and data sources</source>, Vol. <volume>1594</volume>. <publisher-name>ICLARM (WorldFish)</publisher-name>, <publisher-loc>Los Ba&#xf1;os, Laguna, Philippines</publisher-loc>. <fpage>344</fpage> p.</citation></ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Fulton</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sattar</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>TrashCan: A semantically-segmented dataset towards visual detection of marine debris</article-title>. <source>arXiv preprint arXiv:2007.08097</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2007.08097</pub-id>
</citation></ref>
<ref id="B14">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Horton</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Kroh</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Ahyong</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Bailly</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Boyko</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Brand&#xe3;o</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). (<publisher-name>World Register of Marine Species (WoRMS)</publisher-name>). Available online at: <uri xlink:href="http://www.marinespecies.org">http://www.marinespecies.org</uri> (Accessed <access-date>2021-01-30</access-date>).</citation></ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname> <given-names>A. J.</given-names>
</name>
<name>
<surname>Mornin</surname> <given-names>J. D.</given-names>
</name>
<name>
<surname>Biswas</surname> <given-names>S. K.</given-names>
</name>
<name>
<surname>Beck</surname> <given-names>L. E.</given-names>
</name>
<name>
<surname>Bauer</surname> <given-names>D. P.</given-names>
</name>
<name>
<surname>Raj</surname> <given-names>A.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Quanti.us: a tool for rapid, flexible, crowd-based annotation of images</article-title>. <source>Nat. Methods</source> <volume>15</volume>, <fpage>587</fpage>&#x2013;<lpage>590</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41592-018-0069-0</pub-id>, PMID: <pub-id pub-id-type="pmid">30065368</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jaffe</surname> <given-names>J. S.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Underwater optical imaging: the past, the present, and the prospects</article-title>. <source>IEEE J. Oceanic Eng.</source> <volume>40</volume>, <fpage>683</fpage>&#x2013;<lpage>700</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JOE.2014.2350751</pub-id>
</citation></ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Janowski</surname> <given-names>&#x141;.</given-names>
</name>
<name>
<surname>Wr&#xf3;blewski</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Application and evaluation of the AI-powered Segment Anything Model (SAM) in seafloor mapping: A case study from Puck Lagoon, Poland</article-title>. <source>Remote Sens.</source> <volume>16</volume>, <fpage>2638</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs16142638</pub-id>
</citation></ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jansen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shelamoff</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Gros</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Windsor</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Hill</surname> <given-names>N. A.</given-names>
</name>
<name>
<surname>Barnes</surname> <given-names>D. K.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>The Antarctic seafloor annotated imagery database</article-title>. <source>bioRxiv</source>, <elocation-id>2023.02.16.528770</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1101/2023.02.16.528770</pub-id>
</citation></ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Katija</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Orenstein</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Schlining</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Lundsten</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Barnard</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Sainz</surname> <given-names>G.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>FathomNet: A global image database for enabling artificial intelligence in the ocean</article-title>. <source>Sci. Rep.</source> <volume>12</volume>, <fpage>15914</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-022-19939-2</pub-id>, PMID: <pub-id pub-id-type="pmid">36151130</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Katija</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Sherlock</surname> <given-names>R. E.</given-names>
</name>
<name>
<surname>Sherman</surname> <given-names>A. D.</given-names>
</name>
<name>
<surname>Robison</surname> <given-names>B. H.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>New technology reveals the role of giant larvaceans in oceanic carbon cycling</article-title>. <source>Sci. Adv.</source> <volume>3</volume>, <elocation-id>e1602374</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1126/sciadv.1602374</pub-id>, PMID: <pub-id pub-id-type="pmid">28508058</pub-id></citation></ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kirillov</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Mintun</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Ravi</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Mao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Rolland</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Gustafson</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Segment anything</article-title>. <source>IEEE/CVF International Conference on Computer Vision (ICCV)</source>, <publisher-loc>Paris, France</publisher-loc>, <page-range>3992&#x2013;4003</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00371</pub-id>.</citation></ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marini</surname> <given-names>S.</given-names>
</name>
</person-group>. (<year>2022</year>). <article-title>Image-Tagging-tool: Image Tagging (v1.0)</article-title>. <source>Zenodo</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.5281/zenodo.6566282</pub-id>
</citation></ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Norouzzadeh</surname> <given-names>M. S.</given-names>
</name>
<name>
<surname>Morris</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Beery</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Joshi</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Jojic</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Clune</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A deep active learning system for species identification and counting in camera trap images</article-title>. <source>Methods Ecol. Evol.</source> <volume>12</volume>, <fpage>150</fpage>&#x2013;<lpage>161</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/2041-210X.13504</pub-id>
</citation></ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Orenstein</surname> <given-names>E. C.</given-names>
</name>
<name>
<surname>Kenitz</surname> <given-names>K. M.</given-names>
</name>
<name>
<surname>Roberts</surname> <given-names>P. L.</given-names>
</name>
<name>
<surname>Franks</surname> <given-names>P. J.</given-names>
</name>
<name>
<surname>Jaffe</surname> <given-names>J. S.</given-names>
</name>
<name>
<surname>Barton</surname> <given-names>A. D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Semi- and fully supervised quantification techniques to improve population estimates from machine classifiers</article-title>. <source>Limnology Oceanography: Methods</source> <volume>18</volume>, <page-range>739&#x2013;753</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/lom3.10399</pub-id>
</citation></ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Perkins</surname> <given-names>N. R.</given-names>
</name>
<name>
<surname>Foster</surname> <given-names>S. D.</given-names>
</name>
<name>
<surname>Hill</surname> <given-names>N. A.</given-names>
</name>
<name>
<surname>Barrett</surname> <given-names>N. S.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Image subsampling and point scoring approaches for large-scale marine benthic monitoring programs</article-title>. <source>Estuarine Coast. Shelf Sci.</source> <volume>176</volume>, <fpage>36</fpage>&#x2013;<lpage>46</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecss.2016.04.005</pub-id>
</citation></ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Segment anything model (SAM) assisted remote sensing supervision for mariculture&#x2014;using Liaoning Province, China as an example</article-title>. <source>Remote Sens.</source> <volume>15</volume>, <fpage>5781</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs15245781</pub-id>
</citation></ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rezatofighi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Tsoi</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Gwak</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sadeghian</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Reid</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Savarese</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Generalized intersection over union: A metric and a loss for bounding box regression</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</conf-name>. <publisher-loc>Long Beach, CA, USA</publisher-loc>, <volume>2019</volume>, <fpage>658</fpage>&#x2013;<lpage>666</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00075</pub-id>
</citation></ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Robison</surname> <given-names>B. H.</given-names>
</name>
<name>
<surname>Reisenbichler</surname> <given-names>K. R.</given-names>
</name>
<name>
<surname>Sherlock</surname> <given-names>R. E.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The coevolution of midwater research and ROV technology at MBARI</article-title>. <source>Oceanography</source> <volume>30</volume>, <fpage>26</fpage>&#x2013;<lpage>37</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5670/oceanog.2017.421</pub-id>
</citation></ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Russakovsky</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.-J.</given-names>
</name>
<name>
<surname>Fei-Fei</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Best of both worlds: human-machine collaboration for object annotation</article-title>,&#x201d; in <conf-name>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Boston, MA, USA</publisher-loc>, <volume>2015</volume>, <fpage>2121</fpage>&#x2013;<lpage>2131</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2015.7298824</pub-id>
</citation></ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Schlining</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Stout</surname> <given-names>N. J.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>MBARI&#x2019;s video annotation and reference system</article-title>,&#x201d; in <source>OCEANS 2006</source> (<publisher-loc>Boston, MA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>) <volume>2006</volume>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/OCEANS.2006.306879</pub-id>
</citation></ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sherlock</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Walz</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Schlining</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Robison</surname> <given-names>B.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Morphology, ecology, and molecular biology of a new species of giant larvacean in the eastern North Pacific: <italic>Bathochordaeus mcnutti</italic> sp. nov</article-title>. <source>Mar. Biol.</source> <volume>164</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00227-016-3046-0</pub-id>, PMID: <pub-id pub-id-type="pmid">28042175</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sherman</surname> <given-names>A. D.</given-names>
</name>
<name>
<surname>Smith</surname> <given-names>K. L.</given-names> <suffix>Jr.</suffix>
</name>
</person-group> (<year>2009</year>). <article-title>Deep-sea benthic boundary layer communities and food supply: A long-term monitoring strategy</article-title>. <source>Deep Sea Res. Part II: Topical Stud. Oceanography</source> <volume>56</volume>, <fpage>1754</fpage>&#x2013;<lpage>1762</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.dsr2.2009.05.020</pub-id>
</citation></ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Siriborvornratanakul</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Reducing human annotation effort using self-supervised learning for image segmentation</article-title>,&#x201d; In <person-group person-group-type="editor">
<name>
<surname>Degen</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Ntoa</surname> <given-names>S.</given-names>
</name>
</person-group> (eds) <source>Artificial Intelligence in HCI. Lecture Notes in Computer Science</source>, vol. <volume>14734</volume>. (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>436</fpage>&#x2013;<lpage>445</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-031-60606-9_26</pub-id>
</citation></ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smith</surname> <given-names>K. L.</given-names> <suffix>Jr.</suffix>
</name>
<name>
<surname>Druffel</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Long time-series monitoring of an abyssal site in the NE Pacific: an introduction</article-title>. <source>Deep Sea Res. Part II: Topical Stud. Oceanography</source> <volume>45</volume>, <fpage>573</fpage>&#x2013;<lpage>586</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0967-0645(97)00094-5</pub-id>
</citation></ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trotter</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Atkinson</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Sharpe</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Richardson</surname> <given-names>K.</given-names>
</name>
<name>
<surname>McGough</surname> <given-names>A. S.</given-names>
</name>
<name>
<surname>Wright</surname> <given-names>N.</given-names>
</name>
<etal/>
</person-group>. (<year>2020</year>). <article-title>NDD20: A large-scale few-shot dolphin dataset for coarse and fine-grained categorisation</article-title>. <source>arXiv preprint arXiv:2005.13359</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2005.13359</pub-id>
</citation></ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Van Horn</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Branson</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Farrell</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Haber</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Barry</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ipeirotis</surname> <given-names>P.</given-names>
</name>
<etal/>
</person-group>. (<year>2015</year>). &#x201c;<article-title>Building a bird recognition app and large scale dataset with citizen scientists: The fine print in fine-grained dataset collection</article-title>,&#x201d; in <conf-name>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <publisher-loc>Boston, MA, USA</publisher-loc>, <volume>2015</volume>, <fpage>595</fpage>&#x2013;<lpage>604</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2015.7298658</pub-id>
</citation></ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wah</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Branson</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Perona</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Belongie</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>Multiclass recognition and part localization with humans in the loop</article-title>,&#x201d; in <source>2011 International Conference on Computer Vision</source> (<publisher-loc>Barcelona, Spain</publisher-loc>: <publisher-name>IEEE</publisher-name>) <volume>2011</volume>, <fpage>2524</fpage>&#x2013;<lpage>2531</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ICCV.2011.6126539</pub-id>
</citation></ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weinstein</surname> <given-names>B. G.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A computer vision for animal ecology</article-title>. <source>J. Anim. Ecol.</source> <volume>87</volume>, <fpage>533</fpage>&#x2013;<lpage>545</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1111/1365-2656.12780</pub-id>, PMID: <pub-id pub-id-type="pmid">29111567</pub-id></citation></ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Gu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Usuyama</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>H. H.</given-names>
</name>
<name>
<surname>Kiblawi</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2025</year>). <article-title>A foundation model for joint segmentation, detection and recognition of biomedical objects across nine modalities</article-title>. <source>Nat. Methods</source> <volume>22</volume>, <fpage>166</fpage>&#x2013;<lpage>176</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41592-024-02499-w</pub-id>, PMID: <pub-id pub-id-type="pmid">39558098</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zurowietz</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Langenk&#xe4;mper</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Hosking</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Ruhl</surname> <given-names>H. A.</given-names>
</name>
<name>
<surname>Nattkemper</surname> <given-names>T. W.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>MAIA&#x2014;A machine learning assisted image annotation method for environmental monitoring and exploration</article-title>. <source>PLoS One</source> <volume>13</volume>, <elocation-id>e0207498</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0207498</pub-id>, PMID: <pub-id pub-id-type="pmid">30444917</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zurowietz</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Langenk&#xe4;mper</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Nattkemper</surname> <given-names>T. W.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>BIIGLE2Go&#x2014;a scalable image annotation system for easy deployment on cruises</article-title>,&#x201d; in <source>OCEANS 2019-Marseille</source> (<publisher-loc>Marseille, France</publisher-loc>: <publisher-name>IEEE</publisher-name>) <volume>2019</volume>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/OCEANSE.2019.8867417</pub-id>
</citation></ref>
</ref-list>
</back>
</article>