<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Psychol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Psychology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Psychol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-1078</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpsyg.2026.1768533</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Approaching human visual perception through AI-based representation of figure-ground segregation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Yip</surname>
<given-names>Chakkai</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3317129"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Moroze</surname>
<given-names>Ezekiel</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3318187"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Nishina</surname>
<given-names>Shigeaki</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/15045"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="funding-acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yazdanbakhsh</surname>
<given-names>Arash</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/164561"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Computational Neuroscience and Vision Laboratory, Department of Psychological and Brain Sciences, Boston University</institution>, <city>Boston</city>, <state>MA</state>, <country country="us">United States</country></aff>
<aff id="aff2"><label>2</label><institution>Honda Research Institute Japan Co., Ltd.</institution>, <city>Wako-shi</city>, <state>Saitama</state>, <country country="jp">Japan</country></aff>
<aff id="aff3"><label>3</label><institution>Graduate Program for Neuroscience, Boston University</institution>, <city>Boston</city>, <state>MA</state>, <country country="us">United States</country></aff>
<aff id="aff4"><label>4</label><institution>Center for Systems Neuroscience, Boston University</institution>, <city>Boston</city>, <state>MA</state>, <country country="us">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Arash Yazdanbakhsh, <email xlink:href="mailto:yazdan@bu.edu">yazdan@bu.edu</email></corresp>
<fn fn-type="equal" id="fn0001">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-27">
<day>27</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1768533</elocation-id>
<history>
<date date-type="received">
<day>16</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>10</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Yip, Moroze, Nishina and Yazdanbakhsh.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Yip, Moroze, Nishina and Yazdanbakhsh</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-27">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec id="sec1001">
<title>Introduction</title>
<p>Understanding how the visual system assigns borders to foreground objects is central to figure&#x2013;ground perception, yet the computational principles underlying this process are still under investigation.</p>
</sec>
<sec id="sec2001">
<title>Methods</title>
<p>We trained multiple convolutional neural network (CNN) architectures on simple overlapping/occlusion stimuli and tested them on systematically degraded contours to probe how border-ownership (BOS) inference depends on available border context.</p>
</sec>
<sec id="sec3001">
<title>Results</title>
<p>Across networks, BOS could be inferred from feedforward computations even under degraded conditions, but performance showed a strong dependence on junction-like configurations, indicating that geometric context contributes more than isolated edges. Accuracy increased approximately linearly with the amount of contextual information provided by fragmented borders, and representation analyses revealed a hierarchical progression from local edge responses to more spatially coherent, BOS-specific features.</p>
</sec>
<sec id="sec4001">
<title>Discussion</title>
<p>Together, these results delineate which aspects of BOS can emerge from hierarchical feedforward processing and suggest that additional mechanisms such as horizontal and feedback interactions may reduce the visual information required for robust figure-ground segregation.</p>
</sec>
</abstract>
<kwd-group>
<kwd>AI saliency mapping</kwd>
<kwd>border-ownership</kwd>
<kwd>contour junctions</kwd>
<kwd>convolutional neural networks</kwd>
<kwd>feedforward processes</kwd>
<kwd>figure-ground segregation</kwd>
<kwd>partial occlusion</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="11"/>
<table-count count="0"/>
<equation-count count="3"/>
<ref-count count="29"/>
<page-count count="13"/>
<word-count count="8448"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Perception Science</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Border-ownership (BOS) refers to the process by which the brain determines which side of a border belongs to the occluding, figural object, a process that plays a key role in figure-ground segregation. Studies have identified BOS-selective neural activity in primate early visual areas for both artificial and natural stimuli (<xref ref-type="bibr" rid="ref29">Zhou et al., 2000</xref>; <xref ref-type="bibr" rid="ref11">Hesse and Tsao, 2016</xref>), yet our understanding of how these cues are integrated and represented in biological visual networks remains limited. Since convolutional neural networks (CNNs) share several key properties with the visual pathway, we can use them as a framework to explore how BOS representations might arise through hierarchical processing and to gain perspective on the mechanisms that support figure&#x2013;ground segregation in the brain.</p>
<p>Analogous to how receptive field size grows in higher visual areas, CNNs analyze progressively larger patches of an image through their processing hierarchy owing to increasing spatial integration across layers. Each stacked bundle of convolution-nonlinearity-pooling can be considered analogous to a single visual area within a processing hierarchy (<xref ref-type="bibr" rid="ref16">Lindsay, 2021</xref>). Early layers function similarly to simple cells by responding to simple features such as edges and orientations, while the represented features become more complex deeper in the CNN in a manner analogous to visual processing. As such, activation in deeper CNN layers may resemble the recruitment of higher visual areas as measured by fMRI (<xref ref-type="bibr" rid="ref9">G&#x00FC;&#x00E7;l&#x00FC; and Van Gerven, 2015</xref>). During training, CNNs use learning algorithms that are partly analogous to feedback processing in the visual system, but after training they operate exclusively through feedforward processing. In the visual system, by contrast, feedback connections between areas with different receptive fields, such as V1, V2, and V4, play a critical role in the representation of scenes (<xref ref-type="bibr" rid="ref25">Yazdanbakhsh and Gori, 2008</xref>, <xref ref-type="bibr" rid="ref26">2011</xref>; <xref ref-type="bibr" rid="ref19">Sherbakov and Yazdanbakhsh, 2013</xref>). The fixed operation of CNNs after training means they cannot use feedback signaling to process new data. Nevertheless, studies have indicated strong correlations between the activation of deeper CNN layers and the spiking activity of real neurons in higher visual areas during object recognition tasks (<xref ref-type="bibr" rid="ref24">Yamins et al., 2014</xref>; <xref ref-type="bibr" rid="ref2">Cadena et al., 2019</xref>), as well as naturally emerging BOS signals during video prediction (<xref ref-type="bibr" rid="ref27">Ye et al., 2025</xref>). Therefore, despite these differences, we can exploit the similarities between CNNs and the visual system to offer insight into how BOS is represented in the visual system.</p>
<p>Beyond simple one-to-one feedforward architectures, other modifications have been applied to achieve better performance in CNNs. Residual learning is a common technique developed to address degradation problems in deep networks through skip connections that maintain identity mappings from earlier layers (<xref ref-type="bibr" rid="ref10">He et al., 2016</xref>). From a neuronal perspective, the functional role of these shortcut pathways can be viewed as analogous to forms of feedforward cortical modulation. In primate vision, V1 sends direct projections to V4 through pathways that bypass V2 (<xref ref-type="bibr" rid="ref17">Nakamura et al., 1993</xref>). Similarly, pyramidal cells in neocortical layer V often bypass intermediate layers and project long-range axons to subcortical areas (<xref ref-type="bibr" rid="ref4">Douglas and Martin, 2004</xref>), a pattern resembling the identity-preserving character of residual CNN architectures. Predictive coding theory likewise suggests that cortical circuits send predictions top-down while passing error signals back up the hierarchy (<xref ref-type="bibr" rid="ref7">Friston, 2005</xref>).</p>
<p>Multiscale integration has also been identified as a significant property of BOS, with neurons of varying connectivity operating at multiple scales to form a complete representation. In V1, the receptive field center is defined by feedforward signals, with a laterally mediated near surround and a feedback-shaped far surround (<xref ref-type="bibr" rid="ref1">Angelucci and Bressloff, 2006</xref>). Computational models have formalized BOS within this receptive field paradigm using large-field grouping (G) units that pool over extended regions and send feedback to small-field border units, creating size-tolerant, context-dependent BOS (<xref ref-type="bibr" rid="ref3">Craft et al., 2007</xref>). Other complementary models cast BOS as cross-scale competition, in which units that share the same retinotopic center but differ in spatial scale compete through mutual suppression, allowing the scale that best explains the configuration to dominate (<xref ref-type="bibr" rid="ref12">Layton et al., 2012</xref>, <xref ref-type="bibr" rid="ref13">2014</xref>).</p>
<p>In this work, we exploited the similarities between CNNs and primate visual systems to interrogate similar mechanisms underpinning BOS representation. By investigating mechanistic differences between feedforward CNNs and visual systems, we provided insight into the specific roles of horizontal and feedback connections in BOS representation. Ultimately, we aim to provide a computational lens for interpreting BOS phenomena and help clarify which aspects of visual organization can emerge from hierarchical processing.</p>
</sec>
<sec sec-type="methods" id="sec2">
<label>2</label>
<title>Methods</title>
<p>To investigate whether CNNs can learn BOS cues from controlled geometric configurations, we developed sets of synthetic stimuli containing overlapping shapes with occlusion relationships. In this work, we use the rectangle as our base stimulus class since it has been widely employed in psychophysical and electrophysiological studies, both in the domain of BOS and in transparency perception (<xref ref-type="bibr" rid="ref22">Watanabe and Cavanagh, 1993a</xref>,<xref ref-type="bibr" rid="ref23">b</xref>; <xref ref-type="bibr" rid="ref29">Zhou et al., 2000</xref>; <xref ref-type="bibr" rid="ref28">Zhang and Von Der Heydt, 2010</xref>). The rectangle stimulus pattern provides a relatively minimal and well-controlled geometry, which reliably generates unambiguous T-junctions and occlusion relationships. By employing this well-defined stimulus class, we can compare our findings with prior biological studies while minimizing possible confounds introduced by more complicated or naturalistic shape statistics.</p>
<sec id="sec3">
<label>2.1</label>
<title>Stimuli generation</title>
<p>In order to produce images with interpretable BOS properties to the network, we systematically generated datasets of two overlapping rectangles with randomized size, relative position, and degree of overlap to represent the naturalistic variability in occlusion relationships. For each image in the dataset with fixed resolution (<inline-formula>
<mml:math id="M1">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>227</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) according to the input requirements of the network, we defined the left rectangle parameters (<inline-formula>
<mml:math id="M2">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) and the right rectangle parameters (<inline-formula>
<mml:math id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) by constrained uniform sampling. The <inline-formula>
<mml:math id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represented the starting position (top-left corner) of the rectangles, and <inline-formula>
<mml:math id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> were the widths and heights of the rectangles. Several other parameters guaranteed in-bounds placement and proper overlap: <inline-formula>
<mml:math id="M6">
<mml:mi>&#x03B1;</mml:mi>
</mml:math>
</inline-formula> signified the minimum distance between the image margin and the position of the rectangle, <inline-formula>
<mml:math id="M7">
<mml:mi>&#x03B4;</mml:mi>
</mml:math>
</inline-formula> represented the minimum interior offset of the right rectangle relative to the left, and <inline-formula>
<mml:math id="M8">
<mml:mi>&#x03B2;</mml:mi>
</mml:math>
</inline-formula> was the minimum extension of the right rectangle beyond the overlap region so that it was not fully enclosed or masked by the left rectangle. An illustration of the geometric parameterization and the distinction between fixed displacement constraints and randomly sampled variables is shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>. In our experiments, we set <inline-formula>
<mml:math id="M9">
<mml:mrow>
<mml:mi>&#x03B1;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M10">
<mml:mrow>
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M11">
<mml:mrow>
<mml:mi>&#x03B4;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M12">
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Geometric parametrization of stimulus generation. The black frame shows the image size <inline-formula>
<mml:math id="M14">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The red dashed line marks a fixed margin where no object pixels are generated, and the white dashed lines represent randomly sampled variables drawn from uniform distributions. The parameters <inline-formula>
<mml:math id="M15">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> control the size and position of the two rectangles, while <inline-formula>
<mml:math id="M16">
<mml:mrow>
<mml:mi>&#x03B1;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x03B2;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x03B4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> specify the fixed displacement constraints. All parameters are sampled within the ranges specified in the Methods.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrates two overlapping rectangles within a larger rectangular boundary labeled with height H and width W. Variables in red and white, such as xL, xR, yL, yR, wL, wR, hL, hR, &#x03B1;, &#x03B2;, and &#x03B4;, denote distances or offsets between edges and dimensions of different sections, using both solid and dashed lines to show measurements.</alt-text>
</graphic>
</fig>
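<p>As an illustrative sketch (not the published generation code), the constrained sampling described above can be realized as follows; the variable names and the exact bound arithmetic are our own assumptions chosen only to satisfy the constraints of <xref ref-type="fig" rid="fig1">Figure 1</xref>.</p>
<code language="python">import numpy as np

H = W = 227                      # image size required by the networks
ALPHA, BETA, DELTA = 20, 3, 5    # margin, minimum extension, minimum interior offset
W_MIN = H_MIN = 10               # minimum rectangle width and height

def sample_pair(rng):
    """Sample one pair of overlapping rectangles; the bound arithmetic below is an
    illustrative way to satisfy the constraints of Figure 1, not the published code."""
    # left rectangle: top-left corner and size, kept ALPHA pixels inside the frame
    xL = rng.integers(ALPHA, W - ALPHA - 2 * W_MIN)
    yL = rng.integers(ALPHA, H - ALPHA - 2 * H_MIN)
    wL = rng.integers(W_MIN, W - ALPHA - W_MIN - xL)
    hL = rng.integers(H_MIN, H - ALPHA - H_MIN - yL)
    # right rectangle: top-left corner at least DELTA pixels inside the left rectangle
    xR = rng.integers(xL + DELTA, xL + wL)
    yR = rng.integers(yL + DELTA, yL + hL)
    # right rectangle extends at least BETA pixels beyond the left rectangle's border
    wR = rng.integers(max(xL + wL + BETA - xR, W_MIN), W - ALPHA - xR)
    hR = rng.integers(max(yL + hL + BETA - yR, H_MIN), H - ALPHA - yR)
    return (xL, yL, wL, hL), (xR, yR, wR, hR)

left, right = sample_pair(np.random.default_rng(0))</code>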
<p>We rendered two luminance settings for the experiments as depicted in <xref ref-type="fig" rid="fig2">Figure 2</xref>. The luminance-invariant (contour) condition displayed the rectangles as hollow outlines with their borders set to a high-contrast value, thereby eliminating surface cues and forcing reliance on border geometry for BOS. In the luminance-variant (solid) condition, the two rectangles were filled at two grayscale levels such that:</p>
<disp-formula id="E1">
<mml:math id="M17">
<mml:mrow>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mi mathvariant="normal">,</mml:mi>
<mml:mn>250</mml:mn>
</mml:mrow>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo>|</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>|</mml:mo>
</mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>50</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M18">
<mml:mrow>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> were the grayscale fill values of the two rectangles. We enforced a minimum absolute difference of 50 to ensure a strong luminance contrast that would not trivially vanish when the two fill values were close.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Examples of the two luminance-condition stimulus types. Rows represent stimulus types, with solid indicating filled in shapes of different luminance, and contour indicating outlines of occluding shapes. Columns provide examples of borders owned by right and left rectangles, respectively. The networks were trained to distinguish right vs. left BOS using these labels via supervised learning.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Four-panel graphic showing arrangements of two overlapping rectangles. Top panels labeled "Solid" show filled gray rectangles; bottom panels labeled "Contour" display outlined rectangles. Left panels have the front rectangle on the left, right panels have the front rectangle on the right.</alt-text>
</graphic>
</fig>
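<p>For the solid condition, one way to draw the two fill levels under this constraint is a simple rejection loop; this is only a sketch, and the function and variable names are our own.</p>
<code language="python">import numpy as np

def sample_fill_levels(rng, lo=50, hi=250, min_diff=50):
    """Draw two grayscale fill values in [lo, hi] whose absolute difference is at
    least min_diff (simple rejection sampling over the constraint above)."""
    while True:
        lL, lR = rng.integers(lo, hi + 1, size=2)
        if abs(int(lL) - int(lR)) &gt;= min_diff:
            return int(lL), int(lR)

lL, lR = sample_fill_levels(np.random.default_rng(0))</code>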
<p>Models were trained in a supervised fashion, with ground-truth classification labels assigned based on the occlusion relationship of the two rectangles for a specific stimulus. The rectangle in the foreground was labeled with ownership of the shared border and the background rectangle was considered occluded. Models had to learn to classify stimuli following this occlusion labeling.</p>
<sec id="sec4">
<label>2.1.1</label>
<title>Fragmented stimuli</title>
<p>We tested the robustness of the networks by creating a series of test datasets based on the contour stimuli (<xref ref-type="fig" rid="fig3">Figure 3</xref>). We probed network tolerance under fragmented border conditions, as object boundaries in natural scenes often appear discontinuous due to occlusion or to fragmentation caused by shadows or texture. Nevertheless, BOS-selective neurons continue to respond to disrupted borders (<xref ref-type="bibr" rid="ref28">Zhang and von der Heydt, 2010</xref>), indicating the ability of biological vision to merge separate visual elements into a single border representation. To probe whether CNNs can preserve BOS assignment under fragmented conditions, we systematically designed two classes of fragmented figures to examine network performance.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Fragmented stimulus displays. <bold>(A)</bold> Example stimuli with two levels of gap. The parameter <italic>n</italic> represents the ratio between gap and continuous contour, forming dashes whose spacing is set by the value of <italic>n</italic>. For example, <italic>n</italic>&#x202F;=&#x202F;5 means that each visible pixel is followed by 5 missing pixels, whereas <italic>n</italic>&#x202F;=&#x202F;&#x2155; means that each run of 5 visible pixels is followed by one missing pixel. The two examples are shown with <italic>n</italic>&#x202F;=&#x202F;&#x2155; and 5, respectively. The upper row presents the full stimulus image, and the lower row shows a magnified view of the contour construction, where green pixels indicate the visible border segments and red pixels represent gap (missing) pixels. Variable gap levels were applied only to contour stimuli and not to solid stimuli. <bold>(B)</bold> Positional fragment indexing for the fragmented permutations of stimuli. Each numbered fragment corresponds to a different sector of the stimulus. <bold>(C)</bold> Examples of fragmented stimuli represented by 8-bit binary codes, where each digit indicates the absence (0) or presence (1) of the corresponding fragment. The results for all 256 groups of stimuli are included in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 1</xref>.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Figure with three main panels labeled A, B, and C. Panel A compares two scenarios with rectangles, showing dashed outlines and colored sections; the left uses n equals one fifth and the right uses n equals five. Panel B displays labeled diagrams of a left and right rectangle with numbered corners. Panel C shows five rows, each illustrating two overlapping rectangles with a binary eight-digit code beside each configuration.</alt-text>
</graphic>
</fig>
<p>For the first class of conditions, we produced 10 groups of figures with discontinuous borders by dividing the contours of the rectangles from the contour stimuli group into evenly spaced segments with increasing intervals (<xref ref-type="fig" rid="fig3">Figure 3A</xref>). We did not fragment the solid stimuli group since fragmenting filled shapes would introduce ambiguity about whether removed regions should be interpreted as holes or as changes in surface appearance rather than as missing contour information. The degree of fragmentation was controlled by a gap-to-border ratio, ranging from 5 border pixels with 1 gap pixel (<inline-formula>
<mml:math id="M19">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mn>5</mml:mn>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>) to 1 border pixel with 5 gap pixels (<inline-formula>
<mml:math id="M20">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). These figures were constructed using the same procedure as in the training set, but with their contours modified into dashed outlines.</p>
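<p>A minimal sketch of how such dashed contours can be produced from an ordered list of contour pixels follows; the function name and the assumption of an ordered pixel list are ours, shown only to make the gap parameterization concrete.</p>
<code language="python">import numpy as np

def apply_gap(contour_xy, visible, gap):
    """Turn an ordered list of contour pixel coordinates into a dashed contour by keeping
    `visible` consecutive pixels and then dropping `gap` pixels, repeated along the contour.
    (visible, gap) = (5, 1) corresponds to n = 1/5 and (1, 5) to n = 5."""
    period = visible + gap
    idx = np.arange(len(contour_xy))
    keep = (idx % period) &lt; visible       # True for pixels that stay visible
    return np.asarray(contour_xy)[keep]

# dashed = apply_gap(contour_xy, visible=1, gap=5)   # n = 5: one visible pixel, five missing</code>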
<p>For the second class, we used methods similar to <xref ref-type="bibr" rid="ref28">Zhang and Von Der Heydt (2010)</xref>, in which the rectangle contours were divided into 8 fragments, each either absent or present during testing. This factorial design created a total of 256 combinations. Fragments were paired diagonally between the left and right rectangles since our stimulus-generation procedure always placed the right rectangle in the bottom-right quadrant relative to the left. Each fragment was then assigned an index (<xref ref-type="fig" rid="fig3">Figure 3B</xref>), yielding a unique 8-digit binary code for every combination. Specifically, the order of the digits follows the fragment numbering in <xref ref-type="fig" rid="fig3">Figure 3B</xref>, with 1 indicating present and 0 indicating absent. For instance, 00101111 means that fragments 1, 2, and 4 were absent in the stimulus (<xref ref-type="fig" rid="fig3">Figure 3C</xref>).</p>
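<p>The enumeration of the 256 combinations can be sketched as follows (illustrative only; the helper name is ours).</p>
<code language="python">def fragment_codes():
    """Enumerate all 256 fragment combinations as 8-digit binary codes. Digit i
    (left to right) is 1 if fragment i + 1 is present and 0 if it is absent,
    following the indexing of Figure 3B."""
    for k in range(256):
        code = format(k, "08b")
        present = [i + 1 for i, bit in enumerate(code) if bit == "1"]
        yield code, present

# e.g. the code "00101111" lists fragments 3, 5, 6, 7, 8 as present (1, 2, and 4 absent)</code>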
</sec>
</sec>
<sec id="sec5">
<label>2.2</label>
<title>Model</title>
<p>CNNs are a class of deep learning models designed for image analysis. They process images layer by layer, applying learned spatial filters to local regions of the input followed by nonlinear transformations. Local responses accumulate as information moves through the network, so later layers can encode increasingly complex and spatially extended patterns. Compared with traditional computer vision systems, CNNs learn all filtering operations directly from data, optimizing their parameters through gradient-based optimization during training. In this work, three CNNs with different architectures were trained on three versions of the datasets (solid, contour, and a mix of solid/contour) with the objective of identifying which shape owned the border between two overlapping ones. We used AlexNet as our baseline model due to its relatively shallow structure and interpretable feature representations. To test whether adding residual learning or multiscale feature extraction performs better than a purely feedforward design, we chose ResNet-50 and Inception-v3, respectively, as alternative architectures (<xref ref-type="fig" rid="fig4">Figure 4</xref>).</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Illustration of the key architectural differences between the three networks evaluated. Layers refer to trainable convolutions. For Inception, the kernel sizes of the parallel convolutions are denoted in orange. Labels refer to classification heads, consisting of trainable fully connected layers and sigmoid classification layers.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram comparing three neural network architectures. Left shows a simple feedforward AlexNet with layers in sequence. Center depicts ResNet-50 with a skip connection between layers. Right presents the Inception-v3 module with parallel convolution filters of sizes one by one, three by three, and five by five within an oval.</alt-text>
</graphic>
</fig>
<p>For each stimulus group used in the cross-conditions (both solid/contour and cross-shape), we randomly generated 20,000 images and partitioned them into training, validation, and testing subsets using a 7:1:2 ratio. The validation subset was used exclusively for hyperparameter tuning, while the testing subset was reserved for the final performance evaluation. Detailed training hyperparameters and loss functions for each model are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 2</xref>. For the fragmentation conditions, which we used as a test set only for measuring robustness and not for network training, we randomly generated sets of 2,000 images per group (10 groups) for the first class of fragmentation (gap level) and 1,000 images per group (256 groups) for the second class of fragmentation (fragments).</p>
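<p>As a sketch of how the three architectures can be adapted to this binary ownership task, the snippet below uses the torchvision model constructors and layer names; the actual training code, preprocessing, and hyperparameters (listed in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table 2</xref>) may differ.</p>
<code language="python">import torch.nn as nn
from torch.utils.data import random_split
from torchvision import models

def make_model(name):
    """Build one of the three architectures with a single-logit head for right- vs.
    left-ownership classification (layer names follow torchvision; the published
    training code may differ)."""
    if name == "alexnet":
        net = models.alexnet(weights=None)
        net.classifier[6] = nn.Linear(4096, 1)
    elif name == "resnet50":
        net = models.resnet50(weights=None)
        net.fc = nn.Linear(net.fc.in_features, 1)
    else:  # "inception_v3"
        net = models.inception_v3(weights=None, aux_logits=False)  # auxiliary head omitted
        net.fc = nn.Linear(net.fc.in_features, 1)
    return net

# 20,000 images split 7:1:2 into training, validation, and test subsets
# train_set, val_set, test_set = random_split(dataset, [14_000, 2_000, 4_000])
criterion = nn.BCEWithLogitsLoss()   # sigmoid classification applied inside the loss</code>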
<sec id="sec6">
<label>2.2.1</label>
<title>AlexNet</title>
<p>AlexNet is an early and relatively simple CNN architecture that processes images through a sequence of convolutional and pooling layers arranged in a strictly feedforward manner. Information flows strictly forward through the network, with each layer operating on the output of the previous one. AlexNet is thus a comparatively shallow and structurally simple architecture by modern standards, which makes it a useful benchmark for examining how well purely one-to-one feedforward connections can account for BOS selectivity.</p>
</sec>
<sec id="sec7">
<label>2.2.2</label>
<title>Inception module</title>
<p>Compared with the traditional CNN structure consisting of stacked convolution-nonlinearity-pooling combinations, Inception networks extend the basic CNN framework by allowing multiple types of visual features to be extracted in parallel at each processing stage. Each module incorporates multiple parallel convolutional layers with different filter sizes and concatenates the resulting feature maps into a single joint representation (<xref ref-type="bibr" rid="ref21">Szegedy et al., 2015</xref>). Therefore, instead of using a single filter size at each layer, Inception modules apply several convolutional filters of different spatial scales simultaneously and combine their outputs, enabling the network to capture both fine local details and broader spatial patterns within the same layer.</p>
<p>The entire module consists of four parallel pathways: a single 1&#x00D7;1 convolution that performs channel recombination; two pathways in which a 1&#x00D7;1 convolution reduces dimensionality, followed by either a 3&#x00D7;3 or a 5&#x00D7;5 convolution to integrate medium- to large-scale spatial patterns; and a 3&#x00D7;3 max-pooling pathway that introduces translational invariance, which is potentially important because our stimuli vary in position within the image. With appropriate padding, the four pathways yield the same spatial dimensions but different numbers of channels, making the final concatenation possible.</p>
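<p>A simplified sketch of such a module is shown below (our own minimal PyTorch rendition of the four-pathway design, not the exact Inception-v3 block).</p>
<code language="python">import torch
import torch.nn as nn

class InceptionModule(nn.Module):
    """Simplified Inception-style module: four parallel pathways whose outputs are
    concatenated along the channel dimension (a sketch of the design described above)."""
    def __init__(self, in_ch, c1, c3_red, c3, c5_red, c5, pool_proj):
        super().__init__()
        # 1x1 convolution: channel recombination
        self.p1 = nn.Sequential(nn.Conv2d(in_ch, c1, 1), nn.ReLU(inplace=True))
        # 1x1 reduction followed by 3x3 convolution: medium-scale spatial patterns
        self.p2 = nn.Sequential(nn.Conv2d(in_ch, c3_red, 1), nn.ReLU(inplace=True),
                                nn.Conv2d(c3_red, c3, 3, padding=1), nn.ReLU(inplace=True))
        # 1x1 reduction followed by 5x5 convolution: larger-scale spatial patterns
        self.p3 = nn.Sequential(nn.Conv2d(in_ch, c5_red, 1), nn.ReLU(inplace=True),
                                nn.Conv2d(c5_red, c5, 5, padding=2), nn.ReLU(inplace=True))
        # 3x3 max pooling followed by 1x1 projection: translation-tolerant pathway
        self.p4 = nn.Sequential(nn.MaxPool2d(3, stride=1, padding=1),
                                nn.Conv2d(in_ch, pool_proj, 1), nn.ReLU(inplace=True))

    def forward(self, x):
        # every pathway preserves the spatial size, so the feature maps can be concatenated
        return torch.cat([self.p1(x), self.p2(x), self.p3(x), self.p4(x)], dim=1)</code>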
</sec>
<sec id="sec8">
<label>2.2.3</label>
<title>Residual connection</title>
<p>ResNet provides a solution to the degradation problem in deep networks by introducing residual (skip) connections that allow information to propagate from shallower layers to deeper layers without passing through the intermediate layers. This design enables feature reuse and preserves low-level visual information, such as edges and corners, which is particularly critical for accurately processing our geometric stimuli and supporting reliable BOS decisions.</p>
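<p>A minimal residual block illustrating the idea is sketched below; ResNet-50 itself uses deeper bottleneck blocks with batch normalization, so this is only a schematic.</p>
<code language="python">import torch.nn as nn

class ResidualBlock(nn.Module):
    """Minimal residual block: the skip connection adds the input back to the output of
    two convolutions, so edge- and corner-level information can bypass intermediate layers."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.relu(self.conv1(x))
        out = self.conv2(out)
        return self.relu(out + x)   # identity mapping preserved by the skip connection</code>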
</sec>
</sec>
<sec id="sec9">
<label>2.3</label>
<title>Analyzing performance</title>
<sec id="sec10">
<label>2.3.1</label>
<title>Network accuracy</title>
<p>Considering the multiplicity of networks and training sets, we used pairwise McNemar&#x2019;s tests (<xref ref-type="bibr" rid="ref20">Spoerer et al., 2017</xref>) to compare classification accuracy between different networks trained on the same dataset. The test&#x2019;s non-parametric nature makes it suitable for evaluating significant differences in prediction outcomes when the networks are tested on the same items. To account for multiple comparisons, <italic>p</italic>-values were adjusted using the Benjamini&#x2013;Hochberg false discovery rate (FDR) procedure, with the FDR set to 0.05, to control the expected proportion of false positives.</p>
<p>For comparisons across versions of an individual network, we implemented a two-proportion z-test to determine whether the performance of a network trained on contour stimuli and tested on solid stimuli differed significantly from its performance in the reverse direction, thereby assessing the generalization and robustness of the networks to the other stimulus type.</p>
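<p>These comparisons can be sketched as follows; the statistical routines are from statsmodels, while variable names such as p_values, n_correct_c2s, n_correct_s2c, and n_test are placeholders of ours.</p>
<code language="python">import numpy as np
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.proportion import proportions_ztest

def mcnemar_p(correct_a, correct_b):
    """McNemar's test on paired per-image correctness vectors of two networks."""
    a = np.asarray(correct_a, bool)
    b = np.asarray(correct_b, bool)
    table = [[np.sum(a &amp; b), np.sum(a &amp; ~b)],
             [np.sum(~a &amp; b), np.sum(~a &amp; ~b)]]
    return mcnemar(table, exact=False, correction=True).pvalue

# Benjamini-Hochberg FDR adjustment over all pairwise network comparisons
# reject, p_adj, _, _ = multipletests(p_values, alpha=0.05, method="fdr_bh")

# two-proportion z-test comparing the two transfer directions of a single network
# z, p = proportions_ztest(count=[n_correct_c2s, n_correct_s2c], nobs=[n_test, n_test])</code>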
</sec>
<sec id="sec11">
<label>2.3.2</label>
<title>Saliency mapping</title>
<p>To probe the internal decision-making process of the networks and identify the features driving their predictions, we examined the saliency of the learned representations through Gradient-weighted Class Activation Mapping (Grad-CAM), which highlights the most informative image regions for classification as a heatmap (<xref ref-type="bibr" rid="ref18">Selvaraju et al., 2020</xref>). Through saliency mapping, we analyzed which image cues contribute to BOS representation and how important features evolved across the stacked bundles of convolution-nonlinearity-pooling in the network hierarchy. Grad-CAM works by computing the gradient of the target class score with respect to the feature maps of a selected layer and averaging it spatially to determine the importance of each channel. The weighted combination of feature maps is then rectified and upsampled to produce a class-discriminative heatmap that highlights the image regions most influential to the model&#x2019;s prediction. This allowed visualization of how each network represents BOS-related cues and how these cues affect ownership decisions across architectures and training scenarios.</p>
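<p>A minimal Grad-CAM sketch for a single-logit classifier is given below, assuming a PyTorch implementation based on forward/backward hooks; the layer choice in the usage comment is illustrative and not necessarily the layer analyzed in this study.</p>
<code language="python">import torch
import torch.nn.functional as F

def grad_cam(model, image, target_layer):
    """Minimal Grad-CAM sketch: gradients of the output score with respect to the chosen
    layer's feature maps weight those maps channel-wise; the rectified, upsampled
    combination highlights the image regions most influential to the prediction."""
    feats, grads = {}, {}
    h1 = target_layer.register_forward_hook(lambda m, i, o: feats.update(a=o))
    h2 = target_layer.register_full_backward_hook(lambda m, gi, go: grads.update(a=go[0]))
    model.eval()
    score = model(image.unsqueeze(0))           # image: (C, H, W) tensor
    score.sum().backward()                      # gradient of the ownership score
    h1.remove(); h2.remove()
    weights = grads["a"].mean(dim=(2, 3), keepdim=True)        # channel importance
    cam = F.relu((weights * feats["a"]).sum(dim=1, keepdim=True))
    cam = F.interpolate(cam, size=image.shape[-2:], mode="bilinear", align_corners=False)
    return (cam / cam.max().clamp(min=1e-8)).squeeze()          # normalized heatmap

# e.g. heatmap = grad_cam(net, img, net.features[10])   # a deep convolutional layer of AlexNet</code>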
</sec>
</sec>
</sec>
<sec sec-type="results" id="sec12">
<label>3</label>
<title>Results</title>
<sec id="sec13">
<label>3.1</label>
<title>Quantitative analysis of fragmentation performance</title>
<sec id="sec14">
<label>3.1.1</label>
<title>Generalization of network</title>
<p>Generalization was assessed by training networks under one luminance condition and testing them under the opposite condition. All networks showed significant differences (<inline-formula>
<mml:math id="M21">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x003C;</mml:mo>
<mml:mn>0.001</mml:mn>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>I</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>6.03</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>22.26</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>11.23</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>), with networks trained on solid stimuli outperforming networks trained on contour stimuli (<xref ref-type="fig" rid="fig5">Figure 5</xref>). This implies that solid stimuli supplied more transferable information through combined contour and surface cues, whereas contour stimuli did not sufficiently prepare the networks to interpret ownership when luminance-defined regions were introduced. Overall, the results demonstrated a strong directional dependence in generalization, with solid-trained networks transferring more effectively to contour conditions than contour-trained networks transferred to solid conditions.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Classification accuracy of the three model architectures under solid/contour cross-condition evaluation. Models were trained on contour stimuli and tested on solid stimuli (train contour/test solid) or trained on solid stimuli and tested on contour stimuli (train solid/test contour; see <xref ref-type="fig" rid="fig2">Figure 2</xref> for examples of the stimulus sets). Each model architecture performed around chance (50%, marked by the dashed line) when trained on contour stimuli and tested on solid stimuli, whereas accuracy was substantially higher when trained on solid stimuli and tested on contour stimuli.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Bar chart comparing accuracy of three models: Inception, Resnet, and Alexnet, with two scenarios. Blue bars represent train solid/test contour, showing higher accuracy, while orange bars represent train contour/test solid, showing lower accuracy. A horizontal dashed line marks 0.5 accuracy.</alt-text>
</graphic>
</fig>
<p>In addition to the luminance conditions, we tested whether the networks generalized beyond a specific shape by performing a cross-shape evaluation (circle vs. rectangle) in which all networks were trained on contour rectangles and tested on contour circles with similar occlusion relationships. Despite never having been exposed to circular stimuli, all networks retained above-chance (50%) accuracy (<xref ref-type="fig" rid="fig6">Figure 6</xref>). This in part indicates that the learned representations did not depend on the overall geometry of the object but rather on more shape-invariant cues that signal BOS, such as junction configuration and relative contour arrangement. The accuracies of the three networks also followed a trend in line with those observed in the solid/contour cross-condition.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Cross-shape generalization from rectangles to circles. <bold>(A)</bold> Classification accuracy of three different network architectures trained on contour rectangle stimuli and tested on contour circle stimuli with analogous occlusion configurations. The dashed line indicates chance level (50%). All models achieve above-chance performance, with ResNet showing the strongest transfer and AlexNet the weakest. <bold>(B)</bold> Example of a left-owned test stimulus from the circular contour dataset, illustrating the geometric structure used in the cross-shape generalization experiment.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Panel A presents a bar chart comparing the accuracy of three models: Inception, Resnet, and Alexnet, with Resnet showing the highest accuracy, followed by Inception, and Alexnet the lowest. Panel B displays a graphic of two overlapping white circles on a black background.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec15">
<label>3.1.2</label>
<title>Performance across networks</title>
<p>The networks were trained on contour rectangles with continuous borders and then tested on discontinuous borders with incrementally larger gaps to assess the robustness and generalization of the different feedforward structures (<xref ref-type="fig" rid="fig3">Figure 3A</xref>). Within the small-gap (<inline-formula>
<mml:math id="M22">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) conditions, Inception performed best, exhibiting strong robustness with consistently high accuracies above 95% and significant differences from both ResNet and AlexNet (<xref ref-type="fig" rid="fig7">Figure 7</xref>). ResNet performed moderately well in this regime, with accuracies near 90%, but still significantly below Inception for nearly every gap size. AlexNet fluctuated irregularly: the steepest degradation occurred as soon as modest discontinuities were introduced, and accuracy rose again at larger gaps, though this recovery was unstable. As gap levels increased (<inline-formula>
<mml:math id="M23">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>), all networks showed a progressive decline in performance. The lack of significant differences at extreme gap levels (<inline-formula>
<mml:math id="M24">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) indicates that all networks failed under highly discontinuous contours, consistent with the absence of sufficient edge information for reliable BOS assignment.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Network performance under incremental gap levels. C represents the contour stimuli with no gap, with the gap parameter progressively indicating exaggerated dashing of the contours of the stimuli (see <xref ref-type="fig" rid="fig3">Figure 3A</xref> for example). <bold>(A)</bold> Classification accuracy across increasing gap levels. <bold>(B)</bold> Pairwise McNemar&#x2019;s test result comparing model accuracy at each gap level (A/I: AlexNet vs. Inception, A/R: AlexNet vs. ResNet, I/R: Inception vs. ResNet). Color intensity indicates the <inline-formula>
<mml:math id="M25">
<mml:mrow>
<mml:msup>
<mml:mi>&#x03C7;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> statistics, and each cell displays the FDR-adjusted <italic>p</italic>-value for the corresponding pairwise comparison.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Panel A presents a grouped bar chart comparing accuracy across AlexNet, ResNet, and Inception models at various gap levels, with Inception consistently achieving the highest accuracy. Panel B features a heatmap displaying chi-squared statistics for model comparisons at the same gap levels, with darker shades indicating higher values and lighter shades lower values.</alt-text>
</graphic>
</fig>
<p>The contrast between the performance of Inception under fragmentation (<xref ref-type="fig" rid="fig7">Figure 7</xref>) and in the solid/contour cross-condition (<xref ref-type="fig" rid="fig5">Figure 5</xref>) demonstrates that the two generalizations, i.e., across fragmentation and across solid/contour, were different. Fragmentation maintains contour-based representation and simply removes portions of the stimulus, whereas solid/contour cross-condition generalization requires the network to bridge between qualitatively different cue domains. Inception&#x2019;s multiscale structure appeared to be well suited for integrating incomplete contour information, but less effective at generalizing across contour vs. solid-based representations.</p>
</sec>
<sec id="sec16">
<label>3.1.3</label>
<title>Fragment contributions and interaction effects</title>
<p>To evaluate the significance of individual fragments and their interactions, we used a second-order regression model similar to that of <xref ref-type="bibr" rid="ref28">Zhang and Von Der Heydt (2010)</xref>, but without the contrast polarity term, as follows:</p>
<disp-formula id="E2">
<mml:math id="M26">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003C;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M27">
<mml:mi>R</mml:mi>
</mml:math>
</inline-formula> is the accuracy of the network, <inline-formula>
<mml:math id="M28">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the baseline accuracy of the network and a fitted intercept of all conditions, <inline-formula>
<mml:math id="M29">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M30">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are fragment variables (0 when absent; 1 when present). <inline-formula>
<mml:math id="M31">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math id="M32">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are regression coefficients that estimate the significance of individual fragment <inline-formula>
<mml:math id="M33">
<mml:mi>i</mml:mi>
</mml:math>
</inline-formula> and the interaction between two fragments in the BOS decision. <inline-formula>
<mml:math id="M34">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a normalization function that eliminates the global effect of fragment availability <inline-formula>
<mml:math id="M35">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula> in a given stimulus.</p>
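<p>One possible realization of this fit is sketched below; the published fitting procedure may differ, for example in how <italic>g</italic>(<italic>N</italic>) is estimated. Here the fragment-count terms are fitted first and the individual and pairwise fragment terms are then regressed on the residuals.</p>
<code language="python">import numpy as np
from itertools import combinations

def fit_fragment_model(codes, accuracy):
    """Sketch of the second-order fragment model: g(N) is fitted first and removed,
    then individual (d_i) and pairwise (f_ij) fragment terms are fitted to the residuals.
    `codes` is a (256, 8) matrix of fragment presence (0/1) and `accuracy` the network
    accuracy per condition."""
    F = np.asarray(codes, float)
    R = np.asarray(accuracy, float)
    N = F.sum(axis=1)                              # number of fragments present
    # step 1: baseline plus fragment-count effect  g(N) = beta0 + gamma1*N + gamma2*N^2
    Xg = np.column_stack([np.ones_like(N), N, N ** 2])
    (beta0, g1, g2), *_ = np.linalg.lstsq(Xg, R, rcond=None)
    resid = R - Xg @ np.array([beta0, g1, g2])
    # step 2: individual coefficients d_i and pairwise interactions f_ij on the residuals
    pairs = list(combinations(range(8), 2))
    Xf = np.column_stack([F] + [F[:, i:i + 1] * F[:, j:j + 1] for i, j in pairs])
    coef, *_ = np.linalg.lstsq(Xf, resid, rcond=None)
    d, f = coef[:8], coef[8:]
    return beta0, d, dict(zip(pairs, f)), (g1, g2)</code>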
<p>The model revealed a highly structured pattern in how the networks performed BOS assignment: the individual fragment coefficients displayed a sharply unequal distribution of significance. Among the eight fragments (<xref ref-type="fig" rid="fig8">Figure 8A</xref>), only fragment 8 exhibited a strong positive effect on the BOS decision. In contrast, the remaining seven fragments showed only small negative or near-zero effects, suggesting that they contribute little individually and may even introduce local ambiguity when present alone. This asymmetric pattern implies that the networks rely selectively on specific, spatially critical fragments rather than treating all boundary segments equivalently. The fragment interaction coefficients further clarified this pattern (<xref ref-type="fig" rid="fig8">Figure 8B</xref>). Several fragment pairs exhibited strong positive interactions (1&#x2013;8, 3&#x2013;7, 3&#x2013;6, 5&#x2013;6), demonstrating a synergistic effect in which BOS information became more complete and more apparent to the network when these fragments occurred together than separately. This interaction likely reflects the grouping mechanism in biological vision that relies on particular combinations of elements for recognition. Conversely, some fragment pairs interfered with the network&#x2019;s decision (1&#x2013;3, 5&#x2013;8) by introducing confounding or opposing structural cues about BOS.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>Fragment contributions and interaction effects estimated by the regression model. <bold>(A)</bold> Coefficients of the individual fragments (<xref ref-type="fig" rid="fig3">Figure 3B</xref>), averaged across networks. <bold>(B)</bold> Matrix of pairwise interaction coefficients <inline-formula>
<mml:math id="M36">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. A high <inline-formula>
<mml:math id="M37">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> implies a synergistic effect in BOS decision performance for that pair of fragments.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g008.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Figure with two panels labeled A and B. Panel A is a bar chart showing coefficient d subscript i values for fragment indices one through eight, with error bars. Most coefficients are negative or near zero, except for index eight, which is positive and larger in magnitude. Panel B is a heat map depicting coefficient f subscript i j values for fragment indices one through eight on both axes, with a color scale from approximately negative zero point zero two five to positive zero point zero two five. Blue signifies lower values and yellow higher values, illustrating variation across the matrix.</alt-text>
</graphic>
</fig>
<p>To account for the influence of global fragment counts, we introduced a normalization function that adjusted for the general improvement in classification accuracy as more fragment information was present in the stimulus.</p>
<disp-formula id="E3">
<mml:math id="M38">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msup>
<mml:mi>N</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where the linear coefficient <inline-formula>
<mml:math id="M39">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the improvement from adding fragments regardless of their specific location. The nonlinear coefficient <inline-formula>
<mml:math id="M40">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> captures the curvature of this relationship, expressing the degree of saturation or acceleration as more fragments are added. In the simulation, the two coefficients indicated a consistently positive effect of fragment availability (<xref ref-type="fig" rid="fig9">Figure 9</xref>). The large linear term indicated that each additional fragment increased accuracy on average, and the small quadratic term suggested that this improvement did not saturate within the tested range. Instead, the network continued to benefit from incremental fragment cues even at high fragment availability. The bimodal distribution of accuracies observed at each availability level arose primarily from the strong contribution of fragment 8, whose presence or absence largely determined whether sufficient occlusion-specific structure was available to support a reliable BOS decision.</p>
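<p>As an illustration of how the availability curve in <xref ref-type="fig" rid="fig9">Figure 9</xref> can be obtained, the sketch below fits the quadratic normalization function by least squares, both over all conditions and separately for the groups with fragment 8 present or absent; the input arrays are hypothetical placeholders rather than the recorded network accuracies.</p>
<code language="python">
import numpy as np

def fit_availability_curve(counts, accuracies):
    # Fit g(N) = gamma1*N + gamma2*N**2 + beta0 by ordinary least squares.
    X = np.column_stack([counts, counts ** 2, np.ones_like(counts, dtype=float)])
    coefs, *_ = np.linalg.lstsq(X, accuracies, rcond=None)
    return coefs  # (gamma1, gamma2, beta0)

# Illustrative inputs (hypothetical names): one row per stimulus condition.
# presence is the binary fragment-presence matrix, accuracy the network accuracy.
rng = np.random.default_rng(1)
presence = rng.integers(0, 2, size=(256, 8))
accuracy = rng.uniform(0.5, 1.0, size=256)
N = presence.sum(axis=1).astype(float)

overall = fit_availability_curve(N, accuracy)
with_f8 = fit_availability_curve(N[presence[:, 7] == 1], accuracy[presence[:, 7] == 1])
without_f8 = fit_availability_curve(N[presence[:, 7] == 0], accuracy[presence[:, 7] == 0])
print(overall, with_f8, without_f8)
</code>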
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Effect of fragment availability (<italic>N</italic>) on classification accuracy. Fragment availability refers to the number of fragments present in the stimulus, irrespective of their position. For each number of fragments available, all combinations of fragment positions were tested, with each point corresponding to the accuracy for a single stimulus condition with a specific fragment configuration (<xref ref-type="fig" rid="fig3">Figure 3C</xref>). Points are grouped by fragment availability, with a small horizontal jitter applied for visualization, and the red curve shows the fitted normalization function with linear coefficient <inline-formula>
<mml:math id="M41">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>0.050</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and quadratic coefficient <inline-formula>
<mml:math id="M42">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>0.004</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The dashed purple and yellow curves represent separate regression analyses for the stimulus groups in which fragment 8 is present and absent, respectively.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g009.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot showing accuracy versus fragment availability (N), with three fitted lines: a red solid line, a purple dashed line, and a yellow dashed line. Legend indicates &#x03B3;1 equals 0.050 and &#x03B3;2 equals 0.004. Axis labels are "Accuracy" (Y-axis) and "Fragment Availability (N)" (X-axis). Blue dots represent individual data points.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="sec17">
<label>3.2</label>
<title>Saliency-based analysis</title>
<sec id="sec18">
<label>3.2.1</label>
<title>Saliency across network hierarchy</title>
<p>Across all three networks trained on the mixed dataset, Grad-CAM visualizations showed a clear hierarchical progression in how BOS information was represented (<xref ref-type="fig" rid="fig10">Figure 10</xref>). Shallow layers were most responsive to low-level visual elements such as edges, corners (L-junctions), and isolated contour fragments, generating saliency patterns that were spatially scattered and not yet aligned with the ownership-defining border. Intermediate layers began to integrate these local features into larger spatial structures, showing more continuous activation along contour segments and increased sensitivity to the relative arrangement of the two rectangles. By the deep layers, saliency converged into a compact, highly localized region centered on the occluding area, indicating that ownership decisions were made at later stages of processing; the activation patterns consistently corresponded to the correct ownership boundary across stimulus types. Overall, these results demonstrate that BOS selectivity is a hierarchical computation that gradually evolves from distributed local feature detection to a coherent, border-specific representation in deeper stages of the network.</p>
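<p>A minimal sketch of a Grad-CAM computation of the kind used here is given below, assuming a PyTorch model; the chosen layers, the random input, and the preprocessing are illustrative assumptions rather than the exact pipeline. Following <xref ref-type="bibr" rid="ref18">Selvaraju et al. (2020)</xref>, the channel weights are the spatially averaged gradients of the class score with respect to the layer activations, and the weighted activation map is rectified and upsampled to the input resolution.</p>
<code language="python">
import torch
import torch.nn.functional as F
from torchvision import models

def grad_cam(model, layer, image, class_idx):
    # Compute a Grad-CAM heatmap for one image at a chosen layer.
    # image: tensor of shape (1, 3, H, W); class_idx: target class index.
    activations, gradients = {}, {}

    def fwd_hook(module, inputs, output):
        activations["value"] = output
        output.register_hook(lambda grad: gradients.update(value=grad))

    handle = layer.register_forward_hook(fwd_hook)
    scores = model(image)
    scores[0, class_idx].backward()
    handle.remove()

    acts = activations["value"]                       # (1, C, h, w) feature maps
    grads = gradients["value"]                        # gradients of the class score
    weights = grads.mean(dim=(2, 3), keepdim=True)    # spatially averaged gradients
    cam = F.relu((weights * acts).sum(dim=1, keepdim=True))
    cam = F.interpolate(cam, size=image.shape[2:], mode="bilinear", align_corners=False)
    return (cam / cam.max().clamp(min=1e-8)).squeeze()

# Illustrative usage: saliency at a shallow vs. a deep layer of AlexNet.
model = models.alexnet(weights=None).eval()
x = torch.randn(1, 3, 224, 224, requires_grad=True)
shallow_map = grad_cam(model, model.features[2], x, class_idx=0)   # after pool1
model.zero_grad()
deep_map = grad_cam(model, model.features[12], x, class_idx=0)     # after pool5
</code>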
<fig position="float" id="fig10">
<label>Figure 10</label>
<caption>
<p>Visualization of Grad-CAM across network architectures and layer depth (AlexNet: <italic>pool1/pool2/pool5</italic>; ResNet: <italic>add1/add8/add16</italic>; Inception: <italic>mixed1/mixed5/mixed10</italic>). Heatmaps show the regions of the stimuli most salient to the network&#x2019;s BOS classification. In shallow layers, networks appeared to focus on low-level features such as edges, whereas in deeper layers saliency was refined to T-junctions and corners (L-junctions). All networks evaluated were trained on mixed datasets consisting of both contour and solid stimulus patterns. <italic>poolK, addK, mixedK</italic> denote network layers; a higher <italic>K</italic> indicates a deeper layer.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g010.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Comparison of heatmap visualizations for Alexnet, Resnet, and Inception neural networks shows attention distributions at shallow, intermediate, and deep layers across four input types: solid right, solid left, contour right, and contour left rectangles. Warmer colors indicate stronger model focus within and around rectangles.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec19">
<label>3.2.2</label>
<title>Saliency across fragmentation</title>
<p>Under fragmentation, Grad-CAM visualizations showed a gradual loss and reorganization of BOS representations with increasing gap level (<xref ref-type="fig" rid="fig11">Figure 11</xref>). For small gaps, saliency aligned strongly with the fragmented contour segments that together defined the owned border, suggesting that the networks captured discontinuous local evidence and assembled it into a coherent ownership signal. As gap size grew, saliency became less continuous and was distributed more unevenly among the remaining contour fragments. At very large gap levels, however, saliency became spatially non-specific, especially in deep layers, where it was more spread out and shifted toward regions other than the occluded area. Under these conditions, the deep-layer activation patterns ceased to be coherently associated with the true ownership boundary, and classification accuracy correspondingly dropped sharply even when early-layer saliency remained consistent with that observed for intact contours. In general, the representation of BOS in CNNs degrades with increasing fragmentation, reflecting a transition from continuous contour-based representations to sparse, unstable cues that cannot support reliable ownership assignment.</p>
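<p>One plausible reading of the gap parameter, consistent with gap level <italic>n</italic>&#x202F;=&#x202F;1 corresponding to every other contour pixel being visible, is a periodic on/off mask along the contour. The sketch below is an illustration under that assumption only; the duty-cycle interpretation for fractional <italic>n</italic> and the array names are hypothetical and do not reproduce the exact stimulus-generation procedure.</p>
<code language="python">
import numpy as np
from fractions import Fraction

def gap_mask(length, n):
    # Periodic visibility mask for a contour of `length` pixels.
    # Gap level n is read here as the ratio of hidden to visible pixels:
    # n = 1 keeps every other pixel; n = 3 keeps 1 of every 4; n = 1/3
    # hides 1 of every 4. This reading is an assumption for illustration.
    frac = Fraction(n).limit_denominator(100)
    hidden, visible = frac.numerator, frac.denominator
    period = hidden + visible
    phase = np.arange(length) % period
    return np.less(phase, visible)

# Apply the mask to the pixel coordinates of one contour (hypothetical arrays).
contour_rows = np.arange(40)            # e.g., a vertical border segment
contour_cols = np.full(40, 64)
keep = gap_mask(40, Fraction(1, 3))
stimulus = np.zeros((128, 128), dtype=np.uint8)
stimulus[contour_rows[keep], contour_cols[keep]] = 255
</code>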
<fig position="float" id="fig11">
<label>Figure 11</label>
<caption>
<p>Visualization of Grad-CAM across gap increment levels (<italic>n</italic> &#x2208; {&#x2155;, &#x2153;, 1, 3, 5}, see <xref ref-type="fig" rid="fig2">Figure 2</xref>). Heatmaps show the regions of the stimuli most salient to the network&#x2019;s BOS classification, i.e., the regions most indicative of the BOS relationship at different depths of the network. Saliency tended to become less specific as the gap increment (<italic>n</italic>) increased. All networks evaluated were trained on contour datasets with no gap.</p>
</caption>
<graphic xlink:href="fpsyg-17-1768533-g011.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Heatmap visualization showing activation patterns for Alexnet, Resnet, and Inception neural network models. Each model has shallow, intermediate, and deep layers displayed in columns, with varying configurations per row. Warmer colors indicate higher activation.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="sec20">
<label>4</label>
<title>Discussion</title>
<p>We trained various hierarchical CNN architectures to perform basic BOS and used explainability methods to interrogate the stimulus features underpinning their ownership classification. This study shows how BOS develops through generalization in <italic>in silico</italic> hierarchical feedforward visual networks, granting insight into feedforward edge/junction detection mechanisms critical to BOS that could be analogous to those in biological systems. By examining where the performance of feedforward architectures degraded, we can also gain insight into the contribution of feedback connections in biological systems.</p>
<sec id="sec21">
<label>4.1</label>
<title>Hierarchical BOS representation</title>
<p>According to the saliency analysis, the CNN architectures tested showed a hierarchical pattern of BOS-related representation. Shallow layers mainly responded to edges, intermediate layers focused on extended contour patches, corners, and junction structure, and deep layers generated compact, spatially coherent activation patterns that mapped closely onto the occlusion region rather than onto other individual features. This progression was consistent across stimulus types and architectures, suggesting that it embodies a general attribute of hierarchical CNN processing.</p>
<p>This pattern mirrors the organization of the primate visual system, where receptive fields expand in size and contextual modulation increases along the ventral stream (<xref ref-type="bibr" rid="ref5">Felleman and Van Essen, 1991</xref>; <xref ref-type="bibr" rid="ref1">Angelucci and Bressloff, 2006</xref>). Specifically, occlusion-centered saliency emerged in deep layers, implying that, rather than merely encoding the explicit border, CNNs create ownership representations that reflect a more global interpretation of figure&#x2013;ground structure. These representations are analogous to those found in incrementally higher visual areas, where BOS signals accommodate local variations in edge location while retaining sensitivity to overall object configuration (<xref ref-type="bibr" rid="ref11">Hesse and Tsao, 2016</xref>).</p>
<p>However, this hierarchical emergence occurred in all network structures even without the explicit recurrent or feedback connections that are critical for BOS processing in biological systems (<xref ref-type="bibr" rid="ref12">Layton et al., 2012</xref>, <xref ref-type="bibr" rid="ref13">2014</xref>). This indicates that feedforward processes are adequate to generate BOS-like representations robust to different fragmentation and gap conditions, but it does not imply mechanistic equivalence with cortical circuits, nor does it reject the notion that feedback contributes to further consolidation of BOS representations. Rather, our results clarify which aspects of BOS can be represented by feedforward processes and which likely require additional circuitry.</p>
</sec>
<sec id="sec22">
<label>4.2</label>
<title>Role of T-junction cues</title>
<p>The regression analysis (<xref ref-type="fig" rid="fig8">Figure 8A</xref>) revealed that the BOS decisions made by the networks were strongly influenced by fragment 8. This fragment was located at the corner of the overlapping region between the two rectangles and, together with its paired counterpart, formed a T-junction configuration characteristic of occlusion geometry. The disproportionately large and significant individual fragment coefficient associated with it suggests that the networks assign substantially greater importance to this central occluding feature than to others when inferring and representing ownership. This is supported by the saliency maps, in which the deep-layer activations were consistently concentrated on this fragment across stimuli and networks.</p>
<p>These results are similar to those reported by <xref ref-type="bibr" rid="ref29">Zhou et al. (2000)</xref>, who demonstrated that BOS-selective neurons respond most robustly at regions that participate in a T-junction. Unlike individual edges, T-junctions determine BOS by indicating the geometry of the occlusion rather than relying solely on orientation, which makes them one of the most reliable monocular cues for depth ordering and figure&#x2013;ground segregation in biological vision. The configuration of T-junctions also provides a basis for ownership inference across changes in object size, because their geometric patterns remain unchanged under scaling. Occlusion cues based on junction geometry have been shown to support figure&#x2013;ground segregation across variations in projected retinal size when relative contour support and alignment are maintained (<xref ref-type="bibr" rid="ref14">Lesher, 1995</xref>).</p>
<p>As such, the dominant effect of the occluding fragment implies that the networks developed a preference for junction-like, overlapping configurations during supervised learning, incorporating a biologically established BOS cue that enables a size-invariant representation of occlusion.</p>
</sec>
<sec id="sec23">
<label>4.3</label>
<title>Fragment integration and gestalt principle in BOS</title>
<p>Aside from individual fragment effects, BOS performance was also strongly influenced by the continuity of the contour arrangement. Our findings showed that BOS accuracy was stable up to gap level <italic>n</italic>&#x202F;=&#x202F;1, i.e., with every other pixel visible (<xref ref-type="fig" rid="fig7">Figure 7</xref>). Under these circumstances, the networks can integrate edge information across small gaps to form a coherent representation of the ownership border. The networks maintained a meaningful BOS representation up to gap level <italic>n</italic>&#x202F;=&#x202F;1, and only partially up to <italic>n</italic>&#x202F;=&#x202F;2, 3, where the borders in the stimulus are fragmented into dot-like segments. This transition indicated a failure of the networks to maintain perceptual continuation. Conventional Gestalt concepts suggest that collinear or smoothly oriented elements are more likely to be grouped into a single contour, whereas spatially separated points lack sufficient structure to support grouping (<xref ref-type="bibr" rid="ref6">Field et al., 1993</xref>). In other words, contour integration performance degrades dramatically when elements lose adequate continuation cues, even while local orientation information is preserved.</p>
<p>Consistent with this view, the early-layer saliency maps in our networks continued to pick up edges and corners even at larger contour gaps, but this local signal did not carry through to stable BOS representations at deeper stages, indicating that the performance reduction was caused not by weakened edge detection but by the networks&#x2019; inability to perform global integration. In biological vision, long-range horizontal and feedback connections are believed to support contour completion and perceptual grouping when local evidence is sparse, making it possible for figure&#x2013;ground signals to persist despite fragmented input (<xref ref-type="bibr" rid="ref1">Angelucci and Bressloff, 2006</xref>; <xref ref-type="bibr" rid="ref8">Gilbert and Li, 2013</xref>). The lack of such mechanisms in the purely feedforward networks tested here most likely makes them vulnerable when contour continuity no longer supports Gestalt grouping.</p>
<p>With regard to the effect of fragment availability, the networks exhibited a strong linear and weak quadratic relationship with the number of fragments present (<xref ref-type="fig" rid="fig9">Figure 9</xref>). This trend is in accordance with the data of <xref ref-type="bibr" rid="ref28">Zhang and von der Heydt (2010)</xref>, who reported a positive correlation between contextual fragment availability and response magnitude among BOS-selective neurons, suggesting that more contextual information reinforces the signaling of BOS. However, the response profiles differ: whereas the biological data show a rapid increase followed by clear saturation as fragment availability increases, the networks in our study show a more linear dependence and minimal signs of saturation.</p>
</sec>
</sec>
<sec id="sec24">
<label>5</label>
<title>Limitations and future directions</title>
<p>The CNN architectures tested clearly showed hierarchical patterns of BOS representation mirroring those of biological visual systems. It is important to note, however, the simplicity of the stimuli used to train the networks to distinguish BOS; it is possible that the lack of feedback and horizontal processing would create greater mechanistic divergence between CNNs and biological systems. We also found that, whereas the biological data show clear saturation as contextual information in the stimulus increases, the networks in our study show a more linear dependence and minimal signs of saturation.</p>
<p>Training networks on more complex BOS stimuli with more varied objects and junction geometries, or even on natural images, could reveal how more realistic BOS tasks, with contextual cues that are harder to interpret, benefit from the additional connectivity of biological systems. Models may have accomplished the BOS task through representations of depth, occlusion, or BOS between shapes in the stimulus, or via other means that are correlated with BOS. While this ambiguity is not unique to CNNs, as electrophysiological studies have likewise relied on simplified geometric stimuli that may also conflate BOS with closely related visual cues (<xref ref-type="bibr" rid="ref29">Zhou et al., 2000</xref>), training with more complex stimuli could strengthen the interpretation of Grad-CAM saliency maps as visualizations of stimulus regions strictly related to BOS. To enrich our understanding of the differing contextual requirements of CNNs and biological systems, models could also be tested with other mechanisms of fragment removal or alteration (such as fragment rotation) to examine the precise impact of visual-context availability with regard to continuity.</p>
<p>The networks tested here rely solely on feedforward processing, whereas biological systems are equipped with feedback among other mechanisms. These additional processes could help corroborate contextual information, supporting more certain BOS representations from less information. Hybrid CNN-RNN (recurrent neural network) models with recurrent connections approximating biological feedback exist, but they have not been extensively evaluated beyond their performance on basic image recognition benchmarks (<xref ref-type="bibr" rid="ref15">Liang and Hu, 2015</xref>; <xref ref-type="bibr" rid="ref20">Spoerer et al., 2017</xref>). Interrogating these architectures would allow a direct comparison with purely feedforward systems, potentially providing insight into the differences in BOS processing that arise from these fundamental differences in connectivity.</p>
<p>In addition, disparities in performance and representational robustness across networks could be due to variations in the number of layers, the total number of learnable parameters, and architectural motifs. For instance, the superior accuracy and more stable saliency pattern of Inception (50 layers) across gap increment levels could arise simply from its having more layers than AlexNet (8 layers). A more reliable approach would be to constrain these confounding characteristics and parameters while keeping the training data constant. Exploring this issue could clarify whether the observed advantages are due mainly to architectural principles (e.g., multiscale integration) or simply to increased network depth.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec25">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>.</p>
</sec>
<sec sec-type="author-contributions" id="sec26">
<title>Author contributions</title>
<p>CY: Conceptualization, Formal analysis, Investigation, Methodology, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Data curation, Visualization, Software. EM: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing, Software. SN: Investigation, Validation, Writing &#x2013; review &#x0026; editing, Supervision, Funding acquisition. AY: Investigation, Supervision, Validation, Writing &#x2013; review &#x0026; editing, Conceptualization, Formal analysis, Methodology, Project administration, Resources, Software, Writing &#x2013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="sec27">
<title>Conflict of interest</title>
<p>SN was employed by Honda Research Institute Japan Co., Ltd.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec28">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec29">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec30">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fpsyg.2026.1768533/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fpsyg.2026.1768533/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Angelucci</surname><given-names>A.</given-names></name> <name><surname>Bressloff</surname><given-names>P. C.</given-names></name></person-group> (<year>2006</year>). <article-title>Contribution of feedforward, lateral and feedback connections to the classical receptive field center and extra-classical receptive field surround of primate V1 neurons</article-title>. <source>Prog. Brain Res.</source> <volume>154</volume>, <fpage>93</fpage>&#x2013;<lpage>120</lpage>. doi: <pub-id pub-id-type="doi">10.1016/S0079-6123(06)54005-1</pub-id>, <pub-id pub-id-type="pmid">17010705</pub-id></mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cadena</surname><given-names>S. A.</given-names></name> <name><surname>Denfield</surname><given-names>G. H.</given-names></name> <name><surname>Walker</surname><given-names>E. Y.</given-names></name> <name><surname>Gatys</surname><given-names>L. A.</given-names></name> <name><surname>Tolias</surname><given-names>A. S.</given-names></name> <name><surname>Bethge</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Deep convolutional models improve predictions of macaque V1 responses to natural images</article-title>. <source>PLoS Comput. Biol.</source> <volume>15</volume>:<fpage>e1006897</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pcbi.1006897</pub-id>, <pub-id pub-id-type="pmid">31013278</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Craft</surname><given-names>E.</given-names></name> <name><surname>Sch&#x00FC;tze</surname><given-names>H.</given-names></name> <name><surname>Niebur</surname><given-names>E.</given-names></name> <name><surname>von der Heydt</surname><given-names>R.</given-names></name></person-group> (<year>2007</year>). <article-title>A neural model of figure&#x2013;ground organization</article-title>. <source>J. Neurophysiol.</source> <volume>97</volume>, <fpage>4310</fpage>&#x2013;<lpage>4326</lpage>. doi: <pub-id pub-id-type="doi">10.1152/jn.00203.2007</pub-id>, <pub-id pub-id-type="pmid">17442769</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Douglas</surname><given-names>R. J.</given-names></name> <name><surname>Martin</surname><given-names>K. A. C.</given-names></name></person-group> (<year>2004</year>). <article-title>Neuronal circuits of the neocortex</article-title>. <source>Annu. Rev. Neurosci.</source> <volume>27</volume>, <fpage>419</fpage>&#x2013;<lpage>451</lpage>. doi: <pub-id pub-id-type="doi">10.1146/annurev.neuro.27.070203.144152</pub-id>, <pub-id pub-id-type="pmid">15217339</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Felleman</surname><given-names>D. J.</given-names></name> <name><surname>Van Essen</surname><given-names>D. C.</given-names></name></person-group> (<year>1991</year>). <article-title>Distributed hierarchical processing in the primate cerebral cortex</article-title>. <source>Cereb. Cortex</source> <volume>1</volume>, <fpage>1</fpage>&#x2013;<lpage>47</lpage>. doi: <pub-id pub-id-type="doi">10.1093/cercor/1.1.1-a</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Field</surname><given-names>D. J.</given-names></name> <name><surname>Hayes</surname><given-names>A.</given-names></name> <name><surname>Hess</surname><given-names>R. F.</given-names></name></person-group> (<year>1993</year>). <article-title>Contour integration by the human visual system: evidence for a local &#x201C;association field.&#x201D;</article-title>. <source>Vis. Res.</source> <volume>33</volume>, <fpage>173</fpage>&#x2013;<lpage>193</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0042-6989(93)90156-Q</pub-id>, <pub-id pub-id-type="pmid">8447091</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Friston</surname><given-names>K.</given-names></name></person-group> (<year>2005</year>). <article-title>A theory of cortical responses</article-title>. <source>Philos. Trans. R. Soc. Lond. Ser. B Biol. Sci.</source> <volume>360</volume>, <fpage>815</fpage>&#x2013;<lpage>836</lpage>. doi: <pub-id pub-id-type="doi">10.1098/rstb.2005.1622</pub-id>, <pub-id pub-id-type="pmid">15937014</pub-id></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gilbert</surname><given-names>C. D.</given-names></name> <name><surname>Li</surname><given-names>W.</given-names></name></person-group> (<year>2013</year>). <article-title>Top-down influences on visual processing</article-title>. <source>Nat. Rev. Neurosci.</source> <volume>14</volume>, <fpage>350</fpage>&#x2013;<lpage>363</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrn3476</pub-id>, <pub-id pub-id-type="pmid">23595013</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>G&#x00FC;&#x00E7;l&#x00FC;</surname><given-names>U.</given-names></name> <name><surname>Van Gerven</surname><given-names>M. A. J.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep neural networks reveal a gradient in the complexity of neural representations across the ventral stream</article-title>. <source>J. Neurosci.</source> <volume>35</volume>, <fpage>10005</fpage>&#x2013;<lpage>10014</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.5023-14.2015</pub-id>, <pub-id pub-id-type="pmid">26157000</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>He</surname><given-names>K.</given-names></name> <name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Ren</surname><given-names>S.</given-names></name> <name><surname>Sun</surname><given-names>J.</given-names></name></person-group> (<year>2016</year>). <source>Deep residual learning for image recognition., in 2016 IEEE conference on computer vision and pattern recognition (CVPR)</source>. <publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hesse</surname><given-names>J. K.</given-names></name> <name><surname>Tsao</surname><given-names>D. Y.</given-names></name></person-group> (<year>2016</year>). <article-title>Consistency of border-ownership cells across artificial stimuli, natural stimuli, and stimuli with ambiguous contours</article-title>. <source>J. Neurosci.</source> <volume>36</volume>, <fpage>11338</fpage>&#x2013;<lpage>11349</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.1857-16.2016</pub-id>, <pub-id pub-id-type="pmid">27807174</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Layton</surname><given-names>O. W.</given-names></name> <name><surname>Mingolla</surname><given-names>E.</given-names></name> <name><surname>Yazdanbakhsh</surname><given-names>A.</given-names></name></person-group> (<year>2012</year>). <article-title>Dynamic coding of border-ownership in visual cortex</article-title>. <source>J. Vis.</source> <volume>12</volume>:<fpage>8</fpage>. doi: <pub-id pub-id-type="doi">10.1167/12.13.8</pub-id>, <pub-id pub-id-type="pmid">23220579</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Layton</surname><given-names>O. W.</given-names></name> <name><surname>Mingolla</surname><given-names>E.</given-names></name> <name><surname>Yazdanbakhsh</surname><given-names>A.</given-names></name></person-group> (<year>2014</year>). <article-title>Neural dynamics of feedforward and feedback processing in figure-ground segregation</article-title>. <source>Front. Psychol.</source> <volume>5</volume>:<fpage>972</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2014.00972</pub-id>, <pub-id pub-id-type="pmid">25346703</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lesher</surname><given-names>G. W.</given-names></name></person-group> (<year>1995</year>). <article-title>Illusory contours: toward a neurally based perceptual theory</article-title>. <source>Psychon. Bull. Rev.</source> <volume>2</volume>, <fpage>279</fpage>&#x2013;<lpage>321</lpage>. doi: <pub-id pub-id-type="doi">10.3758/BF03210970</pub-id>, <pub-id pub-id-type="pmid">24203713</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Liang</surname><given-names>M.</given-names></name> <name><surname>Hu</surname><given-names>X.</given-names></name></person-group> (<year>2015</year>) <chapter-title>Recurrent convolutional neural network for object recognition</chapter-title>, in <conf-name>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name> <fpage>3367</fpage>&#x2013;<lpage>3375</lpage></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lindsay</surname><given-names>G. W.</given-names></name></person-group> (<year>2021</year>). <article-title>Convolutional neural networks as a model of the visual system: past, present, and future</article-title>. <source>J. Cogn. Neurosci.</source> <volume>33</volume>, <fpage>2017</fpage>&#x2013;<lpage>2031</lpage>. doi: <pub-id pub-id-type="doi">10.1162/jocn_a_01544</pub-id>, <pub-id pub-id-type="pmid">32027584</pub-id></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nakamura</surname><given-names>H.</given-names></name> <name><surname>Gattass</surname><given-names>R.</given-names></name> <name><surname>Desimone</surname><given-names>R.</given-names></name> <name><surname>Ungerleider</surname><given-names>L. G.</given-names></name></person-group> (<year>1993</year>). <article-title>The modular organization of projections from areas V1 and V2 to areas V4 and TEO in macaques</article-title>. <source>J. Neurosci.</source> <volume>13</volume>, <fpage>3681</fpage>&#x2013;<lpage>3691</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.13-09-03681.1993</pub-id>, <pub-id pub-id-type="pmid">7690064</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Selvaraju</surname><given-names>R. R.</given-names></name> <name><surname>Cogswell</surname><given-names>M.</given-names></name> <name><surname>Das</surname><given-names>A.</given-names></name> <name><surname>Vedantam</surname><given-names>R.</given-names></name> <name><surname>Parikh</surname><given-names>D.</given-names></name> <name><surname>Batra</surname><given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title>. <source>Int. J. Comput. Vis.</source> <volume>128</volume>, <fpage>336</fpage>&#x2013;<lpage>359</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sherbakov</surname><given-names>L.</given-names></name> <name><surname>Yazdanbakhsh</surname><given-names>A.</given-names></name></person-group> (<year>2013</year>). <article-title>Multiscale sampling model for motion integration</article-title>. <source>J. Vis.</source> <volume>13</volume>:<fpage>18</fpage>. doi: <pub-id pub-id-type="doi">10.1167/13.11.18</pub-id>, <pub-id pub-id-type="pmid">24080519</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Spoerer</surname><given-names>C. J.</given-names></name> <name><surname>McClure</surname><given-names>P.</given-names></name> <name><surname>Kriegeskorte</surname><given-names>N.</given-names></name></person-group> (<year>2017</year>). <article-title>Recurrent convolutional neural networks: a better model of biological object recognition</article-title>. <source>Front. Psychol.</source> <volume>8</volume>:<fpage>1551</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpsyg.2017.01551</pub-id>, <pub-id pub-id-type="pmid">28955272</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Szegedy</surname><given-names>C.</given-names></name> <name><surname>Liu</surname><given-names>Wei</given-names></name> <name><surname>Jia</surname><given-names>Yangqing</given-names></name> <name><surname>Sermanet</surname><given-names>P.</given-names></name> <name><surname>Reed</surname><given-names>S.</given-names></name> <name><surname>Anguelov</surname><given-names>D.</given-names></name> <etal/></person-group> (<year>2015</year>). <chapter-title>Going deeper with convolutions</chapter-title>, in <conf-name>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, (<publisher-loc>Boston, MA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>9</lpage>.</mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Watanabe</surname><given-names>T.</given-names></name> <name><surname>Cavanagh</surname><given-names>P.</given-names></name></person-group> (<year>1993a</year>). <article-title>Surface decomposition accompanying the perception of transparency</article-title>. <source>Spat. Vis.</source> <volume>7</volume>, <fpage>95</fpage>&#x2013;<lpage>111</lpage>. doi: <pub-id pub-id-type="doi">10.1163/156856893X00306</pub-id>, <pub-id pub-id-type="pmid">8347552</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Watanabe</surname><given-names>T.</given-names></name> <name><surname>Cavanagh</surname><given-names>P.</given-names></name></person-group> (<year>1993b</year>). <article-title>Transparent surfaces defined by implicit X junctions</article-title>. <source>Vis. Res.</source> <volume>33</volume>, <fpage>2339</fpage>&#x2013;<lpage>2346</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0042-6989(93)90111-9</pub-id>, <pub-id pub-id-type="pmid">8273298</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yamins</surname><given-names>D. L. K.</given-names></name> <name><surname>Hong</surname><given-names>H.</given-names></name> <name><surname>Cadieu</surname><given-names>C. F.</given-names></name> <name><surname>Solomon</surname><given-names>E. A.</given-names></name> <name><surname>Seibert</surname><given-names>D.</given-names></name> <name><surname>DiCarlo</surname><given-names>J. J.</given-names></name></person-group> (<year>2014</year>). <article-title>Performance-optimized hierarchical models predict neural responses in higher visual cortex</article-title>. <source>Proc. Natl. Acad. Sci. USA</source> <volume>111</volume>, <fpage>8619</fpage>&#x2013;<lpage>8624</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1403112111</pub-id>, <pub-id pub-id-type="pmid">24812127</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yazdanbakhsh</surname><given-names>A.</given-names></name> <name><surname>Gori</surname><given-names>S.</given-names></name></person-group> (<year>2008</year>). <article-title>A new psychophysical estimation of the receptive field size</article-title>. <source>Neurosci. Lett.</source> <volume>438</volume>, <fpage>246</fpage>&#x2013;<lpage>251</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neulet.2008.04.040</pub-id>, <pub-id pub-id-type="pmid">18467028</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yazdanbakhsh</surname><given-names>A.</given-names></name> <name><surname>Gori</surname><given-names>S.</given-names></name></person-group> (<year>2011</year>). <article-title>Mathematical analysis of the accordion grating illusion: a differential geometry approach to introduce the 3D aperture problem</article-title>. <source>Neural Netw.</source> <volume>24</volume>, <fpage>1093</fpage>&#x2013;<lpage>1101</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2011.06.016</pub-id>, <pub-id pub-id-type="pmid">21782387</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ye</surname><given-names>Z.</given-names></name> <name><surname>Wessel</surname><given-names>R.</given-names></name> <name><surname>Franken</surname><given-names>T. P.</given-names></name></person-group> (<year>2025</year>). <article-title>Brain-like border ownership signals support prediction of natural videos</article-title>. <source>iScience</source> <volume>28</volume>:<fpage>112199</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.isci.2025.112199</pub-id>, <pub-id pub-id-type="pmid">40224014</pub-id></mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>N. R.</given-names></name> <name><surname>Von Der Heydt</surname><given-names>R.</given-names></name></person-group> (<year>2010</year>). <article-title>Analysis of the context integration mechanisms underlying figure&#x2013;ground organization in the visual cortex</article-title>. <source>J. Neurosci.</source> <volume>30</volume>, <fpage>6482</fpage>&#x2013;<lpage>6496</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.5168-09.2010</pub-id>, <pub-id pub-id-type="pmid">20463212</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>H.</given-names></name> <name><surname>Friedman</surname><given-names>H. S.</given-names></name> <name><surname>Von Der Heydt</surname><given-names>R.</given-names></name></person-group> (<year>2000</year>). <article-title>Coding of border ownership in monkey visual cortex</article-title>. <source>J. Neurosci.</source> <volume>20</volume>, <fpage>6594</fpage>&#x2013;<lpage>6611</lpage>. doi: <pub-id pub-id-type="doi">10.1523/JNEUROSCI.20-17-06594.2000</pub-id>, <pub-id pub-id-type="pmid">10964965</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0002">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/36095/overview">Birgitta Dresp-Langley</ext-link>, Centre National de la Recherche Scientifique (CNRS), France</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0003">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/36500/overview">Adam James Reeves</ext-link>, Northeastern University, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1966053/overview">Gabriel Byczynski</ext-link>, University of Geneva, Switzerland</p>
</fn>
</fn-group>
</back>
</article>