<?xml version="1.0" encoding="us-ascii"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Earth Sci.</journal-id>
<journal-title>Frontiers in Earth Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Earth Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-6463</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1407173</article-id>
<article-id pub-id-type="doi">10.3389/feart.2024.1407173</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Earth Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep learning for geological mapping in the overburden area</article-title>
<alt-title alt-title-type="left-running-head">Liu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/feart.2024.1407173">10.3389/feart.2024.1407173</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Liu</surname>
<given-names>Yao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2690983/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cheng</surname>
<given-names>Jianyuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>L&#xfc;</surname>
<given-names>Qingtian</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Zaibin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lu</surname>
<given-names>Jingjin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fan</surname>
<given-names>Zhenyu</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2150272/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Lianzhi</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Xi&#x2019;an Research Institute Co. Ltd.</institution>, <institution>China Coal Technology and Engineering Group Corp.</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Chinese Academy of Geological Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>China Aero Geophysical Survey and Remote Sensing Center for Natural Resources</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>School of Earth and Space Sciences</institution>, <institution>Peking University</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/92148/overview">Giovanni Martinelli</ext-link>, National Institute of Geophysics and Volcanology, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1917817/overview">Wenchao Chen</ext-link>, Xi&#x2019;an Jiaotong University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2733495/overview">Sha Song</ext-link>, Chang&#x2019;an University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2736821/overview">Hu Bin</ext-link>, East China University of Technology, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Yao Liu, <email>liuyao2008666@126.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>17</day>
<month>06</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1407173</elocation-id>
<history>
<date date-type="received">
<day>26</day>
<month>03</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>05</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Liu, Cheng, L&#xfc;, Liu, Lu, Fan and Zhang.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Liu, Cheng, L&#xfc;, Liu, Lu, Fan and Zhang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>This paper aims to achieve bedrock geologic mapping in the overburden area using big data, distributed computing, and deep learning techniques. First, the satellite Bouguer gravity anomaly with a resolution of 2&#x2032;&#xd7;2&#x2032; in the range of E66<sup>&#xb0;</sup>-E96<sup>&#xb0;</sup>, N40<sup>&#xb0;</sup>-N55<sup>&#xb0;</sup> and the 1:5000000 Asia-European geological map are used to design a dataset for bedrock prediction. Then, starting from the gravity anomaly formula in the spherical coordinate system, we deduce the nonlinear functional between rock density &#x3c1; and rock mineral composition m, content p, burial depth h, diagenesis time t and other variables. We analyze the feasibility of using a deep neural network to approximate the above nonlinear functional. The problem of solving for the deep neural network parameters is transformed into a non-convex optimization problem. We give an iterative, gradient descent-based solution algorithm for the non-convex optimization problem. Utilizing neural architecture search (NAS) and a human-designed approach, we propose a geological-geophysical mapping network (GGMNet). The dataset for the network consists of both gravity anomaly and <italic>a priori</italic> geological information. The network has fast convergence speed and stable iteration during the training process. It also has better performance than architectures obtained by neural architecture search alone or by human design alone, with the mean pixel accuracy (MAP) &#x3d; 63.1% and the frequency weighted intersection over union (FWIoU) &#x3d; 42.88. Finally, the GGMNet is used to predict the rock distribution of the Junggar Basin.</p>
</abstract>
<kwd-group>
<kwd>satellite gravity anomaly</kwd>
<kwd>deep learning</kwd>
<kwd>convolutional neural networks</kwd>
<kwd>geological mapping</kwd>
<kwd>Junggar basin</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Solid Earth Geophysics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Cenozoic loose sediments mask the geological information of the underlying bedrock. Using geophysical detection as the forerunner, combined with the constraints of prior geological-geophysical information, the overburden can be well stripped, revealing deep hidden structures and bedrock (<xref ref-type="bibr" rid="B7">Deng et al., 2019</xref>). When using the Bouguer gravity anomaly to map the bedrock in the overburden area, it is necessary to separate the gravity anomaly to obtain the residual field of the target depth. We use edge detection technology to obtain the physical boundary of the residual Bouguer gravity anomaly. Then, the interpreter combines the existing geological prior information and previous interpretation experience to screen each physical property boundary one by one, and infers the corresponding geological body boundary, stratigraphic age, lithology, etc. This problem can be summed up in two steps: accurately describing the outline of the geological body, and determining which stratigraphic age and lithology the geological body belongs to. The method requires a very high level of geological background knowledge from the interpreter. Due to the limitation of the amount of data, it is difficult to integrate the geological and geophysical data of the entire area.</p>
<p>The successful application of artificial intelligence technology in the field of machine vision provides us with a new research idea. Image semantic segmentation is performing a similar task: segmenting the target and then classifying the resulting object at the pixel level. This paper aims to propose an end-to-end convolutional neural network for overburden area mapping using satellite gravity big data and large-scale regional geological information.</p>
</sec>
<sec id="s2">
<title>2 Geological setting</title>
<p>The Junggar Basin and its surrounding basin-mountain belt are located in the triangle zone of the Kazakhstan plate, the Siberian plate and the Tarim plate. Since the Paleozoic Era, it has undergone tectonic evolutionary processes such as oceanic expansion, subduction and decay of the oceanic crust of the ancient ocean basin, collision, and intraplate movement (<xref ref-type="bibr" rid="B18">Jinyi, 2004</xref>; <xref ref-type="bibr" rid="B31">Wenjiao et al., 2006</xref>; <xref ref-type="bibr" rid="B17">Jian et al., 2014</xref>; <xref ref-type="bibr" rid="B23">Luo et al., 2016</xref>). Most existing research has focused on the well-exposed bedrock areas in East and West Junggar. The hinterland of the basin and the shallow cover area between East and West Junggar, which is covered by Middle-Cenozoic loose sediments, have a relatively low level of basic geological work. The lack of basic geological data has caused some important basic geological issues to remain unresolved. Therefore, this paper selects the Junggar Basin as the study area and predicts the overburden area in the hinterland of the Basin through a deep neural network, so as to provide a reference for subsequent studies.</p>
</sec>
<sec id="s3">
<title>3 Methods and data</title>
<sec id="s3-1">
<title>3.1 Theory</title>
<p>Gravity anomalies are closely related to the density and spatial distribution of the earth&#x2019;s internal matter. <xref ref-type="bibr" rid="B14">Heck et al. (2007)</xref> proposed that in the spherical coordinate system the gravity can be expressed as:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>&#x3c1;</mml:mi>
<mml:munder>
<mml:mo>&#x222d;</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
<mml:msup>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="script">l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the Euclidean distance between the observation point P (&#x3c6;, &#x3bb;, r) and the source point Q (&#x3c6;&#x2032;, &#x3bb;&#x2032;, r&#x2032;); and <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the angle between the position vectors of P and Q; G is the gravitational constant, &#x3c1; is the density, &#x3c6; is the latitude, &#x3bb; is the longitude, and r is the radial distance.<disp-formula id="e2">
<mml:math id="m4">
<mml:mrow>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m5">
<mml:mrow>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>sin</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>sin</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>cos</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>For<disp-formula id="e4">
<mml:math id="m6">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>cos</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
<mml:msup>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>we have <disp-formula id="e5">
<mml:math id="m7">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>&#x3c1;</mml:mi>
<mml:munder>
<mml:mo>&#x222d;</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>u</mml:mi>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>Density &#x3c1; can be expressed as a nonlinear function of variables such as rock mineral composition m, content p, burial depth h, rock formation time t, etc.<disp-formula id="e6">
<mml:math id="m8">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>Inserting Eq. <xref ref-type="disp-formula" rid="e6">6</xref> into Eq. <xref ref-type="disp-formula" rid="e5">5</xref> yields<disp-formula id="e7">
<mml:math id="m9">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:munder>
<mml:mo>&#x222d;</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>u</mml:mi>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>r</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>&#x3c6;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:msup>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>The above integral equation can be abstracted as the following nonlinear functional:<disp-formula id="e8">
<mml:math id="m10">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where u is related to the spatial location, g is the gravity anomaly, and v is related to the rock properties. This nonlinear functional defines a nonlinear mapping relationship from the spatial location and gravity anomaly to lithology.</p>
<p>According to the universal approximation theorem (<xref ref-type="bibr" rid="B5">Cybenko, 1989</xref>; <xref ref-type="bibr" rid="B15">Hornik et al., 1989</xref>), a feedforward neural network with a sufficient number of hidden units and a nonlinear activation function can approximate any Borel measurable function from one finite-dimensional space to another with arbitrary accuracy. In other words, a deep convolutional neural network can be used to approximate the mapping defined in Eq. <xref ref-type="disp-formula" rid="e8">8</xref>.</p>
<p>In general, a convolutional neural network of depth n can be represented as:<disp-formula id="e9">
<mml:math id="m11">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#x22ef;</mml:mo>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>The data sample x (gravity anomaly) is fed into a cascading n-layer nonlinear transform network to obtain the desired output y (lithologic distribution). The parameter &#x3b8; is the learning parameter of this nonlinear transformation. We use an optimization algorithm to find &#x3b8; so that the neural network best approximates the mapping defined in Eq. <xref ref-type="disp-formula" rid="e8">8</xref>.</p>
</sec>
<sec id="s3-2">
<title>3.2 Gradient-based learning</title>
<p>The parametric model y &#x3d; f (x; <italic>&#x3b8;</italic>) defines a conditional probability distribution p (y &#x7c; x; &#x3b8;). We use the principle of maximum likelihood to estimate it. The maximum likelihood estimator for <italic>&#x3b8;</italic> is then defined as<disp-formula id="e10">
<mml:math id="m12">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mi>argmax</mml:mi>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>model</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="double-struck">Y</mml:mi>
<mml:mo mathvariant="double-struck">&#x7c;</mml:mo>
<mml:mi mathvariant="double-struck">X</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mspace width="1.7em"/>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mi>argmax</mml:mi>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x220f;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>m</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>model</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x7c;</mml:mo>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>where <inline-formula id="inf3">
<mml:math id="m13">
<mml:mrow>
<mml:mi mathvariant="double-struck">X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is a set of m examples. The p<sub>model</sub> (y &#x7c; x; <italic>&#x3b8;</italic>) is a parametric family of probability distributions indexed by <italic>&#x3b8;</italic> and it maps any configuration x to a real number estimating the true probability p<sub>data</sub> (y &#x7c; x).</p>
<p>The product of many probabilities is prone to numerical underflow and is therefore difficult to compute. Taking the logarithm of the likelihood does not change its arg max, but conveniently transforms the product into a summation:<disp-formula id="e11">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mi>argmax</mml:mi>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>m</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>model</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x7c;</mml:mo>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>Dividing by m, we obtain the expectation with respect to the empirical distribution defined by the training data as the estimation criterion.<disp-formula id="e12">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mi>argmax</mml:mi>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>model</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>Deep neural network learning estimates the parameter &#x3b8; using the principle of maximum likelihood. The essence of this optimization problem is to maximize the log-likelihood, that is, to minimize the negative log-likelihood, which is equivalent to minimizing the cross entropy between the empirical distribution defined by the training set and the probability distribution defined by the model (<xref ref-type="bibr" rid="B10">Goodfellow et al., 2016</xref>).</p>
<p>The cost function is given by<disp-formula id="e13">
<mml:math id="m16">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>model</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
<p>In order to enhance the generalization ability of the neural network and avoid overfitting during the optimization, we add a parameter regularization term to the cost function to obtain a new objective function:<disp-formula id="e14">
<mml:math id="m17">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>J</mml:mi>
<mml:mo>&#x223c;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>model</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi mathvariant="normal">&#x3a9;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>The problem of minimizing the objective function is a nonconvex optimization problem. This means that we cannot accurately obtain the global optimal solution of the problem. Therefore, deep neural network training uses an iterative, gradient-based optimization method to obtain a local optimal solution that makes the objective function sufficiently small. The stochastic gradient descent (SGD) algorithm is employed to solve the above nonconvex optimization problem.</p>
<p>We decompose the cross-entropy cost function as a sum over training examples of some per-example loss function.<disp-formula id="e15">
<mml:math id="m18">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>p</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mtext>model</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>m</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mtext>model</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>For this additive cost function, the gradient of the cross-entropy cost function is:<disp-formula id="e16">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mo>&#x2207;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>m</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mo>&#x2207;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mtext>model</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>The above gradient is an expectation that we can approximately estimate using a small set of samples. On each step of the SGD algorithm, we can sample a minibatch of examples <inline-formula id="inf4">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="normal">B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>m</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> drawn randomly from the training set. The estimate of the gradient is formed as<disp-formula id="e17">
<mml:math id="m21">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>m</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mo>&#x2207;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msup>
<mml:mi>m</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mtext>model</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
</p>
<p>The stochastic gradient descent algorithm then follows the estimated gradient downhill:<disp-formula id="e18">
<mml:math id="m22">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b5;</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>where &#x3b5; is the learning rate.</p>
<p>The stochastic gradient descent algorithm is sometimes very slow or unreliable in the learning process. The method of stochastic gradient descent with momentum (SGD with momentum) is designed to accelerate learning (<xref ref-type="bibr" rid="B26">Robbins and Monro, 1951</xref>). SGD with momentum can avoid getting trapped at saddle points (<xref ref-type="bibr" rid="B21">Lee et al., 2016</xref>) and improve network generalization performance (<xref ref-type="bibr" rid="B11">Hardt et al., 2015</xref>; <xref ref-type="bibr" rid="B32">Wilson et al., 2017</xref>). However, SGD scales the gradient uniformly in all directions to determine the descent step size, which can be particularly harmful for ill-conditioned problems. Therefore, the learning rate of SGD needs to be modified frequently according to the actual situation. To address this issue, adaptive methods such as Adam (<xref ref-type="bibr" rid="B20">Kingma and Ba, 2015</xref>), Adagrad (<xref ref-type="bibr" rid="B8">Duchi et al., 2011</xref>), and RMSprop (<xref ref-type="bibr" rid="B30">Tieleman and Hinton, 2012</xref>), which automatically adjust the learning rate during training, have been proposed.</p>
<p>Although the convergence speed and generalization ability of Adam and other adaptive methods are better than those of SGD in the initial stage of training, their performance stagnates in the later stage of convergence. A more natural strategy is to start the training with the Adam algorithm, which allows the model to converge quickly, and then switch to SGD with momentum when appropriate (<xref ref-type="bibr" rid="B19">Keskar and Socher, 2017</xref>).</p>
</sec>
<sec id="s3-3">
<title>3.3 Datasets</title>
<p>The satellite Bouguer gravity data are downloaded from the website (<ext-link ext-link-type="uri" xlink:href="https://bgi.obs-mip.fr/data-products/grids-and-models/wgm2012-global-model/">https://bgi.obs-mip.fr/data-products/grids-and-models/wgm2012-global-model/</ext-link>) with the resolution of 2&#x2032;&#xd7;2&#x2032; and the range of E65<sup>&#xb0;</sup>-95<sup>&#xb0;</sup>, N40<sup>&#xb0;</sup>-55<sup>&#xb0;</sup>. We obtained the regional gravity anomaly by upward continuing the satellite Bouguer gravity anomaly (<xref ref-type="fig" rid="F1">Figure 1A</xref>) to 10 km. We subtract the regional field from the total field to obtain the residual gravity anomaly (<xref ref-type="fig" rid="F1">Figure 1B</xref>). Considering that the resolution of the satellite Bouguer gravity anomaly data used in this paper is 2&#x2032;&#xd7;2&#x2032;, the matching 1:5000000 Asia-European geological map (<xref ref-type="fig" rid="F2">Figure 2</xref>) is used as <italic>a priori</italic> information for data annotation. In order to accurately depict the boundary contours of the geologic body and accurately classify it by stratum, we annotate the training data at the pixel level. The labeling map adopts the index map mode. The specific categories and index values are shown in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>
<bold>(A)</bold> The satellite Bouguer gravity data of the study area with the resolution of 2&#x2032;&#xd7;2&#x2032; and the range of E65&#xb0;-95&#xb0;, N40&#xb0;-55&#xb0;. <bold>(B)</bold> The residual gravity anomaly of the study area: The regional gravity field is obtained by upward continuing the satellite Bouguer gravity data to a height of 10 km. The residual gravity anomaly is subsequently calculated by subtracting the regional gravity field from the original satellite Bouguer gravity anomaly.</p>
</caption>
<graphic xlink:href="feart-12-1407173-g001.tif"/>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The data annotation of the study area: Considering that the resolution of the satellite Bouguer gravity anomaly data used in this paper is 2&#x2032;&#xd7;2&#x2032;, the matching 1:5000000 Asia-European geological map is used as <italic>a priori</italic> information for data annotation.</p>
</caption>
<graphic xlink:href="feart-12-1407173-g002.tif"/>
</fig>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Category and index value of annotations.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Stratum/Lithology</th>
<th align="center">Archean (Ar)</th>
<th align="center">Proterozoic (Pt)</th>
<th align="center">Cambrian (&#x2208;)</th>
<th align="center">Ordovician (O)</th>
<th align="center">Silurian (S)</th>
<th align="center">Devonian (D)</th>
<th align="center">Carboniferous (C)</th>
<th align="center">Permian (P)</th>
<th align="center">Jurassic (J)</th>
<th align="center">Granite</th>
<th align="center">Cenozoic (Q,N,E)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">index</td>
<td align="center">0</td>
<td align="center">1</td>
<td align="center">2</td>
<td align="center">3</td>
<td align="center">4</td>
<td align="center">5</td>
<td align="center">6</td>
<td align="center">7</td>
<td align="center">8</td>
<td align="center">9</td>
<td align="center">255</td>
</tr>
<tr>
<td align="center">R</td>
<td align="center">255</td>
<td align="center">177</td>
<td align="center">4</td>
<td align="center">126</td>
<td align="center">152</td>
<td align="center">166</td>
<td align="center">179</td>
<td align="center">227</td>
<td align="center">255</td>
<td align="center">73</td>
<td align="center">218</td>
</tr>
<tr>
<td align="center">G</td>
<td align="center">178</td>
<td align="center">153</td>
<td align="center">249</td>
<td align="center">254</td>
<td align="center">215</td>
<td align="center">76</td>
<td align="center">218</td>
<td align="center">178</td>
<td align="center">247</td>
<td align="center">251</td>
<td align="center">76</td>
</tr>
<tr>
<td align="center">B</td>
<td align="center">255</td>
<td align="center">99</td>
<td align="center">7</td>
<td align="center">127</td>
<td align="center">29</td>
<td align="center">76</td>
<td align="center">217</td>
<td align="center">28</td>
<td align="center">102</td>
<td align="center">250</td>
<td align="center">250</td>
</tr>
<tr>
<td align="center">Percentage (%)</td>
<td align="center">0.6</td>
<td align="center">7.2</td>
<td align="center">10.3</td>
<td align="center">4</td>
<td align="center">7</td>
<td align="center">20.3</td>
<td align="center">30.9</td>
<td align="center">2.1</td>
<td align="center">2.3</td>
<td align="center">15.3</td>
<td align="center">&#x2014;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In order to take into account the semantic segmentation accuracy of both small and large targets and provide more global semantic information to the network, we use a multi-scale sliding window clipping method (<xref ref-type="table" rid="T2">Table 2</xref>) to clip the data of the entire region. In order to preserve the details of the residual gravity anomalies, all image sizes are 2048&#xd7;2048 pixels. A total of 61 samples in which bedrock outcrops cover less than 10% of the total map area were used as the test set. Of the 475 effective samples participating in network training, 15% are randomly selected as the validation set and the remainder form the training set.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Multi-scale sliding window clipping method.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Crop window size</th>
<th align="center">Sliding interval</th>
<th align="center">Number of samples</th>
<th align="center">Image pixel size</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">1<sup>&#xb0;</sup>&#xd7;1<sup>&#xb0;</sup>
</td>
<td align="center">1<sup>&#xb0;</sup>
</td>
<td align="center">450</td>
<td align="center">2048&#xd7;2048</td>
</tr>
<tr>
<td align="center">5<sup>&#xb0;</sup>&#xd7;5<sup>&#xb0;</sup>
</td>
<td align="center">2.5<sup>&#xb0;</sup>
</td>
<td align="center">55</td>
<td align="center">2048&#xd7;2048</td>
</tr>
<tr>
<td align="center">10<sup>&#xb0;</sup>&#xd7;10<sup>&#xb0;</sup>
</td>
<td align="center">2.5<sup>&#xb0;</sup>
</td>
<td align="center">27</td>
<td align="center">2048&#xd7;2048</td>
</tr>
<tr>
<td align="center">15<sup>&#xb0;</sup>&#xd7;15<sup>&#xb0;</sup>
</td>
<td align="center">5<sup>&#xb0;</sup>
</td>
<td align="center">4</td>
<td align="center">2048&#xd7;2048</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<title>4 Deep convolutional neural network architecture</title>
<p>General semantic segmentation networks mostly pre-train a backbone on the ImageNet dataset as a feature extractor to obtain the feature map of the image, followed by a feature fusion module and a semantic segmentation head to achieve pixel-level segmentation. The shape of the satellite gravity anomaly corresponding to each lithology is not the same, but the amplitude is within a certain range. Our network should therefore segment the image not on the basis of the outline of the target, but rather on the commonality of colors within the same category and the relative position relationship between different categories. For this reason, we cannot directly use an existing semantic segmentation network, but should design a dedicated network for the task data set.</p>
<p>Manually designing deep neural networks involves the selection of hyperparameters such as the depth of the hidden layer, the width of the network, and the downsampling rate, which is extremely challenging. Neural architecture search (NAS) can help us solve the above problems. Combining neural architecture search with manual design, we designed a geological-geophysical mapping network (GGMNet) (<xref ref-type="fig" rid="F3">Figure 3A</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>
<bold>(A)</bold> GGMNet Network architecture consists of five modules: A feature encoder to encode the input 3-channel RGB image into a high-dimensional feature map. A multi-resolution feature extraction module (MRFEM) is obtained by searching directly on the target dataset to extract coarse and fine features. Attention Refinement Module is used to refine the features of each stage. A Feature Fusion Module to fuse the features of the three paths. The semantic segmentation module is a matrix with the same size as the original image, and the number of channels is equal to the number of categories. <bold>(B)</bold> Attention Refinement Module (ARM) employs global average pooling to remove the redundant information of the feature map. <bold>(C)</bold> Feature Fusion Module: We concatenate branch outputs, apply batch normalization for scale balance, transform to a feature vector via global pooling, and compute a 1&#xd7;1 convolution-based weight vector for feature re-weighting, effectively selecting and combining features.</p>
</caption>
<graphic xlink:href="feart-12-1407173-g003.tif"/>
</fig>
<sec id="s4-1">
<title>4.1 Feature encoder</title>
<p>The feature encoder uses a convolution operation to encode the input 3-channel RGB image into a high-dimensional feature map with 1/8 pixel size and 96 channels (<xref ref-type="bibr" rid="B24">Oktay et al., 2018</xref>). It consists of three convolution modules, each of which contains a convolution with the kernel size 3&#xd7;3 and stride &#x3d; 2, followed by a batch normalization layer (BN) (<xref ref-type="bibr" rid="B16">Ioffe et al., 2015</xref>) and a rectified linear unit (ReLU).</p>
</sec>
<sec id="s4-2">
<title>4.2 Multi-resolution feature extraction module</title>
<p>In semantic segmentation tasks, spatial location information, contextual semantic information, and receptive fields are crucial for segmentation accuracy. Increasing the network depth yields better contextual semantic information. The skip connections (<xref ref-type="bibr" rid="B12">He et al., 2015</xref>; <xref ref-type="bibr" rid="B13">2016</xref>) can enrich spatial location information. The network depth, the size of the convolution kernel, and the position of the skip connections will affect the receptive field of the feature map used for segmentation. Increasing the width of the network can improve the receptive field of the feature map; at the same time, however, the number of network parameters also increases dramatically. With limited data sets, it is difficult to train a large network with strong generalization capabilities. It is a challenging task to make the network have rich contextual semantic information, precise spatial location information, and sufficient receptive field, which is the ultimate goal of network design.</p>
<p>With the increase of GPU computing power, neural architecture search algorithms are more and more widely used. EfficientNet (<xref ref-type="bibr" rid="B29">Tan et al., 2019</xref>), MobileNetV2 (<xref ref-type="bibr" rid="B28">Sandler et al., 2018</xref>), and Auto-DeepLab (<xref ref-type="bibr" rid="B22">Liu et al., 2020</xref>) are all designed using neural architecture search algorithms. They have achieved good performance in image classification, detection, segmentation and other tasks. We adopt the neural architecture search algorithm (<xref ref-type="bibr" rid="B36">Zoph and Quoc, 2016</xref>; <xref ref-type="bibr" rid="B2">Brock et al., 2017</xref>; <xref ref-type="bibr" rid="B1">Bender et al., 2018</xref>; <xref ref-type="bibr" rid="B25">Pham et al., 2018</xref>; <xref ref-type="bibr" rid="B9">Gong et al., 2019</xref>) to design the multi-resolution feature extraction module. Inspired by <xref ref-type="bibr" rid="B4">Chen, (2019)</xref>, a multi-resolution feature extraction module (MRFEM) is obtained by searching directly on the target dataset. In order to take into account both the experience of manual design and the flexibility of neural architecture search, we designed a search space consisting of 5 operators (<xref ref-type="table" rid="T3">Table 3</xref>).</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Search space and operators.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Search space</th>
<th align="center">Operators</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Skip Connections</td>
<td align="center">Identity</td>
</tr>
<tr>
<td align="center">3&#xd7;3Conv</td>
<td align="center">Conv2d (3&#xd7;3) &#x2b; BatchNorm2d &#x2b; ReLU</td>
</tr>
<tr>
<td align="center">5&#xd7;5Conv</td>
<td align="center">Conv2d (5&#xd7;5) &#x2b; BatchNorm2d &#x2b; ReLU</td>
</tr>
<tr>
<td align="center">zoomed 3&#xd7;3Conv</td>
<td align="center">bilinear downsampling &#x2b; 3&#xd7;3Conv &#x2b; bilinear upsampling</td>
</tr>
<tr>
<td align="center">zoomed 5&#xd7;5Conv</td>
<td align="center">bilinear downsampling &#x2b; 5&#xd7;5Conv &#x2b; bilinear upsampling</td>
</tr>
<tr>
<td align="center">Downsampling with Conv2d (3&#xd7;3)</td>
<td align="center">Conv2d (3&#xd7;3) with stride of 2 &#x2b; BatchNorm2d &#x2b; ReLU</td>
</tr>
<tr>
<td align="center">Pooling</td>
<td align="center">Maxpool</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The zoomed convolution, proposed by <xref ref-type="bibr" rid="B4">Chen, (2019)</xref>, reduces the size of the input feature map by bilinear downsampling, followed by a convolution operation, and finally restores the output to the original input size by bilinear upsampling. Compared with a standard convolution, this special design has a lower computational cost and a receptive field that is twice as large.</p>
<p>During the neural network search, we fix the network depth of a multi-resolution branch and simultaneously search the paths of three resolution branches with the sampling rates of 1/8, 1/16, and 1/32. For each layer of a single branch, the expansion rate of the network width can be any value in {2, 4, 6, 8}. The operation type can be any one in the search space, and the position of the skip connections can be selected arbitrarily. Since we are more concerned with the accuracy of network segmentation, we use weighted bootstrapped cross-entropy loss function in the search process.</p>
</sec>
<sec id="s4-3">
<title>4.3 Attention refinement module</title>
<p>In order to provide the maximum receptive field with global context information, we use Attention Refinement Module (<xref ref-type="bibr" rid="B35">Yu et al., 2018</xref>) to refine the features of each stage. Attention Refinement Module (ARM) employs global average pooling to remove the redundant information of the feature map (<xref ref-type="fig" rid="F3">Figure 3B</xref>). It extracts the global context semantic information through the convolution operation with a kernel size of 1&#xd7;1. The attention vector is calculated through the sigmoid function which is merged into the output to guide the feature learning during the training process.</p>
</sec>
<sec id="s4-4">
<title>4.4 Feature fusion module</title>
<p>The features of the three paths are different in level of feature representation. The 1/8 resolution branch represents the relatively macroscopic and superficial semantic information, while the 1/16 and 1/32 resolution branches focus on high-level semantic information such as microscopic details and inter-pixel relationships. Therefore, we cannot simply sum up these features. Moreover, we also need to introduce spatial position information for each resolution branch to achieve precise pixel positioning. We employ the skip connection to integrate the original image information into the output feature map, forming a new feature map that contains rich semantic information and accurate spatial location information. Therefore, we employ a specific Feature Fusion Module (<xref ref-type="fig" rid="F3">Figure 3C</xref>) (<xref ref-type="bibr" rid="B35">Yu et al., 2018</xref>) to fuse these features.</p>
<p>We first concatenate the output features of each branch and then utilize the batch normalization to balance the scales of the features. Next, we use global pooling to transform the concatenated feature to a feature vector. We compute a weight vector through the convolution operation with a kernel size of 1&#xd7;1. The weight vector can re-weight the features, which amounts to feature selection and combination.</p>
</sec>
<sec id="s4-5">
<title>4.5 Semantic segmentation head</title>
<p>The output of the semantic segmentation module is a matrix with the same size as the original image, and the number of channels is equal to the number of categories. Each element of the matrix stores the category of the current pixel. A bilinear up-sampling of the feature map after feature fusion is required to restore its size to the original map. We bilinearly upsample the 1/8 resolution feature map to 1/4 size, and cascade it with the original 1/4 resolution feature map through a long skip connection. The feature map is sequentially processed with the deformable convolution (<xref ref-type="bibr" rid="B6">Dai et al., 2017</xref>) with a kernel size of 3&#xd7;3, batch normalization layer (BN) and rectified linear unit (ReLU). By doing the same operation on the feature map with 1/4 and 1/2 resolution, the final feature map with the same size as the original pixel is obtained. The network output is converted to pixel classification results by a 1&#xd7;1 convolution layer.</p>
</sec>
</sec>
<sec id="s5">
<title>5 Experiments and discussion</title>
<p>In all experiments, we use NVIDIA GeForce RTX 2080 Ti GPUs, CUDA 10.0, and cuDNN v7. The deep learning framework is PyTorch 1.4.0. First, we introduce the implementation details of GGMNet and the evaluation strategy. We then conduct ablation experiments to study the contribution of each component to the network performance. Finally, we compare GGMNet with other well-performing networks.</p>
<sec id="s5-1">
<title>5.1 Implementation details</title>
<p>We trained all the models directly on the target data set without pre-training. We initialized the network weight parameters by random initialization. We use 4 GPUs for parallel training, with a batch size of 4 per GPU, and train for 600 epochs. The specific training details are as follows.</p>
<sec id="s5-1-1">
<title>5.1.1 Data augmentation</title>
<p>In general, successful neural networks have millions of parameters. It needs a large amount of data to drive the optimization of the network parameters. In reality, there is not as much data as we need. When the training sample is limited, we will adopt a data augmentation strategy during neural network training. Data augmentation can increase the data diversity, prevent overfitting of the training process, and make the neural network robust and generalizable. In this paper, we use random cropping (1,024&#xd7;1,024), random Gaussian noise (1%&#x2013;10%), random flip (horizontal, vertical), random rotation (0&#xb0;&#x2013;180&#xb0;), random translation, random contrast increase or decrease (lower&#x3d;0.5, upper&#x3d;1.5), and random brightness variation (lower&#x3d;0.8, upper&#x3d;1.2).</p>
</sec>
<sec id="s5-1-2">
<title>5.1.2 Loss function</title>
<p>The proportions of the various strata vary considerably in the actual geological problem (<xref ref-type="table" rid="T1">Table 1</xref>). It leads to a serious category imbalance in the training data. We adopt the weighted bootstrapped cross-entropy loss function to solve the category imbalance problem (<xref ref-type="bibr" rid="B33">Wu et al., 2016</xref>; <xref ref-type="bibr" rid="B3">Bulo et al., 2017</xref>; <xref ref-type="bibr" rid="B34">Yang et al., 2019</xref>). We calculate the category weight w<sub>i</sub> based on the proportion of each category in the dataset. Then, we obtain the weighted cross-entropy loss for each pixel and we sort the pixels based on the cross-entropy loss. We only backpropagate the errors in the top-K positions (hard example mining). We set K &#x3d; 0.15 N, where <italic>N</italic> is the total number of pixels in the image. Moreover, we weigh the pixel loss based on instance sizes, putting more emphasis on small instances. Specifically, the weighted bootstrapped cross-entropy loss is defined by:<disp-formula id="e19">
<mml:math id="m23">
<mml:mrow>
<mml:mi mathvariant="script">l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3c;</mml:mo>
<mml:msub>
<mml:mi>t</mml:mi>
<mml:mi>K</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>where <italic>y</italic>
<sub>
<italic>i</italic>
</sub> is the target class label for pixel <italic>i</italic>, <italic>p</italic>
<sub>
<italic>i,yi</italic>
</sub> is the predicted posterior probability for pixel <italic>i</italic> and class <italic>y</italic>
<sub>
<italic>i</italic>
</sub>, and 1{<italic>x</italic>} &#x3d; 1 if <italic>x</italic> is true and 0 otherwise. The threshold <italic>t</italic>
<sub>
<italic>K</italic>
</sub> is the posterior probability of the top-K pixel in descending order according to the loss function.</p>
</sec>
<sec id="s5-1-3">
<title>5.1.3 Learning rate policy</title>
<p>We train the neural network by using both Adam and SGD with momentum. When using the Adam algorithm, we set the initial learning rate lr &#x3d; 0.001 and the learning rate is adjusted adaptively during the training. When switching to the SGD algorithm with momentum, the WarmupMultiStepLR learning rate policy is used, with the initial learning rate set to the final learning rate of the previous phase, momentum &#x3d; 0.9, and weight_decay &#x3d; 5e-4. Warmup is performed for the first 5 epochs, after which the learning rate is decreased by a factor of 10 every 50 epochs.</p>
</sec>
</sec>
<sec id="s5-2">
<title>5.2 Evaluation metrics</title>
<p>The problem of bedrock prediction in covered areas, which is the focus of this article, can be formulated as a semantic segmentation task. The metrics commonly employed to evaluate performance include Mean Pixel Accuracy (MPA) and Intersection over Union (IoU). Given the category imbalance present in our dataset, we additionally use Frequency Weighted Intersection over Union (FWIoU), alongside MPA, to assess the network&#x2019;s predictive performance more objectively.</p>
<p>Mean Pixel Accuracy is the ratio of the number of correctly classified pixels to the total number of pixels of each category, averaged over all categories.<disp-formula id="e20">
<mml:math id="m24">
<mml:mrow>
<mml:mtext>MPA</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>
</p>
<p>Here, k is the total number of categories, and p<sub>ij</sub> is the number of pixels of class i that are predicted to be class j.</p>
<p>FWIoU is the weighted sum of the IoU<sub>i</sub> of each category with its weight w<sub>i</sub>, where w<sub>i</sub> is calculated according to the frequency of each class in the data set.<disp-formula id="e21">
<mml:math id="m25">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>area</mml:mtext>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>ground truth</mml:mtext>
<mml:mo>&#x2229;</mml:mo>
<mml:mtext>prediction</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mtext>area</mml:mtext>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>ground truth</mml:mtext>
<mml:mo>&#x222a;</mml:mo>
<mml:mtext>prediction</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>
<disp-formula id="e22">
<mml:math id="m26">
<mml:mrow>
<mml:mtext>FWIoU</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mtext>IoU</mml:mtext>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>
</p>
<p>Under the aforementioned training strategy and initial conditions, the proposed GGMNet behaves stably during the training process (<xref ref-type="fig" rid="F4">Figure 4</xref>). At the beginning of training, the Adam algorithm adaptively adjusts the learning rate, the network converges rapidly, the loss function curve decreases linearly (<xref ref-type="fig" rid="F4">Figure 4A</xref>), and the frequency-weighted intersection over union (FWIoU) curve rises rapidly (<xref ref-type="fig" rid="F4">Figure 4B</xref>). When the loss function curve tends to flatten, the optimization algorithm switches to the SGD algorithm with momentum. After many further iterations, the FWIoU curve levels off, which means that the network performance is almost saturated; at this point, network training is terminated.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>
<bold>(A)</bold> Loss curves of GGMNet on training and validation sets. <bold>(B)</bold> The FWIoU curve of GGMNet on training and validation sets.</p>
</caption>
<graphic xlink:href="feart-12-1407173-g004.tif"/>
</fig>
</sec>
<sec id="s5-3">
<title>5.3 Results and discussion</title>
<sec id="s5-3-1">
<title>5.3.1 Comparison with different depths of the MRFEM</title>
<p>We study the effect of the depth of the multi-resolution feature extraction module on the performance of the neural network, with fixed parameters for the other modules of the network. We set the initial depth to 16, and then increase the depth in increments of 2 until the depth is 24. The experimental results (<xref ref-type="table" rid="T4">Table 4</xref>) demonstrate that in the depth range of 16&#x2013;22, the number of parameters in the network increases with depth, and its feature expression ability increases. At this time, the network is able to extract richer feature information, thus significantly improving the network performance. When the network parameters are already sufficient to characterize the dataset, the increase in depth leads to an excess of network parameters. In this case, the dataset cannot drive the network to learn sufficiently, resulting in network underfitting. Therefore, we set the depth of the multi-resolution feature extraction module to 22 layers.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>The effect of the depth of the MRFEM on the performance of the neural network.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Depth of MRFEM</th>
<th align="center">Parameters (M)</th>
<th align="center">MPA (%)</th>
<th align="center">FWIoU (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">16</td>
<td align="center">4.7</td>
<td align="center">50.9</td>
<td align="center">36.8</td>
</tr>
<tr>
<td align="center">18</td>
<td align="center">6.5</td>
<td align="center">55.3</td>
<td align="center">38.33</td>
</tr>
<tr>
<td align="center">20</td>
<td align="center">8.6</td>
<td align="center">59.8</td>
<td align="center">40.31</td>
</tr>
<tr>
<td align="center">22</td>
<td align="center">10.3</td>
<td align="center">63.1</td>
<td align="center">42.88</td>
</tr>
<tr>
<td align="center">24</td>
<td align="center">12.4</td>
<td align="center">61.5</td>
<td align="center">41.12</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5-3-2">
<title>5.3.2 Ablation study for each component in GGMNet</title>
<p>In this subsection, we investigate in detail, step by step, the effect of each component of the proposed GGMNet. We use the same training strategy to train five control networks for 600 epochs and evaluate the performance of each one on the validation set. We use the U-shape structure (<xref ref-type="bibr" rid="B27">Ronneberger et al., 2015</xref>) as our baseline (<xref ref-type="table" rid="T5">Table 5a</xref>), in which the features are sequentially processed by the Encoder, MRFEM, Decoder and a 1&#xd7;1 convolution layer. Networks b&#x2013;d add modules in sequence to the baseline.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Ablation study for each component in GGMNet.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">Attention refinement module</th>
<th align="center">Feature fusion module</th>
<th align="center">Deformable convolution</th>
<th align="center">Residual connections</th>
<th align="center">MPA (%)</th>
<th align="center">FWIoU (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">a</td>
<td align="left"/>
<td align="left"/>
<td align="left"/>
<td align="left"/>
<td align="center">48.38</td>
<td align="center">32.3</td>
</tr>
<tr>
<td align="center">b</td>
<td align="center">&#x221a;</td>
<td align="left"/>
<td align="left"/>
<td align="left"/>
<td align="center">52.5</td>
<td align="center">36.5</td>
</tr>
<tr>
<td align="center">c</td>
<td align="center">&#x221a;</td>
<td align="center">&#x221a;</td>
<td align="left"/>
<td align="left"/>
<td align="center">56.2</td>
<td align="center">38.1</td>
</tr>
<tr>
<td align="center">d</td>
<td align="center">&#x221a;</td>
<td align="center">&#x221a;</td>
<td align="center">&#x221a;</td>
<td align="left"/>
<td align="center">60.4</td>
<td align="center">40.3</td>
</tr>
<tr>
<td align="center">ours</td>
<td align="center">&#x221a;</td>
<td align="center">&#x221a;</td>
<td align="center">&#x221a;</td>
<td align="center">&#x221a;</td>
<td align="center">63.1</td>
<td align="center">42.88</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental results (<xref ref-type="table" rid="T5">Table 5</xref>) demonstrate that the addition of the ARM increases the MPA from 48.38% to 52.5%, and the FWIoU from 32.3% to 36.5%. The FFM enables the network to effectively integrate features of different resolutions to achieve multi-scale prediction, with the MPA increased by 3.7 percentage points and the FWIoU increased by 1.6 percentage points. With the addition of deformable convolution, the network&#x2019;s receptive field is further enlarged, and the description of irregular boundary contours is more accurate. The GGMNet&#x2019;s performance is greatly improved, with the MPA increased by 4.2 percentage points and the FWIoU increased by 2.2 percentage points. The introduction of residual connections allows the network to focus on the residuals between input and output during the learning process, without having to fit a large amount of redundant information. It also releases the learning ability of the network, making the network training more stable and faster.</p>
</sec>
<sec id="s5-3-3">
<title>5.3.3 Comparison with state-of-the-art methods</title>
<p>In order to further validate the performance of the GGMNet on the target dataset, we compared the proposed GGMNet with three state-of-the-art semantic segmentation networks. <xref ref-type="table" rid="T6">Table 6</xref> shows that the performance of these semantic segmentation networks on the target dataset is unsatisfactory, with the MPA less than 60%. The GGMNet, which is designed by combining neural architecture search (NAS) with a human-designed approach, has fewer parameters than HRNetV1-W32 yet achieves better performance, with MPA &#x3d; 63.1% and FWIoU &#x3d; 42.88%.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Performance comparison of the GGMNet against other state-of-the-art semantic segmentation networks on the validation set.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">Design approach</th>
<th align="center">Parameters (M)</th>
<th align="center">MPA (%)</th>
<th align="center">FWIoU (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">FC-HarDNet-70</td>
<td align="center">Human-designed</td>
<td align="center">4.1</td>
<td align="center">46.1</td>
<td align="center">30.1</td>
</tr>
<tr>
<td align="center">FasterSeg</td>
<td align="center">NAS</td>
<td align="center">4.7</td>
<td align="center">51.56</td>
<td align="center">37.8</td>
</tr>
<tr>
<td align="center">HRNetV1-W32</td>
<td align="center">Human-designed</td>
<td align="center">28.5</td>
<td align="center">57.1</td>
<td align="center">39.9</td>
</tr>
<tr>
<td align="center">GGMNet, ours</td>
<td align="center">NAS and Human-designed</td>
<td align="center">10.3</td>
<td align="center">63.1</td>
<td align="center">42.88</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T7">Table 7</xref> shows the IoU and MPA of each category on the validation set. The Archean cannot be predicted effectively because of its small number of samples. The remaining 9 categories can be successfully predicted by the GGMNet, and the accuracy of the prediction is positively correlated with the number of samples. The experimental results demonstrate the effectiveness and superiority of the neural network design method, which performs neural architecture search directly on the target dataset, followed by manual optimization based on the characteristics of the target task. GGMNet&#x2019;s prediction results on the validation set (<xref ref-type="fig" rid="F5">Figure 5</xref>) show that the network has a strong predictive ability, especially for categories with a high percentage of pixels. GGMNet can accurately depict the outline of the target body and accurately classify it. Compared with the ground truth, the prediction results give richer detailed information.</p>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>The IoU and MPA of GGMNet for each category on the validation set.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Stratum/Lithology</th>
<th align="center">Archean</th>
<th align="center">Proterozoic</th>
<th align="center">Cambrian</th>
<th align="center">Ordovician</th>
<th align="center">Silurian</th>
<th align="center">Devonian</th>
<th align="center">Carboniferous</th>
<th align="center">Permian</th>
<th align="center">Jurassic</th>
<th align="center">Granite</th>
<th align="center">Validation set</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">percentage (%)</td>
<td align="center">0.6</td>
<td align="center">7.2</td>
<td align="center">10.3</td>
<td align="center">4</td>
<td align="center">7</td>
<td align="center">20.3</td>
<td align="center">30.9</td>
<td align="center">2.1</td>
<td align="center">2.3</td>
<td align="center">15.3</td>
<td align="left"/>
</tr>
<tr>
<td align="center">IoU</td>
<td align="center">&#x2014;-</td>
<td align="center">22.8</td>
<td align="center">28.2</td>
<td align="center">20.3</td>
<td align="center">22.4</td>
<td align="center">50.3</td>
<td align="center">61.1</td>
<td align="center">15.3</td>
<td align="center">15.9</td>
<td align="center">40.4</td>
<td align="center">42.88</td>
</tr>
<tr>
<td align="center">MPA (%)</td>
<td align="center">&#x2014;-</td>
<td align="center">55.8</td>
<td align="center">62.5</td>
<td align="center">51.4</td>
<td align="center">53.6</td>
<td align="center">82.7</td>
<td align="center">90.2</td>
<td align="center">46.8</td>
<td align="center">48.6</td>
<td align="center">76.3</td>
<td align="center">63.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>GGMNet&#x2019;s prediction results on the validation set. First row: the prediction. Second row: ground truth. Third row: the residual gravity anomaly.</p>
</caption>
<graphic xlink:href="feart-12-1407173-g005.tif"/>
</fig>
</sec>
</sec>
<sec id="s5-4">
<title>5.4 Prediction of Junggar Basin</title>
<p>The residual Bouguer gravity anomalies in the Junggar Basin and its surrounding areas are fed into the trained GGMNet for prediction. <xref ref-type="fig" rid="F6">Figure 6</xref> is a visualization of the prediction results. In the northern margin of the Junggar Basin, there is a local high gravity anomaly south of Fuhai-Fuyun and north of Karamay-Mulei, where the GGMNet predicts the coexistence of Carboniferous and Devonian strata. The north-east trending Carboniferous and Devonian strata are also widespread in the bedrock outcrops at the periphery of the basin. The local low gravity anomaly is predicted to be granite. South of Karamay-Mulei, there is a local negative anomaly area, which is predicted to be Proterozoic and Cambrian strata and granitic acidic intrusive rocks.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Prediction of Junggar Basin. GGMNet anticipates the presence of both Carboniferous and Devonian formations along the basin&#x2019;s northern edge, characterized by northeast-oriented outcroppings prevalent in the bedrock perimeter. Areas of diminished gravity are inferred as granite. In the south of Karamay-Mulei, the prediction includes Proterozoic formations, Cambrian strata, and granitic acidic intrusive rocks.</p>
</caption>
<graphic xlink:href="feart-12-1407173-g006.tif"/>
</fig>
<p>Through deep neural network prediction, the distribution of concealed formations or rock bodies throughout the entire Junggar Basin has been ascertained. Within the basin, the major north-northwest trending Karamay-Mulei-Hami fault serves as a boundary; to its north, Carboniferous and Devonian strata predominate, while to its south, a mixture of Proterozoic, Cambrian, and Ordovician strata dominates, with acidic granite intrusions occurring along the stratigraphic interfaces. The prediction indicates that in the Fuhai area along the northern margin of the Junggar Basin, primarily Carboniferous and Devonian strata are present, intermixed with granitic intrusions. This outcome is consistent with the deep geological structures revealed by traditional geophysical methods.</p>
<p>Studies on the crystalline basement of the Junggar Basin suggest the possible existence of a continental crustal crystalline basement, with its lower portion consisting of strata predating the Neoproterozoic and an upper section characterized by widely distributed Devonian and Carboniferous folded basements. This aligns well with the distribution of concealed geological bodies beneath the Junggar Basin&#x2019;s cover as predicted by our deep neural network model, indicating a high degree of reasonableness in our predictive results. This concurrence signifies that the predictions made in this study offer a credible depiction of the basin&#x2019;s sub-surface geology.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>In this paper, we systematically study the application of deep learning to geological mapping in overburden areas, and successfully predict the bedrock of the Junggar Basin by using the satellite Bouguer gravity anomaly and the 1:5000000 Asia-Europe geological map.</p>
<p>Starting from the gravity anomaly formula in the spherical coordinate system, we deduce the non-linear functional between rock density &#x3c1; and rock mineral composition m, content p, buried depth h, diagenesis time t and other variables. We analyze the feasibility of using a deep neural network to approximate the above non-linear functional. The problem of solving for the deep neural network parameters is transformed into a non-convex optimization problem. We give an iterative, gradient descent-based solution algorithm for the non-convex optimization problem.</p>
<p>We design a dataset for bedrock prediction using both the satellite Bouguer gravity anomaly with a resolution of 2&#x2032;&#xd7;2&#x2032; in the range of E65<sup>&#xb0;</sup>-95<sup>&#xb0;</sup>, N40<sup>&#xb0;</sup>-55<sup>&#xb0;</sup> and the 1:5000000 Asia-Europe geological map. The dataset contains 536 high-resolution 2048&#xd7;2048 pixel samples at four scales: 1<sup>&#xb0;</sup>&#xd7;1<sup>&#xb0;</sup>, 5<sup>&#xb0;</sup>&#xd7;5<sup>&#xb0;</sup>, 10<sup>&#xb0;</sup>&#xd7;10<sup>&#xb0;</sup>, and 15<sup>&#xb0;</sup>&#xd7;15<sup>&#xb0;</sup>.</p>
<p>Combining neural architecture search (NAS) with a human-designed approach, we propose a deep neural network (GGMNet) for geological mapping. Experiments have demonstrated that our proposed GGMNet has fast convergence and stable iterations during training. GGMNet also has better performance than architectures obtained by neural architecture search alone or by human design alone, with MPA &#x3d; 63.1% and FWIoU &#x3d; 42.88%.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>YL: Conceptualization, Methodology, Writing&#x2013;original draft. JC: Writing&#x2013;review and editing. QL: Writing&#x2013;review and editing. ZL: Writing&#x2013;review and editing. JL: Writing&#x2013;review and editing. ZF: Data curation, Writing&#x2013;review and editing. LZ: Data curation, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported in part by the National Key Research and Development Program of China under Grant 2023YFC3008903, and in part by the National Natural Science Foundation of China under Grant 42274184.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>Authors YL, JC, ZL, and JL were employed by China Coal Technology and Engineering Group Corp.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bender</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kindermans</surname>
<given-names>P.-J.</given-names>
</name>
<name>
<surname>Zoph</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Vasudevan</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Understanding and simplifying one-shot architecture search</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning</conf-name>, <conf-loc>Stockholm, Sweden</conf-loc>, <fpage>549</fpage>&#x2013;<lpage>558</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Brock</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lim</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ritchie</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Weston</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Smash: one-shot model architecture search through hypernetworks</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1708.05344">https://arxiv.org/abs/1708.05344</ext-link>.</comment>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bulo</surname>
<given-names>S. R.</given-names>
</name>
<name>
<surname>Neuhold</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kontschieder</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Loss max-pooling for semantic image segmentation</article-title>,&#x201d; in <conf-name>CVPR</conf-name>, <conf-loc>Honolulu, HI, USA</conf-loc>, <conf-date>July, 2017</conf-date>. <pub-id pub-id-type="doi">10.1109/cvpr.2017.749</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>FasterSeg: searching for faster real-time semantic segmentation</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1912.10917">https://arxiv.org/abs/1912.10917</ext-link>.</comment>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cybenko</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>Approximation by superpositions of a sigmoidal function</article-title>. <source>Math. Control, Signals Syst.</source> <volume>2</volume> (<issue>4</issue>), <fpage>303</fpage>&#x2013;<lpage>314</lpage>. <pub-id pub-id-type="doi">10.1007/bf02551274</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Deformable convolutional networks</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1703.06211">https://arxiv.org/abs/1703.06211</ext-link>.</comment>
<pub-id pub-id-type="doi">10.1109/iccv.2017.89</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>1:50 000 bedrock geological mapping in shallow overburden area: a case study of kashkeneshakar sheet (L45E009020) on the northern margin of Junggar Basin</article-title>. <source>Acta Geosci. Sin.</source>, <comment>(In Chinese with English abstract)</comment>.</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duchi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hazan</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Singer</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Adaptive subgradient methods for online learning and stochastic optimization</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2121</fpage>&#x2013;<lpage>2159</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Autogan: neural architecture search for generative adversarial networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE International Conference on Computer Vision</conf-name>, <conf-loc>Seoul, Korea (South)</conf-loc>, <conf-date>October, 2019</conf-date>, <fpage>3224</fpage>&#x2013;<lpage>3234</lpage>. <pub-id pub-id-type="doi">10.1109/iccv.2019.00332</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Goodfellow</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Courville</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Deep learning</source>. <publisher-loc>Cambridge, Massachusetts, United States</publisher-loc>: <publisher-name>The MIT Press</publisher-name>.</citation>
</ref>
<ref id="B11">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Hardt</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Recht</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Singer</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Train faster, generalize better: stability of stochastic gradient descent</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1509.01240">https://arxiv.org/abs/1509.01240</ext-link>.</comment>
</citation>
</ref>
<ref id="B12">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Deep residual learning for image recognition</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1512.03385">https://arxiv.org/abs/1512.03385</ext-link>.</comment>
</citation>
</ref>
<ref id="B13">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Identity mappings in deep residual networks</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1603.05027">https://arxiv.org/abs/1603.05027</ext-link>.</comment>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heck</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Seitz</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>A comparison of the tesseroid, prism and point-mass approaches for mass reductions in gravity field modelling</article-title>. <source>J. Geodesy</source> <volume>81</volume> (<issue>2</issue>), <fpage>121</fpage>&#x2013;<lpage>136</lpage>. <pub-id pub-id-type="doi">10.1007/s00190-006-0094-0</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hornik</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Stinchcombe</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>White</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>Multilayer feedforward networks are universal approximators</article-title>. <source>Neural Netw.</source> <volume>2</volume> (<issue>5</issue>), <fpage>359</fpage>&#x2013;<lpage>366</lpage>. <pub-id pub-id-type="doi">10.1016/0893-6080(89)90020-8</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Batch normalization: accelerating deep network training by reducing internal covariate shift</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1502.03167">https://arxiv.org/abs/1502.03167</ext-link>.</comment>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jian</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kr&#xf6;ner</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jahn</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Windley</surname>
<given-names>B. F.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Zircon dating of Neoproterozoic and Cambrian ophiolites in West Mongolia and implications for the timing of orogenic processes in the central part of the Central Asian Orogenic Belt</article-title>. <source>Earth-Science Rev.</source> <volume>133</volume>, <fpage>62</fpage>&#x2013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1016/j.earscirev.2014.02.006</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Late Neoproterozoic and Paleozoic tectonic framework and evolution of eastern Xinjiang</article-title>. <source>Geol. Rev.</source> <volume>50</volume> (<issue>3</issue>), <fpage>304</fpage>&#x2013;<lpage>322</lpage>. <comment>(In Chinese with English abstract)</comment>.</citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Keskar</surname>
<given-names>N. S.</given-names>
</name>
<name>
<surname>Socher</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Improving generalization performance by switching from Adam to SGD</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1712.07628">https://arxiv.org/abs/1712.07628</ext-link>.</comment>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ba</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Adam: a method for stochastic optimization</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR 2015)</conf-name>, <conf-loc>San Diego, CA, USA</conf-loc>, <conf-date>May, 2015</conf-date>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Simchowitz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jordan</surname>
<given-names>M. I.</given-names>
</name>
<name>
<surname>Recht</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Gradient descent converges to minimizers</source>. <publisher-loc>Berkeley</publisher-loc>: <publisher-name>University of California</publisher-name>, <fpage>16</fpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schroff</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hua</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Auto-DeepLab: hierarchical neural architecture search for semantic image segmentation</article-title>,&#x201d; in <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Long Beach, CA, USA</conf-loc>, <conf-date>June, 2019</conf-date>.</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>Q. A.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X. H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Geochronology and geochemistry of Carboniferous metabasalts in Eastern Tianshan, Central Asia: evidence of a back-arc basin</article-title>. <source>Int. Geol. Rev.</source> <volume>58</volume> (<issue>6</issue>), <fpage>756</fpage>&#x2013;<lpage>772</lpage>. <pub-id pub-id-type="doi">10.1080/00206814.2015.1114433</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Oktay</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Schlemper</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Folgoc</surname>
<given-names>L. L.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Heinrich</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Misawa</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Attention U-net: learning where to look for the pancreas</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1804.03999">https://arxiv.org/abs/1804.03999</ext-link>.</comment>
</citation>
</ref>
<ref id="B25">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Pham</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>M. Y.</given-names>
</name>
<name>
<surname>Zoph</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q. V.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Efficient neural architecture search via parameter sharing</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1802.03268">https://arxiv.org/abs/1802.03268</ext-link>.</comment>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Robbins</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Monro</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>1951</year>). <article-title>A stochastic approximation method</article-title>. <source>Ann. Math. statistics</source> <volume>22</volume>, <fpage>400</fpage>&#x2013;<lpage>407</lpage>. <pub-id pub-id-type="doi">10.1214/aoms/1177729586</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: convolutional networks for biomedical image segmentation</article-title>,&#x201d; in <conf-name>International Conference on Medical Image Computing and Computer-Assisted Intervention</conf-name>, <conf-loc>Munich, Germany</conf-loc>, <conf-date>October, 2015</conf-date>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L. C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>MobileNetV2: inverted residuals and linear bottlenecks</article-title>,&#x201d; in <conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <pub-id pub-id-type="doi">10.1109/cvpr.2018.00474</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q. V.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>EfficientNet: rethinking model scaling for convolutional neural networks</article-title>. <source>ICML</source>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tieleman</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Lecture 6.5-RMSProp: divide the gradient by a running average of its recent magnitude</article-title>. <source>COURSERA Neural Netw. Mach. Learn.</source> <volume>4</volume>.</citation>
</ref>
<ref id="B32">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Wilson</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Roelofs</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Stern</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Srebro</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Recht</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The marginal value of adaptive gradient methods in machine learning</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1705.08292">https://arxiv.org/abs/1705.08292</ext-link>.</comment>
</citation>
</ref>
<ref id="B33">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>van den Hengel</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Bridging category-level and instance-level semantic image segmentation</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1605.06885">https://arxiv.org/abs/1605.06885</ext-link>.</comment>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Windley</surname>
<given-names>B. F.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2006</year>). <article-title>SHRIMP zircon age of the Aermantai ophiolite in the North Xinjiang area, China and its tectonic implications</article-title>. <source>Acta Geol. Sin.</source> <volume>80</volume> (<issue>1</issue>), <fpage>32</fpage>&#x2013;<lpage>37</lpage>. <comment>(In Chinese with English abstract)</comment>.</citation>
</ref>
<ref id="B34">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>T.-J.</given-names>
</name>
<name>
<surname>Collins</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hwang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>DeeperLab: single-shot image parser</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1902.05093">https://arxiv.org/abs/1902.05093</ext-link>.</comment>
</citation>
</ref>
<ref id="B35">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sang</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>BiSeNet: bilateral segmentation network for real-time semantic segmentation</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1808.00897">https://arxiv.org/abs/1808.00897</ext-link>.</comment>
</citation>
</ref>
<ref id="B36">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Zoph</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q. V.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Neural architecture search with reinforcement learning</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1611.01578">https://arxiv.org/abs/1611.01578</ext-link>.</comment>
</citation>
</ref>
</ref-list>
</back>
</article>