<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2021.761050</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Medicine</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Pyramid-Net: Intra-layer Pyramid-Scale Feature Aggregation Network for Retinal Vessel Segmentation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Zhang</surname> <given-names>Jiawei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1446436/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname> <given-names>Yanchun</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/819665/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Qiu</surname> <given-names>Hailong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1436545/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Xie</surname> <given-names>Wen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1529660/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yao</surname> <given-names>Zeyang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1389109/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yuan</surname> <given-names>Haiyun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/757552/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jia</surname> <given-names>Qianjun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Tianchen</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Shi</surname> <given-names>Yiyu</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Huang</surname> <given-names>Meiping</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1502423/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhuang</surname> <given-names>Jian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c003"><sup>&#x0002A;</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Xu</surname> <given-names>Xiaowei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c004"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1465476/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Guangdong Provincial Key Laboratory of South China Structural Heart Disease, Guangdong Provincial People&#x00027;s Hospital, Guangdong Cardiovascular Institute, Guangdong Academy of Medical Sciences</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Shanghai key Laboratory of Data Science, School of Computer Science, Fudan University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Computer Science and Engineering, University of Notre Dame</institution>, <addr-line>Notre Dame, IN</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Oujiang Laboratory (Zhejiang Lab for Regenerative Medicine, Vision and Brain Health)</institution>, <addr-line>Wenzhou</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>Cyberspace Institute of Advanced Technology, Guangzhou University</institution>, <addr-line>Guangzhou</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>College of Engineering and Science, Victoria University</institution>, <addr-line>Melbourne, VIC</addr-line>, <country>Australia</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Jun Feng, Northwest University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Juanying Xie, Shaanxi Normal University, China; M&#x000E1;rton Szemenyei, Budapest University of Technology and Economics, Hungary; Erlei Zhang, Northwest A&#x00026;F University, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Yanchun Zhang <email>yanchun.zhang&#x00040;vu.edu.au</email></corresp>
<corresp id="c002">Meiping Huang <email>huangmeiping&#x00040;126.com</email></corresp>
<corresp id="c003">Jian Zhuang <email>Zhuangjian5413&#x00040;163.com</email></corresp>
<corresp id="c004">Xiaowei Xu <email>xiao.wei.xu&#x00040;foxmail.com</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Precision Medicine, a section of the journal Frontiers in Medicine</p></fn>
<fn fn-type="equal" id="fn002"><p>&#x02020;These authors have contributed equally to this work</p></fn></author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>12</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>8</volume>
<elocation-id>761050</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>08</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>11</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2021 Zhang, Zhang, Qiu, Xie, Yao, Yuan, Jia, Wang, Shi, Huang, Zhuang and Xu.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Zhang, Zhang, Qiu, Xie, Yao, Yuan, Jia, Wang, Shi, Huang, Zhuang and Xu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract><p>Retinal vessel segmentation plays an important role in the diagnosis of eye-related diseases and biomarkers discovery. Existing works perform multi-scale feature aggregation in an inter-layer manner, namely <bold>inter-layer feature aggregation</bold>. However, such an approach only fuses features at either a lower scale or a higher scale, which may result in a limited segmentation performance, especially on thin vessels. This discovery motivates us to fuse multi-scale features in each layer, <bold>intra-layer feature aggregation</bold>, to mitigate the problem. Therefore, in this paper, we propose Pyramid-Net for accurate retinal vessel segmentation, which features intra-layer pyramid-scale aggregation blocks (IPABs). At each layer, IPABs generate two associated branches at a higher scale and a lower scale, respectively, and the two with the main branch at the current scale operate in a <bold>pyramid-scale</bold> manner. Three further enhancements including pyramid inputs enhancement, deep pyramid supervision, and pyramid skip connections are proposed to boost the performance. We have evaluated Pyramid-Net on three public retinal fundus photography datasets (DRIVE, STARE, and CHASE-DB1). The experimental results show that Pyramid-Net can effectively improve the segmentation performance especially on thin vessels, and outperforms the current state-of-the-art methods on all the adopted three datasets. In addition, our method is more efficient than existing methods with a large reduction in computational cost. We have released the source code at <ext-link ext-link-type="uri" xlink:href="https://github.com/JerRuy/Pyramid-Net">https://github.com/JerRuy/Pyramid-Net</ext-link>.</p></abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>neural network</kwd>
<kwd>feature aggregation</kwd>
<kwd>pyramid scale</kwd>
<kwd>retinal vessel segmentation</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Key Research and Development Program of China<named-content content-type="fundref-id">10.13039/501100012166</named-content></contract-sponsor>
<contract-sponsor id="cn002">Science and Technology Planning Project of Guangdong Province<named-content content-type="fundref-id">10.13039/501100012245</named-content></contract-sponsor>
<contract-sponsor id="cn003">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content></contract-sponsor>
<counts>
<fig-count count="5"/>
<table-count count="7"/>
<equation-count count="14"/>
<ref-count count="58"/>
<page-count count="12"/>
<word-count count="8333"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>The subtle changes in the retinal vascular, including vessel width, tortuosity, and branching features, indicate mass eye-related diseases, such as diabetic retinopathy (<xref ref-type="bibr" rid="B1">1</xref>), glaucoma (<xref ref-type="bibr" rid="B2">2</xref>), and macular degeneration (<xref ref-type="bibr" rid="B3">3</xref>). Meanwhile, those characteristics are important biomarkers for numerous systemic diseases, including hypertension (<xref ref-type="bibr" rid="B4">4</xref>) and cardiovascular diseases (<xref ref-type="bibr" rid="B5">5</xref>). Retinal vessel segmentation is one of the cornerstones to access those characteristics, particularly for automatic retinal image analysis (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>). For example, hypertensive retinopathy is a retinal disease, which is caused by hypertension. Increased vascular curvature or stenosis can be found in patients with hypertension (<xref ref-type="bibr" rid="B8">8</xref>). Conventionally, manual segmentation is laborious and time-consuming, and suffers subjectivity among experts. To improve efficiency and reliability and reduce the workload of doctors, the clinical practice puts forward high requirements for automatic segmentation (<xref ref-type="bibr" rid="B9">9</xref>).</p>
<p>Recently, deep neural networks have boosted the segmentation performance of retinal vessel segmentation (<xref ref-type="bibr" rid="B10">10</xref>, <xref ref-type="bibr" rid="B12">12</xref>) by a large margin compared with traditional methods (<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>). However, thin vessels cannot be segmented accurately. For example, <xref ref-type="fig" rid="F1">Figure 1</xref> demonstrates a commonly-seen fundus image containing numerous thin vessels and thick vessels, and corresponding segmentation (<xref ref-type="bibr" rid="B11">11</xref>) and ground truth. We can easily notice that the thick vessels enjoy a promising performance, but the thin vessels suffer a big miss. A potential reason is that the continuous pooling operations in most neural networks are used to encode the features, which leads to a mass loss of appearance information and harms the segmentation accuracy, especially on thin vessels. Note that in practice, it is also difficult to segment these thin vessels for experts due to low contrast and ambiguousness. Currently, some works have been proposed to tackle the above problems, e.g., a particular processing branch for thin vessels (<xref ref-type="bibr" rid="B12">12</xref>), a new loss function to emphasize thin vessels (<xref ref-type="bibr" rid="B10">10</xref>). However, the segmentation performance is still limited considering the clinical requirement of retinal image analysis.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Examples of challenging thin vessels in retinal vessel segmentation. The retinal fundus image (left) contains numerous thin vessels (1&#x02013;2 pixels wide) and thick vessels (3 pixels wide or more) (<xref ref-type="bibr" rid="B10">10</xref>). Regions of representative thin and thick vessels, and their corresponding ground truth and predictions (<xref ref-type="bibr" rid="B11">11</xref>) are shown in the right. It can be noticed that the thick vessels obtain a better segmentation performance, while the thin vessels suffer a big miss (indicated by red rectangles).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-08-761050-g0001.tif"/>
</fig>
<p>Meanwhile, <bold>multi-scale feature aggregation</bold> to fuse coarse-to-fine context information has been popular to segment thin/small objects (<xref ref-type="bibr" rid="B15">15</xref>&#x02013;<xref ref-type="bibr" rid="B19">19</xref>). There are mainly two approaches: input-output level category and intra-network level category. In the input-output level category, connections exist between inputs at various scales and corresponding intermediate layers (<xref ref-type="bibr" rid="B15">15</xref>), or between the intermediate layers and the final predictions with corresponding scales (<xref ref-type="bibr" rid="B18">18</xref>). In the intra-network level category, features from previous layers are adjusted in channel numbers and spatial dimension and then aggregated with the ones in the later layer (<xref ref-type="bibr" rid="B16">16</xref>). However, current multi-scale feature aggregation works in an inter-layer manner, <bold>inter-layer feature aggregation</bold>, which can only fuse features at either a lower scale or a higher scale. For example, in the encoder, feature maps at the lower scale cannot be fused by that at the current scale because of the processing order of the layers. A possible solution is to fuse multi-scale features in each layer, <bold>intra-layer feature aggregation</bold>, to consider features at both the high scale and the low scale.</p>
<p>Motivated by the above discoveries, in this paper, we propose Pyramid-Net for accurate retinal vessel segmentation. In each layer of Pyramid-Net, intra-layer pyramid-scale aggregation blocks (IPABs) are employed in both the encoder and the decoder to aggregate features at pyramid scales (the higher scale, the lower scale, and the current scale). In this way, two associated branches at the higher scale and the lower scale are generated to assist the main branch at the current scale. Therefore, coarse-to-fine context information is shared and aggregated in each layer, thus improving the segmentation accuracy of capillaries. To further improve the performance, three optimizations, including pyramid inputs enhancement, deep pyramid supervision, and pyramid skip connections, are applied to IPABs. We have conducted comprehensive experiments on three retinal vessel image segmentation datasets, including DRIVE (<xref ref-type="bibr" rid="B20">20</xref>), STARE (<xref ref-type="bibr" rid="B21">21</xref>), and CHASE-DB1 (<xref ref-type="bibr" rid="B22">22</xref>) with various segmentation networks. The experimental results show that our method can significantly improve the segmentation performance, especially on thin vessels, and achieves state-of-the-art performance on the three public datasets. In addition, our method is more efficient than the existing method with a large reduction in computational cost.</p>
<p>Overall, this work makes the following contributions:</p>
<list list-type="simple">
<list-item><p>1) We discovered that thin vessels suffer a big miss in the segmentation results of existing methods;</p></list-item>
<list-item><p>2) We proposed Pyramid-Net for retinal vessel segmentation in which intra-layer pyramid-scale aggregation blocks (IPABs) aggregate features at the higher, current, and lower scales to fuse coarse-to-fine context information in each layer;</p></list-item>
<list-item><p>3) We further propose three enhancements: pyramid input enhancement, deep pyramid supervision, and pyramid skip connections to boost the performance;</p></list-item>
<list-item><p>4) We conducted comprehensive experiments on three public vessel image datasets (DRIVE, STARE, and CHASE-DB1), and our method achieves the state-of-the-art performance on three datasets.</p></list-item>
</list>
<p>The remainder of this paper is organized as follows. Section 2 introduces related works and the motivation of the proposed method. Section 3 details the overall framework of the proposed Pyramid-Net, including IPABs and three optimizations (pyramid inputs enhancement, deep pyramid supervision, and pyramid skip connections). Section 4 first introduces datasets, implementation, and evaluation. Second, quantitative evaluations on three vessel image datasets, comparisons with the state-of-the-art algorithms, and several visual retinal segmentation results are presented. Third, several ablation studies that included evaluating the thin vessel, ablation analysis, and cross-training evaluation are discussed. Section 5 concludes the paper.</p>
</sec>
<sec id="s2">
<title>2. Related Work and Motivation</title>
<sec>
<title>2.1. Vessel Image Segmentation</title>
<p>With the emergence of numerous publicly available retinal image datasets (<xref ref-type="bibr" rid="B20">20</xref>&#x02013;<xref ref-type="bibr" rid="B22">22</xref>), the supervised vessel segmentation methods became popular in the community. Commonly-seen supervised methods consist of two steps: feature extraction and classification. Some methods extracted the color intensity (<xref ref-type="bibr" rid="B24">24</xref>) and principal components (<xref ref-type="bibr" rid="B25">25</xref>) from the images, while some methods utilized wavelet (<xref ref-type="bibr" rid="B26">26</xref>) and edge responses (<xref ref-type="bibr" rid="B27">27</xref>). In terms of classification, various classic classifiers, including Support Vector Machine (SVM) (<xref ref-type="bibr" rid="B28">28</xref>), perceptron (<xref ref-type="bibr" rid="B29">29</xref>), random decision forests (<xref ref-type="bibr" rid="B30">30</xref>), and Gaussian model (<xref ref-type="bibr" rid="B26">26</xref>) are commonly seen and widely used in traditional supervised vessel image segmentation. Recently, in the light of fully convolutional networks (FCNs) (<xref ref-type="bibr" rid="B31">31</xref>) and U-Net (<xref ref-type="bibr" rid="B23">23</xref>), data-driven deep learning-based methods have demonstrated promising results and dominated the area of vessel image segmentation. Yan et al. (<xref ref-type="bibr" rid="B10">10</xref>) pointed out that the training loss tends to ignore the loss of thin vessels and is dominated by the thick vessels, which may be caused by the imbalance between thin vessels and thick vessels. Furthermore, Yan et al. (<xref ref-type="bibr" rid="B12">12</xref>) explored a three-stage network separating the segmentation of thick vessels, thin vessels, and the vessel fusion into different stages to make full use of the difference between thick and thin vessels to improve the overall segmentation performance. 
Considering that the consecutive pooling may lead to accuracy loss, CE-Net (<xref ref-type="bibr" rid="B32">32</xref>) encodes the high-dimension information and preserves spatial information to improve the overall segmentation. HA-Net (<xref ref-type="bibr" rid="B33">33</xref>) dynamically assigns the regions in the image as hard regions or simple regions, and then introduces attention modules to help the network concentrate on the hard region for accurate vessel image segmentation. Meanwhile, some works introduce the spatial attention (<xref ref-type="bibr" rid="B34">34</xref>) and the channel attention (<xref ref-type="bibr" rid="B34">34</xref>) to the vessel segmentation domain and achieve promising results. The proposed method considerably extends our previous work (<xref ref-type="bibr" rid="B35">35</xref>), which only supplied simplified evaluations on two publicly available vessel segmentation datasets. In this work, we have added a new module named &#x0201C;pyramid skip connections,&#x0201D; which further boosts the performance. Meanwhile, we have added another widely-used dataset (STARE) to demonstrate the generalization of our proposed Pyramid-Net. Moreover, in terms of the analysis, we have supplied in-depth analyses of our method including evaluation on thin vessel segmentation, ablation analysis, and cross-training evaluation.</p>
</sec>
<sec>
<title>2.2. Motivation</title>
<p>Multi-scale feature aggregation is widely used in medical image segmentation, which fuses the previous feature maps with different scales to improve the network performance. As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, recent works (<xref ref-type="bibr" rid="B36">36</xref>&#x02013;<xref ref-type="bibr" rid="B39">39</xref>) introduced multi-scale feature aggregation to strengthen feature propagation, alleviate the vanishing gradient problem, and improve the overall segmentation. We divide those methods into two major categories: input-output level and intra-network level.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Illustrations of network structures of <bold>(a)</bold> basic U-Net (<xref ref-type="bibr" rid="B23">23</xref>) and <bold>(b&#x02013;e)</bold> existing multi-scale feature aggregation methods, which mainly consist of two major categories: input-output level and intra-network level. The input-output level category means that the network employs multiple scaled inputs, and the scaled ground truth supervises the inter feature maps. In the intra-network level category, the encoder level, the decoder level, and the cross-level indicate implemented multi-scale feature aggregation in the encoder, the decoder, and their cross, respectively.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-08-761050-g0002.tif"/>
</fig>
<p><bold>Input-output level category</bold>: The connections exist between inputs at various scales and corresponding intermediate layers, or between the intermediate layers and the final predictions with corresponding scales. For example, Wu et al. (<xref ref-type="bibr" rid="B40">40</xref>) generated multi-scale feature maps by max-pooling and up-sampling layer and employed two sub-models to extract and aggregate features at multiple scales. MIMO-Net (<xref ref-type="bibr" rid="B41">41</xref>) fused scaled input images with multiple resolutions into the intermediate layers of the network in the encoder, and optimized the features in the decoder to improve the overall segmentation performance. MILD-Net (<xref ref-type="bibr" rid="B42">42</xref>) fused scaled original images with multiple resolutions to alleviate the potential accuracy decline caused by max-pooling.</p>
<p><bold>Intra-network level category</bold>: In this approach, features from previous layers are adjusted in channel numbers and spatial dimension and then aggregated with the ones in the later layer. For ease of discussion, we discuss the network structures of related works based on the U-Net as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. Note that U-Net is the most widely-used network in medical image segmentation. These works contain three main approaches: dense connections in the encoder (encoder sub-level), dense connections in the decoder (decoder sub-level) and dense connections in the cross of the encoder and the decoder (cross sub-level): (1) Encoder sub-level: (<xref ref-type="bibr" rid="B15">15</xref>) aggregated the scale inputs into the intermediate layers in the encoder to alleviate the accuracy decline caused by pooling; (2) Decoder sub-level: Dense decoder short connections (<xref ref-type="bibr" rid="B18">18</xref>) made full use of the feature maps in the decoder by fusing them with the feature maps in later layers; (3) Cross sub-level: Complete bipartite networks (<xref ref-type="bibr" rid="B16">16</xref>) inspired by the structure of complete bipartite graphs connected every layer in the encoder and the decoder.</p>
<p>Though multi-scale feature aggregation can significantly improve segmentation performance, we discover that they usually work in an inter-layer manner, <bold>inter-layer feature aggregation</bold>. In such a manner, features at either a lower scale or a higher scale are fused by the current layer. For example, in the encoder, feature maps at the lower scale cannot be fused by that at the current scale because of the processing order of the layers. The same phenomenon also exists in the decoder. Note that a successful segmentation needs to consider both feature maps at high scales for global localization information and low scales for detailed appearance information. Thus, we may mitigate the above problem by performing multi-scale feature aggregation in each layer of the network, <bold>intra-layer feature aggregation</bold>. How to obtain the multi-scale features in each layer becomes another problem. We may use pooling and upsampling to obtain two associated branches operating on a higher scale and a low scale, respectively. In this way, there exist three branches at three different scales (namely <bold>pyramid scales</bold>) in each layer, which is like a ResNet block (<xref ref-type="bibr" rid="B43">43</xref>). In this way, we may aggregate coarse-to-fine context information from pyramid-scale feature maps in each layer to further improve the segmentation performance.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3. Methods</title>
<p>In this section, we first introduce IPABs and then describe three optimizations, including pyramid input enhancement, deep pyramid supervision, and pyramid skip connections. <xref ref-type="fig" rid="F3">Figure 3</xref> presents the structure details of Pyramid-Net.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>The network structure of the proposed Pyramid-Net. IPABs (green rectangle) aggregate features at <bold>pyramid scales</bold> [the current scale (green line), the higher scale (dark green line) and the lower scale (bright green line)] containing coarse-to-fine context information. Meanwhile, pyramid input enhancement (yellow rectangle), deep pyramid supervision (purple rectangle), and pyramid skip connections (red rectangle) are employed to further improve the overall segmentation. Best viewed in color.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-08-761050-g0003.tif"/>
</fig>
<sec>
<title>3.1. Intra-layer Pyramid-Scale Aggregation Block</title>
<p>Intra-layer pyramid-scale aggregation blocks (IPABs) are based on the ResNet block (<xref ref-type="bibr" rid="B43">43</xref>), which is widely adopted in deep learning. <xref ref-type="fig" rid="F4">Figure 4</xref> illustrates the structure of the ResNet block (<xref ref-type="bibr" rid="B43">43</xref>), which is formulated as</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <italic>X</italic><sub><italic>l</italic></sub> and <italic>X</italic><sub><italic>l</italic>&#x0002B;1</sub> are the input and the output of the current layer, while <italic>f</italic>(&#x000B7;) represents the main branch of the current layer. ResNet learns the additive residual function <italic>f</italic>(&#x000B7;) with respect to the unit input through a shortcut connection between them. Meanwhile, the multi-scale feature aggregation inspires us to propose associated branches to learn coarse-to-fine features in each residual branch. <xref ref-type="fig" rid="F4">Figure 4</xref> illustrates the detailed structures of traditional ResNet blocks and our IPABs. Different from ResNet blocks, in each layer, IPABs generate two associated branches to aggregate coarse-to-fine feature maps to assist the main branch at the current scale. In each branch, the processing steps are almost the same as those in traditional ResNet blocks. Some extra steps such as up-sampling and down-sampling are adopted at the higher and the lower scales to adjust scales. In order to reduce the potential increase of computational cost, the number of channels of the inputs <italic>X</italic><sub><italic>l</italic></sub> in the main branch has been reduced to half, while the number of channels of resized inputs <inline-formula><mml:math id="M2"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M3"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> in the associated branches is reduced to one-fourth. The feature maps with channel adjustment are fed to the processing steps at three scales and are processed in parallel. The three outputs at pyramid scales are then concatenated. 
The whole process is formulated as follows,</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M4"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>l</mml:mi><mml:mi>d</mml:mi></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M5"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M6"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are the up-sampled and the down-sampled results of the current input <italic>X</italic><sub><italic>l</italic></sub> with channel adjustment, respectively. <inline-formula><mml:math id="M7"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M8"><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M9"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are the enhanced results using pyramid input enhancement, which only exists in the encoder and is detailed in section 3.2. 
Meanwhile, <inline-formula><mml:math id="M10"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M11"><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math id="M12"><mml:msubsup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are replaced by <inline-formula><mml:math id="M13"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M14"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math id="M15"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> in the decoder, which represents the enhancement results by pyramid skip connections and are detailed in section 3.4. <italic>H</italic>(&#x000B7;) represents the aggregation process, which performs re-scaling and feature concatenation. 
<inline-formula><mml:math id="M16"><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is the strengthened results of <italic>X</italic><sub><italic>l</italic>&#x0002B;1</sub> by IPAB.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>The network structure of <bold>(A)</bold> ResNet blocks and <bold>(B)</bold> our intra-layer pyramid-scale aggregation blocks (IPABs). IPABs (marked by green rectangles) aggregate coarse-to-fine features at the current scale and both the higher scale and the lower scale (<bold>pyramid scales</bold>). Meanwhile, pyramid input enhancement (marked by yellow rectangles) and deep pyramid supervision (marked by purple rectangles) are employed to fuse the original images with corresponding scales, and supervise the intermediate results in each layer of the decoder, respectively.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-08-761050-g0004.tif"/>
</fig>
<p>The channel attention module selectively emphasizes interdependent channel maps by integrating associated features among all channel maps. To improve the efficiency of feature extraction, we also employ an attention mechanism (<xref ref-type="bibr" rid="B44">44</xref>, <xref ref-type="bibr" rid="B45">45</xref>) in IPAB as follows,</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M17"><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003A6;</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>Q</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003A6;</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>Q</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003A6;</mml:mi><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E4"><label>(4)</label><mml:math id="M18"><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003A8;</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003C3;</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003A6;</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02297;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where &#x003A8;(&#x000B7;) is the operation of the attention process, <bold>Q</bold> is the convolutional operation using 1&#x000D7;1 kernels for channel adjustment, and &#x003C3; is the activation function. Average-pooling &#x003A6;<sub><italic>Avg</italic></sub>(&#x000B7;) and max-pooling &#x003A6;<sub><italic>Max</italic></sub>(&#x000B7;) are adopted to aggregate channel information. By utilizing IPAB, each layer of the network aggregates features at pyramid scales, which helps fuse coarse-to-fine context information to improve the overall segmentation performance.</p>
</sec>
<sec>
<title>3.2. Pyramid Input Enhancement</title>
<p>Pyramid input enhancement fuses the input image with multiple scales to IPABs to reduce the loss of information caused by re-scaling and enhance feature fusion. Pooling operations with various pooling sizes are used to guarantee spatial resolution consistency. Particularly, in each layer, the input image is scaled at higher, current, and lower scales, and fed to three parallel processing steps at multiple scales in the IPAB. Pooling operations over larger regions successively reinforce the scale and translation invariance while reducing noise sensitivity at the same time as more and more context information is added. The aggregation should facilitate discrimination between relevant features and local noises. The above three pyramid-scale images are concatenated with corresponding outputs of up-sampling, down-sampling, and channel adjustment, respectively. Suppose that <italic>X</italic><sub><italic>l</italic></sub> is denoted as the input of the current layer, and <inline-formula><mml:math id="M19"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, and <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are results at the higher scale and the lower scale, respectively. 
Meanwhile, <italic>I</italic><sub><italic>l</italic>&#x02212;1</sub>, <italic>I</italic><sub><italic>l</italic></sub> and <italic>I</italic><sub><italic>l</italic>&#x0002B;1</sub> are the scaled inputs of <inline-formula><mml:math id="M21"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <italic>X</italic><sub><italic>l</italic></sub>, and <inline-formula><mml:math id="M22"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> with the same size, respectively. The fusion process of the current scale is formulated as follows,</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M23"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mi>l</mml:mi><mml:mi>d</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>W</mml:mi></mml:mstyle><mml:mi>d</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E6"><label>(6)</label><mml:math id="M24"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>W</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E7"><label>(7)</label><mml:math id="M25"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>W</mml:mi></mml:mstyle><mml:mi>p</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <bold>W</bold><sup><italic>p</italic></sup>(&#x000B7;), <bold>W</bold><sup><italic>d</italic></sup>(&#x000B7;), and <bold>W</bold>(&#x000B7;) represent 3&#x000D7;3 convolutional operations and are applied before concatenating to the pyramid-scale features, and <italic>H</italic>(&#x000B7;) denotes channel adjustment.</p>
</sec>
<sec>
<title>3.3. Deep Pyramid Supervision</title>
<p>Deep pyramid supervision optimizes feature maps at multiple scales to improve the segmentation of multi-scale objects and accelerate the training process. Similar to pyramid input enhancement, deep pyramid supervision connects the intermediate layer to the final prediction, thus fusing coarse-to-fine context information. Particularly, the feature maps at multiple scales from each IPAB in the decoder are fed into a plain 3 &#x000D7; 3 convolutional layer followed by a Sigmoid function. Deep pyramid supervision at the <italic>l</italic>th scale of the decoder can be defined as,</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M26"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>Y</mml:mi><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>Y</mml:mi><mml:mi>l</mml:mi><mml:mi>d</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>The ground truths <italic>M</italic> are scaled to the same size as the pyramid-scale feature maps for deep supervision, e.g., <inline-formula><mml:math id="M27"><mml:msubsup><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math id="M28"><mml:msubsup><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are supervised by the corresponding ground truths <italic>M</italic><sub><italic>l</italic>&#x02212;1</sub>, <italic>M</italic><sub><italic>l</italic></sub>, and <italic>M</italic><sub><italic>l</italic>&#x0002B;1</sub>, respectively. Note that the feature maps in each layer can be directly fused with the final prediction and optimized without massive convolutional processing. Therefore, deep pyramid supervision can be adapted to different depths for different tasks in training, which supplies adaptive model capacity, thereby facilitating the segmentation of objects with different scales.</p>
</sec>
<sec>
<title>3.4. Pyramid Skip Connections</title>
<p>Pyramid skip connections perform feature reuse among the three scaled feature maps (the higher scale, the current scale, and the lower scale) in each IPAB module. Suppose that <italic>X</italic><sub><italic>l</italic></sub> is the input of the current layer in the decoder, and <inline-formula><mml:math id="M29"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, and <inline-formula><mml:math id="M30"><mml:msubsup><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are the results at the higher scale and the lower scale, respectively. Meanwhile, <inline-formula><mml:math id="M31"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, <inline-formula><mml:math id="M32"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mover 
accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, and <inline-formula><mml:math id="M33"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> are three groups of learned feature maps from the encoder, and feature maps in each group have the same spatial dimension with the corresponding scaled input <inline-formula><mml:math id="M34"><mml:msub><mml:mrow><mml:mover 
accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="M35"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math id="M36"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, respectively. The fusion process of the current scale is formulated as follows,</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M37"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mi>l</mml:mi><mml:mi>d</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>d</mml:mi></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E10"><label>(10)</label><mml:math id="M38"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>l</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>p</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mi>l</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>d</mml:mi></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E11"><label>(11)</label><mml:math id="M39"><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi>H</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mi>l</mml:mi><mml:mi>d</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mover accent='true'><mml:mi>X</mml:mi><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>p</mml:mi></mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>where <italic>H</italic>(&#x000B7;) denotes channel adjustment. We can see that features at the current scale <italic>l</italic> can reuse and aggregate feature maps from at most five scales (<italic>l</italic> &#x02212; 2, <italic>l</italic> &#x02212; 1, <italic>l, l</italic> &#x0002B; 1, and <italic>l</italic> &#x0002B; 2).</p>
</sec>
</sec>
<sec id="s4">
<title>4. Experiments</title>
<sec>
<title>4.1. Datasets</title>
<p>We used three publicly available retinal vessel datasets, DRIVE (<xref ref-type="bibr" rid="B20">20</xref>), STARE (<xref ref-type="bibr" rid="B21">21</xref>), and CHASE-DB1 (<xref ref-type="bibr" rid="B22">22</xref>) for evaluation. The images in the three datasets are collected using digital retinal imaging, a standard method of documenting the appearance of the retina. More details of the datasets are as follows.</p>
<p><bold>DRIVE:</bold> The DRIVE dataset (<xref ref-type="bibr" rid="B20">20</xref>) consists of 40 images with a resolution of 565 &#x000D7; 584 pixels, which were acquired using a Canon CR5 non-mydriatic 3CCD camera with a 45-degree field of view (FOV). Two trained human observers labeled the vessels in all images, and the ones from the first observer were used for network training. The dataset has been divided into a training and a test set (<xref ref-type="bibr" rid="B20">20</xref>), both of which contain 20 images.</p>
<p><bold>CHASE-DB1:</bold> The CHASE-DB1 dataset (<xref ref-type="bibr" rid="B22">22</xref>) contains vascular patch images with a resolution of 999 &#x000D7; 960, which were acquired from 28 eyes of 14 ten-year-old children. Since images were captured in subdued lighting and the operators adjusted illumination settings, the images contain more illumination variation in CHASE-DB1 compared with the DRIVE dataset. Following the configuration in Li et al. (<xref ref-type="bibr" rid="B46">46</xref>), the first 20 images and the remaining 8 images are employed as the training set and the test set, respectively.</p>
<p><bold>STARE:</bold> The STARE dataset (<xref ref-type="bibr" rid="B21">21</xref>) consists of 20 equal-sized images with a resolution of 700 &#x000D7; 605 pixels. Each image has a 35&#x000B0; FOV, and half of the images are of eyes with ocular pathology. As the training set and the test set are not explicitly specified, the same leave-one-out cross-validation is adopted (<xref ref-type="bibr" rid="B33">33</xref>) for performance evaluation, where models are iteratively trained on 19 images and tested on the remaining image. Like other methods (<xref ref-type="bibr" rid="B10">10</xref>), manual annotations generated by the first observer are used for both training and testing.</p>
</sec>
<sec>
<title>4.2. Implementations</title>
<p>All experiments were conducted on an Nvidia GeForce Titan X (pascal) containing 12 GB memory. Meanwhile, we employed CE-Net (<xref ref-type="bibr" rid="B32">32</xref>), one of the state-of-the-art methods in retinal vessel segmentation, as the backbone model to implement IPABs, pyramid input enhancement, deep pyramid supervision, and pyramid skip connections. Normalization of the training data has been implemented. In order to express the details of multi-scale feature fusion more clearly, we use U-Net, which is widely used in the medical image segmentation domain, as the basic network for explanation. In practice, we use the state-of-the-art method CE-Net to replace U-Net to obtain better performance. During training, we adopted Adaptive Moment Estimation (Adam) as the learning optimizer with a batch size of 4. Data augmentation operations including horizontal flip, vertical flip, and diagonal flip are used to enlarge the training samples. We use a threshold to obtain the final segmentation from pixel probability vectors. Particularly, the pixels with values smaller than the threshold are assigned to the background class, and the remaining pixels with values equal to or greater than the threshold are categorized as the vessel class. The final prediction is the ensemble of the segmentation output of the vessel images, its rotation (90&#x000B0;), and its flip (horizontal and vertical).</p>
</sec>
<sec>
<title>4.3. Evaluation Metrics</title>
<p>We introduce four evaluation metrics including Sensitivity (Sens), Specificity (Spec), Accuracy (Acc), and Area Under the ROC Curve (AUC) to validate our proposed Pyramid-Net. The metrics are calculated as follows:</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M40"><mml:mrow><mml:mtext>Sensitivity</mml:mtext><mml:mo>=</mml:mo><mml:mtext>TP</mml:mtext><mml:mo>/</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FN</mml:mtext><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E13"><label>(13)</label><mml:math id="M41"><mml:mrow><mml:mtext>Specificity</mml:mtext><mml:mo>=</mml:mo><mml:mtext>TN</mml:mtext><mml:mo>/</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mtext>TN</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FP</mml:mtext><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E14"><label>(14)</label><mml:math id="M42"><mml:mrow><mml:mtext>Accuracy</mml:mtext><mml:mo>=</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext><mml:mo stretchy='false'>)</mml:mo><mml:mo>/</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FN</mml:mtext><mml:mo stretchy='false'>)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>True positive (TP) and true negative (TN) present that pixels are correctly classified to objects or backgrounds, respectively. Meanwhile, pixels will be labeled as false positive (FP) or false negative (FN), if they are misclassified to objects or backgrounds, respectively.</p>
</sec>
<sec>
<title>4.4. Quantitative Results</title>
<p>We compared our Pyramid-Net with existing state-of-the-art works on three vessel image segmentation datasets (DRIVE, CHASE-DB1, and STARE). <xref ref-type="table" rid="T1">Tables 1</xref>&#x02013;<xref ref-type="table" rid="T3">3</xref> illustrate the comparison results of Pyramid-Net and the current state-of-the-art methods. For the DRIVE dataset, Pyramid-Net achieves a high score of 82.38, 98.19, 96.26, and 98.32% on Sens, Spec, Acc, and AUC, respectively, and outperforms state-of-the-art methods in three metrics including Spec, Acc, and AUC. In terms of Sens, CE-Net achieves the best performance of 83.09%, while our method achieves a comparable result, which is 0.71% lower. Overall, Pyramid-Net achieves higher overall performance than CE-Net. For the CHASE-DB1 dataset, compared with the state-of-the-art results, the proposed Pyramid-Net achieves a high score of 81.17, 98.26, 96.89, and 98.92% for Sens, Spec, Acc, and AUC, respectively, which consistently enjoys a better performance than all the current state-of-the-art methods. For the STARE dataset, Pyramid-Net achieves a promising score of 82.35, 98.87, 97.19, and 98.62% for Sens, Spec, Acc, and AUC, respectively, which is also consistently better than all the current state-of-the-art methods. The consistent improvements in <xref ref-type="table" rid="T1">Tables 1</xref>&#x02013;<xref ref-type="table" rid="T3">3</xref> indicate the effectiveness and robustness of our Pyramid-Net.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Performance comparison of Pyramid-Net and the state-of-the-art methods on the DRIVE dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Sens (%)</bold></th>
<th valign="top" align="center"><bold>Spec (%)</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FCN (<xref ref-type="bibr" rid="B31">31</xref>)</td>
<td valign="top" align="center">74.89</td>
<td valign="top" align="center">96.21</td>
<td valign="top" align="center">94.13</td>
<td valign="top" align="center">95.67</td>
</tr>
<tr>
<td valign="top" align="left">U-Net (<xref ref-type="bibr" rid="B23">23</xref>)</td>
<td valign="top" align="center">75.31</td>
<td valign="top" align="center">96.45</td>
<td valign="top" align="center">94.45</td>
<td valign="top" align="center">96.01</td>
</tr>
<tr>
<td valign="top" align="left">DeepVessel (<xref ref-type="bibr" rid="B11">11</xref>)</td>
<td valign="top" align="center">76.12</td>
<td valign="top" align="center">97.68</td>
<td valign="top" align="center">95.23</td>
<td valign="top" align="center">97.52</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="center">76.53</td>
<td valign="top" align="center">98.18</td>
<td valign="top" align="center">95.42</td>
<td valign="top" align="center">97.52</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B47">47</xref>)</td>
<td valign="top" align="center">77.92</td>
<td valign="top" align="center">98.13</td>
<td valign="top" align="center">95.56</td>
<td valign="top" align="center">97.84</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B40">40</xref>)</td>
<td valign="top" align="center">78.44</td>
<td valign="top" align="center">98.07</td>
<td valign="top" align="center">95.67</td>
<td valign="top" align="center">98.19</td>
</tr>
<tr>
<td valign="top" align="left">CE-Net (<xref ref-type="bibr" rid="B32">32</xref>)</td>
<td valign="top" align="center">83.09</td>
<td valign="top" align="center">97.47</td>
<td valign="top" align="center">95.45</td>
<td valign="top" align="center">97.79</td>
</tr>
<tr>
<td valign="top" align="left">BTS-DSN (<xref ref-type="bibr" rid="B48">48</xref>)</td>
<td valign="top" align="center">78.91</td>
<td valign="top" align="center">98.04</td>
<td valign="top" align="center">95.61</td>
<td valign="top" align="center">98.06</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B49">49</xref>)</td>
<td valign="top" align="center">79.16</td>
<td valign="top" align="center">98.11</td>
<td valign="top" align="center">95.70</td>
<td valign="top" align="center">98.10</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B50">50</xref>)</td>
<td valign="top" align="center">79.40</td>
<td valign="top" align="center">98.16</td>
<td valign="top" align="center">95.67</td>
<td valign="top" align="center">97.72</td>
</tr>
<tr>
<td valign="top" align="left">Vessel-Net (<xref ref-type="bibr" rid="B51">51</xref>)</td>
<td valign="top" align="center">80.38</td>
<td valign="top" align="center">98.02</td>
<td valign="top" align="center">95.78</td>
<td valign="top" align="center">98.21</td>
</tr>
<tr>
<td valign="top" align="left">MResU-Net (<xref ref-type="bibr" rid="B52">52</xref>)</td>
<td valign="top" align="center">79.69</td>
<td valign="top" align="center">97.99</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">97.99</td>
</tr>
<tr>
<td valign="top" align="left">CTF-Net (<xref ref-type="bibr" rid="B53">53</xref>)</td>
<td valign="top" align="center">78.49</td>
<td valign="top" align="center">98.13</td>
<td valign="top" align="center">95.67</td>
<td valign="top" align="center">97.88</td>
</tr>
<tr>
<td valign="top" align="left">Hybrid-Net (<xref ref-type="bibr" rid="B6">6</xref>)</td>
<td valign="top" align="center"><bold>83.53</bold></td>
<td valign="top" align="center">97.51</td>
<td valign="top" align="center">95.79</td>
<td valign="top" align="center">-</td>
</tr>
<tr>
<td valign="top" align="left">HA-Net (<xref ref-type="bibr" rid="B33">33</xref>)</td>
<td valign="top" align="center">79.91</td>
<td valign="top" align="center">98.13</td>
<td valign="top" align="center">95.81</td>
<td valign="top" align="center">98.23</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Pyramid-Net</bold></td>
<td valign="top" align="center">82.38</td>
<td valign="top" align="center"><bold>98.19</bold></td>
<td valign="top" align="center"><bold>96.26</bold></td>
<td valign="top" align="center"><bold>98.32</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Performance comparison of Pyramid-Net and the state-of-the-art methods on the CHASE-DB1 dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Sens (%)</bold></th>
<th valign="top" align="center"><bold>Spec (%)</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B54">54</xref>)</td>
<td valign="top" align="center">76.15</td>
<td valign="top" align="center">95.75</td>
<td valign="top" align="center">94.67</td>
<td valign="top" align="center">96.23</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B46">46</xref>)</td>
<td valign="top" align="center">75.07</td>
<td valign="top" align="center">97.93</td>
<td valign="top" align="center">95.81</td>
<td valign="top" align="center">97.16</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B55">55</xref>)</td>
<td valign="top" align="center">81.94</td>
<td valign="top" align="center">97.39</td>
<td valign="top" align="center">96.30</td>
<td valign="top" align="center">-</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="center">76.33</td>
<td valign="top" align="center">98.09</td>
<td valign="top" align="center">96.10</td>
<td valign="top" align="center">97.81</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B47">47</xref>)</td>
<td valign="top" align="center">77.56</td>
<td valign="top" align="center">98.20</td>
<td valign="top" align="center">96.34</td>
<td valign="top" align="center">98.15</td>
</tr>
<tr>
<td valign="top" align="left">FCN (<xref ref-type="bibr" rid="B31">31</xref>)</td>
<td valign="top" align="center">76.41</td>
<td valign="top" align="center">98.06</td>
<td valign="top" align="center">96.07</td>
<td valign="top" align="center">97.76</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B56">56</xref>)</td>
<td valign="top" align="center">81.55</td>
<td valign="top" align="center">97.52</td>
<td valign="top" align="center">96.10</td>
<td valign="top" align="center">98.04</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B48">48</xref>)</td>
<td valign="top" align="center">78.88</td>
<td valign="top" align="center">98.01</td>
<td valign="top" align="center">96.27</td>
<td valign="top" align="center">98.40</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B50">50</xref>)</td>
<td valign="top" align="center">80.74</td>
<td valign="top" align="center">98.21</td>
<td valign="top" align="center">96.61</td>
<td valign="top" align="center">98.12</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B51">51</xref>)</td>
<td valign="top" align="center">81.32</td>
<td valign="top" align="center">98.14</td>
<td valign="top" align="center">96.61</td>
<td valign="top" align="center">98.60</td>
</tr>
<tr>
<td valign="top" align="left">Three-stage (<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="center">76.41</td>
<td valign="top" align="center">98.06</td>
<td valign="top" align="center">96.07</td>
<td valign="top" align="center">97.76</td>
</tr>
<tr>
<td valign="top" align="left">CTF-Net (<xref ref-type="bibr" rid="B53">53</xref>)</td>
<td valign="top" align="center">79.48</td>
<td valign="top" align="center"><bold>98.42</bold></td>
<td valign="top" align="center">96.48</td>
<td valign="top" align="center">98.47</td>
</tr>
<tr>
<td valign="top" align="left">Hybrid-Net (<xref ref-type="bibr" rid="B6">6</xref>)</td>
<td valign="top" align="center">81.76</td>
<td valign="top" align="center">97.76</td>
<td valign="top" align="center">96.32</td>
<td valign="top" align="center">-</td>
</tr>
<tr>
<td valign="top" align="left">HA-Net (<xref ref-type="bibr" rid="B33">33</xref>)</td>
<td valign="top" align="center"><bold>82.39</bold></td>
<td valign="top" align="center">98.13</td>
<td valign="top" align="center">96.70</td>
<td valign="top" align="center">98.70</td>
</tr>
<tr>
<td valign="top" align="left">Pyramid-Net</td>
<td valign="top" align="center">81.17</td>
<td valign="top" align="center">98.26</td>
<td valign="top" align="center"><bold>96.89</bold></td>
<td valign="top" align="center"><bold>98.92</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Performance comparison of Pyramid-Net and the state-of-the-art methods on the STARE dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Sens (%)</bold></th>
<th valign="top" align="center"><bold>Spec (%)</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B54">54</xref>)</td>
<td valign="top" align="center">73.20</td>
<td valign="top" align="center">98.40</td>
<td valign="top" align="center">95.60</td>
<td valign="top" align="center">96.70</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B57">57</xref>)</td>
<td valign="top" align="center">77.91</td>
<td valign="top" align="center">97.58</td>
<td valign="top" align="center">95.54</td>
<td valign="top" align="center">97.48</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B58">58</xref>)</td>
<td valign="top" align="center">76.80</td>
<td valign="top" align="center">97.38</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">-</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="center">75.81</td>
<td valign="top" align="center">98.46</td>
<td valign="top" align="center">96.12</td>
<td valign="top" align="center">98.01</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B56">56</xref>)</td>
<td valign="top" align="center">75.95</td>
<td valign="top" align="center">98.78</td>
<td valign="top" align="center">96.41</td>
<td valign="top" align="center">98.32</td>
</tr>
<tr>
<td valign="top" align="left">Three-stage (<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="center">77.35</td>
<td valign="top" align="center">98.57</td>
<td valign="top" align="center">96.38</td>
<td valign="top" align="center">98.33</td>
</tr>
<tr>
<td valign="top" align="left">MResU-Net (<xref ref-type="bibr" rid="B52">52</xref>)</td>
<td valign="top" align="center">81.01</td>
<td valign="top" align="center">97.95</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">98.16</td>
</tr>
<tr>
<td valign="top" align="left">Hybrid-Net (<xref ref-type="bibr" rid="B6">6</xref>)</td>
<td valign="top" align="center">79.46</td>
<td valign="top" align="center">98.21</td>
<td valign="top" align="center">96.26</td>
<td valign="top" align="center">-</td>
</tr>
<tr>
<td valign="top" align="left">HA-Net (<xref ref-type="bibr" rid="B33">33</xref>)</td>
<td valign="top" align="center">81.86</td>
<td valign="top" align="center">98.44</td>
<td valign="top" align="center">96.73</td>
<td valign="top" align="center">98.32</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Pyramid-Net</bold></td>
<td valign="top" align="center"><bold>82.35</bold></td>
<td valign="top" align="center"><bold>98.87</bold></td>
<td valign="top" align="center"><bold>97.19</bold></td>
<td valign="top" align="center"><bold>98.62</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.5. Qualitative Results</title>
<p>The visual comparisons between Pyramid-Net and the state-of-the-art methods, including DeepVessel and CE-Net on the DRIVE dataset and the CHASE-DB1 dataset are shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. White (TP) and black (TN) pixels are correct predictions of vessels and the background, respectively, while red (FP) and green (FN) pixels are incorrect predictions. In <xref ref-type="fig" rid="F5">Figure 5</xref>, dark yellow rectangles contain the selected areas used for detail comparison, and the bright yellow rectangles contain the zoomed area in the dark yellow rectangle. We can notice that current methods enjoy a good performance on the segmentation of main retinal vessels, but the effect on some capillaries is poor. For example, Row 1 of <xref ref-type="fig" rid="F5">Figure 5</xref> shows that the result of DeepVessel misses a large number of thin vessels on the DRIVE dataset, and that of CE-Net obtains a much better accuracy on thin vessels. However, in Row 2, there is no significant difference between the results of the two methods. In both Rows 1 and 2 of <xref ref-type="fig" rid="F5">Figure 5</xref>, our method can achieve much higher accuracy, but we can still notice that our method cannot segment them correctly if the vessels are too thin. We can further observe that our method has much fewer false-negative pixels (indicated by green) than the other two. This may be due to the fact that our proposed IPABs can consider more scales thus improving the segmentation accuracy. Overall, our proposed Pyramid-Net evidently improves the segmentation performance, especially for those narrow, low-contrast, and ambiguous retinal vessels.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Visual comparison of Pyramid-Net and the state-of-the-art methods including DeepVessel (<xref ref-type="bibr" rid="B11">11</xref>) and CE-Net (<xref ref-type="bibr" rid="B32">32</xref>) on DRIVE (Row 1&#x02013;2), CHASE-DB1 (Row 3&#x02013;4), and STARE (Row 5) datasets. White (TP) and black (TN) pixels indicate correct predictions of object and background, respectively, while red (FP) and green (FN) pixels indicate incorrect predictions. The dark yellow rectangle contains the area used to compare segmentation details, and the bright yellow rectangle contains the zoomed area in the dark yellow rectangle. Best viewed in color.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-08-761050-g0005.tif"/>
</fig>
</sec>
<sec>
<title>4.6. Evaluation on Thin Vessels</title>
<p>In the previous subsection, the results in <xref ref-type="fig" rid="F5">Figure 5</xref> indicate that though the main vessels enjoy a promising segmentation performance, the segmentation of thin vessels always suffers a big miss in the prediction. In practice, it is challenging to segment the thin vessels from the complex retina background, which are always low-contrast and extremely narrow (1&#x02013;2 pixels). Thus, in this subsection, to evaluate the effectiveness of Pyramid-Net on thin vessels, we compared Pyramid-Net with the state-of-the-art methods on an additional dataset only containing thin vessel labels. Vessels with a width of 1 or 2 pixels are commonly regarded as the thin vessels in the DRIVE dataset. To avoid potential unfairness in the evaluation caused by manually adding thin vessel labels, we distinguish thick vessels from thin vessels by an opening operation (<xref ref-type="bibr" rid="B10">10</xref>). The evaluation results are summarized in <xref ref-type="table" rid="T4">Table 4</xref>. It can be noticed that Pyramid-Net achieves a high ACC score of 96.26, 96.51, and 91.64% on all vessels, thick vessels, and thin vessels, respectively. Overall, our method outperforms the state-of-the-art methods on all metrics. As for the thin vessel segmentation, our method achieves an improvement of 4.73% over the backbone model CE-Net and outperforms the state-of-the-art method by about 3.86%. The experiment results indicate that our Pyramid-Net is particularly effective on thin vessels.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Performance comparison on thick and thin vessels of Pyramid-Net on the DRIVE dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>All vessel (%)</bold></th>
<th valign="top" align="center"><bold>Thick vessel (%)</bold></th>
<th valign="top" align="center"><bold>Thin vessel (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B10">10</xref>)</td>
<td valign="top" align="center">95.42</td>
<td valign="top" align="center">95.78</td>
<td valign="top" align="center">87.78</td>
</tr>
<tr>
<td valign="top" align="left">CE-Net (<xref ref-type="bibr" rid="B32">32</xref>)</td>
<td valign="top" align="center">95.45</td>
<td valign="top" align="center">95.96</td>
<td valign="top" align="center">86.91</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Pyramid-Net</bold></td>
<td valign="top" align="center"><bold>96.26</bold></td>
<td valign="top" align="center"><bold>96.51</bold></td>
<td valign="top" align="center"><bold>91.64</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.7. Ablation Analysis</title>
<p>To justify the effectiveness of IPABs, pyramid input enhancement, deep pyramid supervision, and pyramid skip connections in the proposed Pyramid-Net, we conduct ablation analysis using the DRIVE dataset as a vehicle. The ablation experimental results are summarized in <xref ref-type="table" rid="T5">Table 5</xref>. We use CE-Net (<xref ref-type="bibr" rid="B32">32</xref>) as our backbone, which achieves a good score of 95.45 and 97.79% on Acc and on AUC, respectively. Firstly, we evaluate the effectiveness of IPABs on the backbone. Benefiting from aggregating coarse-to-fine context information from pyramid scale in each layer, the backbone model with IPABs achieves improvements of 0.62% on Acc and 0.30% on AUC. Second, we evaluate pyramid input enhancement and deep pyramid supervision to feed the original image at multiple scales into the network and supervise the intermediate layers that contain features at various scales. In <xref ref-type="table" rid="T5">Table 5</xref>, we can notice that the above two optimizations achieve improvements of more than 0.10 and 0.07% in AUC, respectively. Third, pyramid skip connections connect the encoder and the decoder and make full use of the features from multiple layers and scales in the encoder, which achieves an improvement of about 0.15% on AUC. Overall, integrating the pyramid-scale concept into the design of the basic unit and skip connections can obviously improve the network segmentation, and the other two optimizations also bring some improvement.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Ablation analysis of Pyramid-Net on the DRIVE dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Baseline</td>
<td valign="top" align="center">95.45</td>
<td valign="top" align="center">97.79</td>
</tr>
<tr>
<td valign="top" align="left">Baseline &#x0002B; IPABs</td>
<td valign="top" align="center">96.07</td>
<td valign="top" align="center">98.09</td>
</tr>
<tr>
<td valign="top" align="left">Baseline &#x0002B; IPABs &#x0002B; pyramid input</td>
<td valign="top" align="center">96.10</td>
<td valign="top" align="center">98.15</td>
</tr>
<tr>
<td valign="top" align="left">Baseline &#x0002B; IPABs &#x0002B; Pyramid supervision</td>
<td valign="top" align="center">96.15</td>
<td valign="top" align="center">98.12</td>
</tr>
<tr>
<td valign="top" align="left">Baseline &#x0002B; IPABs &#x0002B; pyramid skip connection</td>
<td valign="top" align="center">96.21</td>
<td valign="top" align="center">98.24</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Pyramid-Net</bold></td>
<td valign="top" align="center"><bold>96.26</bold></td>
<td valign="top" align="center"><bold>98.32</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.8. Cross-Training Evaluation</title>
<p>To evaluate the generalization of Pyramid-Net, we performed a cross-training evaluation on the DRIVE dataset and the STARE dataset. We directly implemented our models trained on the source dataset and tested on the target dataset for fair comparisons. The experimental results are summarized in <xref ref-type="table" rid="T6">Table 6</xref>. Overall, our method achieves the state-of-the-art transfer performance on both configurations. Particularly, for the configuration in which models are trained on the STARE dataset and tested on the DRIVE dataset, it can be noticed that the transfer model can achieve competitive results on Spec and suffer a big loss of accuracy on Sens. The potential reason is the imbalance between thick vessels and thin vessels in the STARE dataset. Manual annotations of the STARE dataset contain more thick vessels than thin vessels, which leads the model pre-trained on the STARE dataset to obtain poor segmentation performance on thin vessels of the DRIVE dataset. When the conditions are reversed, the above situation is alleviated, and the corresponding scores on Sens, Spec, Acc, and AUC on the STARE dataset are comparable with the model trained on the STARE dataset.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Cross-training evaluation on the DRIVE dataset and the STARE dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Sens (%)</bold></th>
<th valign="top" align="center"><bold>Spec (%)</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" colspan="5"><bold>DRIVE (train) -&#x0003E;</bold> <bold>STARE (test)</bold></td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="center">70.14</td>
<td valign="top" align="center">98.02</td>
<td valign="top" align="center">94.44</td>
<td valign="top" align="center">95.68</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B56">56</xref>)</td>
<td valign="top" align="center">65.05</td>
<td valign="top" align="center">99.14</td>
<td valign="top" align="center">94.81</td>
<td valign="top" align="center">97.18</td>
</tr>
<tr>
<td valign="top" align="left">HA-Net (<xref ref-type="bibr" rid="B33">33</xref>)</td>
<td valign="top" align="center">71.40</td>
<td valign="top" align="center">98.79</td>
<td valign="top" align="center">95.30</td>
<td valign="top" align="center">97.58</td>
</tr>
<tr>
<td valign="top" align="left">Pyramid-Net</td>
<td valign="top" align="center"><bold>75.71</bold></td>
<td valign="top" align="center"><bold>98.86</bold></td>
<td valign="top" align="center"><bold>95.57</bold></td>
<td valign="top" align="center"><bold>97.78</bold></td>
</tr>
<tr>
<td valign="top" align="left" colspan="5"><bold>STARE (train) -&#x0003E; DRIVE (test)</bold></td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B12">12</xref>)</td>
<td valign="top" align="center">73.19</td>
<td valign="top" align="center">98.40</td>
<td valign="top" align="center">95.80</td>
<td valign="top" align="center">96.78</td>
</tr>
<tr>
<td valign="top" align="left">(<xref ref-type="bibr" rid="B56">56</xref>)</td>
<td valign="top" align="center">70.00</td>
<td valign="top" align="center">97.59</td>
<td valign="top" align="center">94.74</td>
<td valign="top" align="center">97.18</td>
</tr>
<tr>
<td valign="top" align="left">HA-Net (<xref ref-type="bibr" rid="B33">33</xref>)</td>
<td valign="top" align="center">81.87</td>
<td valign="top" align="center"><bold>98.79</bold></td>
<td valign="top" align="center">95.30</td>
<td valign="top" align="center">97.58</td>
</tr>
<tr>
<td valign="top" align="left">Pyramid-Net</td>
<td valign="top" align="center"><bold>82.67</bold></td>
<td valign="top" align="center">98.76</td>
<td valign="top" align="center"><bold>95.36</bold></td>
<td valign="top" align="center"><bold>97.72</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.9. Comparison With Multi-Scale Aggregation Methods</title>
<p>To evaluate the effectiveness of the multi-scale information aggregated in the proposed Pyramid-Net, we compare existing multi-scale aggregation methods, including Dense Pooling Connections (<xref ref-type="bibr" rid="B15">15</xref>), Complete Bipartite Network (CB-Net) (<xref ref-type="bibr" rid="B16">16</xref>), Dense Decoder Short Connections (DDSC) (<xref ref-type="bibr" rid="B18">18</xref>), and U-Net&#x0002B;&#x0002B; (<xref ref-type="bibr" rid="B17">17</xref>) on the DRIVE dataset. For fair comparisons, we directly implement those different connection styles and our Pyramid-Net on U-Net (<xref ref-type="bibr" rid="B23">23</xref>). The comparison results and the <italic>p</italic>-values for the paired <italic>t</italic>-test are summarized in <xref ref-type="table" rid="T7">Table 7</xref>. Compared with existing methods, our method outperforms them by 0.65&#x02013;0.99% and 0.67&#x02013;1.50% on Acc and AUC, respectively. On the other hand, we also compare the computational cost of the proposed Pyramid-Net with existing methods. Obviously, existing methods improve the network performance and increase the computational cost by 16.38&#x02013;493.74G (104.9&#x02013;247.4%) on FLOPs from the numerous feature reuse. Particularly, our proposed Pyramid-Net achieves state-of-the-art performance with a computational cost reduced by 216.8G (64.7%) on FLOPs. The reason for the above phenomenon is the channel reduction in each IPAB. The number of channels in the main branch is reduced to half, while the number of channels in each associated branch is half of that of the main branch. Overall, our method achieves the state-of-the-art performance of 96.26% on Acc and 98.32% on AUC with a 64.7% reduction on FLOPs.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Comparison with existing multi-scale aggregation methods on the DRIVE Dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
<th valign="top" align="center"><bold>FLOPs</bold></th>
<th valign="top" align="center"><bold><italic>p</italic>-values</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">U-Net (<xref ref-type="bibr" rid="B23">23</xref>)</td>
<td valign="top" align="center">94.45</td>
<td valign="top" align="center">96.01</td>
<td valign="top" align="center">334.95G</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">DPC (<xref ref-type="bibr" rid="B15">15</xref>)</td>
<td valign="top" align="center">95.56</td>
<td valign="top" align="center">97.65</td>
<td valign="top" align="center">351.33G</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">CB-Net (<xref ref-type="bibr" rid="B16">16</xref>)</td>
<td valign="top" align="center">95.61</td>
<td valign="top" align="center">97.52</td>
<td valign="top" align="center">441.62G</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">DDSC (<xref ref-type="bibr" rid="B18">18</xref>)</td>
<td valign="top" align="center">95.42</td>
<td valign="top" align="center">97.48</td>
<td valign="top" align="center">381.07G</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">U-Net &#x0002B;&#x0002B; (<xref ref-type="bibr" rid="B17">17</xref>)</td>
<td valign="top" align="center">95.27</td>
<td valign="top" align="center">96.82</td>
<td valign="top" align="center">828.69G</td>
<td valign="top" align="center">&#x0003C;0.01</td>
</tr>
<tr>
<td valign="top" align="left">CE-Net (<xref ref-type="bibr" rid="B32">32</xref>)</td>
<td valign="top" align="center">95.45</td>
<td valign="top" align="center">97.79</td>
<td valign="top" align="center">-</td>
<td valign="top" align="center">&#x0003C;0.05</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Pyramid-Net</bold></td>
<td valign="top" align="center"><bold>96.26</bold></td>
<td valign="top" align="center"><bold>98.32</bold></td>
<td valign="top" align="center">188.15G</td>
<td valign="top" align="center">-</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Bold values mean the state-of-the-art performance</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="conclusions" id="s5">
<title>5. Conclusion</title>
<p>In this paper, we introduced Pyramid-Net for accurate retinal vessel segmentation. In Pyramid-Net, the proposed IPABs are utilized to generalize two associated branches to aggregate coarse-to-fine feature maps at pyramid scales to improve the segmentation performance. Meanwhile, three optimizations including pyramid inputs enhancement, deep pyramid supervision, and pyramid skip connections are implemented with IPABs in the encoder, the decoder, and the cross of the two to further improve performance, respectively. Comprehensive experiments have been conducted on three retinal vessel segmentation datasets, including DRIVE (<xref ref-type="bibr" rid="B20">20</xref>), STARE (<xref ref-type="bibr" rid="B21">21</xref>), and CHASE-DB1 (<xref ref-type="bibr" rid="B22">22</xref>). Experimental results demonstrate that our IPABs can efficiently improve the segmentation performance, especially for thin vessels. In addition, our method is also much more efficient than existing methods with a large reduction in computational cost.</p>
</sec>
<sec sec-type="data-availability" id="s6">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author/s.</p>
</sec>
<sec id="s7">
<title>Author Contributions</title>
<p>XX is the guarantor of the manuscript. JZh implemented the experiments and wrote the first draft of the manuscript. HQ, WX, and ZY managed the result analysis. All authors contributed to drawing up the manuscript.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>This work was supported by the National Key Research and Development Program of China (no. 2018YFC1002600), the Science and Technology Planning Project of Guangdong Province, China (nos. 2017B090904034, 2017B030314109, 2018B090944002, and 2019B020230003), Guangdong Peak Project (no. DFJH201802), and the National Natural Science Foundation of China (no. 62006050).</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Winder</surname> <given-names>RJ</given-names></name> <name><surname>Morrow</surname> <given-names>PJ</given-names></name> <name><surname>McRitchie</surname> <given-names>IN</given-names></name> <name><surname>Bailie</surname> <given-names>J</given-names></name> <name><surname>Hart</surname> <given-names>PM</given-names></name></person-group>. <article-title>Algorithms for digital image processing in diabetic retinopathy</article-title>. <source>Comput Med Imaging Graph</source>. (<year>2009</year>) <volume>33</volume>:<fpage>608</fpage>&#x02013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2009.06.003</pub-id><pub-id pub-id-type="pmid">19616920</pub-id></citation></ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mitchell</surname> <given-names>P</given-names></name> <name><surname>Leung</surname> <given-names>H</given-names></name> <name><surname>Wang</surname> <given-names>JJ</given-names></name> <name><surname>Rochtchina</surname> <given-names>E</given-names></name> <name><surname>Lee</surname> <given-names>AJ</given-names></name> <name><surname>Wong</surname> <given-names>TY</given-names></name> <etal/></person-group>. <article-title>Retinal vessel diameter and open-angle glaucoma: the blue mountains eye study</article-title>. <source>Ophthalmology</source>. (<year>2005</year>) <volume>112</volume>:<fpage>245</fpage>&#x02013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1016/j.ophtha.2004.08.015</pub-id><pub-id pub-id-type="pmid">15691558</pub-id></citation></ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yannuzzi</surname> <given-names>LA</given-names></name> <name><surname>Negr&#x000E3;o</surname> <given-names>S</given-names></name> <name><surname>Tomohiro</surname> <given-names>I</given-names></name> <name><surname>Carvalho</surname> <given-names>C</given-names></name> <name><surname>Rodriguez-Coleman</surname> <given-names>H</given-names></name> <name><surname>Slakter</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Retinal angiomatous proliferation in age-related macular degeneration</article-title>. <source>Retina</source>. (<year>2012</year>) <volume>32</volume>:<fpage>416</fpage>&#x02013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1097/IAE.0b013e31823f9b3b</pub-id><pub-id pub-id-type="pmid">22451953</pub-id></citation></ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ikram</surname> <given-names>MK</given-names></name> <name><surname>Witteman</surname> <given-names>JC</given-names></name> <name><surname>Vingerling</surname> <given-names>JR</given-names></name> <name><surname>Breteler</surname> <given-names>MM</given-names></name> <name><surname>Hofman</surname> <given-names>A</given-names></name> <name><surname>de Jong</surname> <given-names>PT</given-names></name></person-group>. <article-title>Retinal vessel diameters and risk of hypertension: the Rotterdam Study</article-title>. <source>Hypertension</source>. (<year>2006</year>) <volume>47</volume>:<fpage>189</fpage>&#x02013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1161/01.HYP.0000199104.61945.33</pub-id><pub-id pub-id-type="pmid">16380526</pub-id></citation></ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gishti</surname> <given-names>O</given-names></name> <name><surname>Jaddoe</surname> <given-names>VW</given-names></name> <name><surname>Felix</surname> <given-names>JF</given-names></name> <name><surname>Klaver</surname> <given-names>CC</given-names></name> <name><surname>Hofman</surname> <given-names>A</given-names></name> <name><surname>Wong</surname> <given-names>TY</given-names></name> <etal/></person-group>. <article-title>Retinal microvasculature and cardiovascular health in childhood</article-title>. <source>Pediatrics</source>. (<year>2015</year>) <volume>135</volume>:<fpage>678</fpage>&#x02013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1542/peds.2014-3341</pub-id><pub-id pub-id-type="pmid">30371260</pub-id></citation></ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>L</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Zeng</surname> <given-names>Q</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Bian</surname> <given-names>G</given-names></name></person-group>. <article-title>A hybrid deep segmentation network for fundus vessels via deep-learning framework</article-title>. <source>Neurocomputing</source>. (<year>2021</year>) <volume>448</volume>:<fpage>168</fpage>&#x02013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.03.085</pub-id></citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>S</given-names></name> <name><surname>Li</surname> <given-names>T</given-names></name> <name><surname>Kang</surname> <given-names>H</given-names></name> <name><surname>Li</surname> <given-names>N</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>K</given-names></name></person-group>. <article-title>L-Seg: an end-to-end unified framework for multi-lesion segmentation of fundus images</article-title>. <source>Neurocomputing</source>. (<year>2019</year>) <volume>349</volume>:<fpage>52</fpage>&#x02013;<lpage>63</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2019.04.019</pub-id></citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cheung</surname> <given-names>CYl</given-names></name> <name><surname>Zheng</surname> <given-names>Y</given-names></name> <name><surname>Hsu</surname> <given-names>W</given-names></name> <name><surname>Lee</surname> <given-names>ML</given-names></name> <name><surname>Lau</surname> <given-names>QP</given-names></name> <name><surname>Mitchell</surname> <given-names>P</given-names></name> <etal/></person-group>. <article-title>Retinal vascular tortuosity, blood pressure, and cardiovascular risk factors</article-title>. <source>Ophthalmology</source>. (<year>2011</year>) <volume>118</volume>:<fpage>812</fpage>&#x02013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1016/j.ophtha.2010.08.045</pub-id><pub-id pub-id-type="pmid">21146228</pub-id></citation></ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>X</given-names></name> <name><surname>Lu</surname> <given-names>Q</given-names></name> <name><surname>Yang</surname> <given-names>L</given-names></name> <name><surname>Hu</surname> <given-names>S</given-names></name> <name><surname>Chen</surname> <given-names>D</given-names></name> <name><surname>Hu</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>Quantization of fully convolutional networks for accurate biomedical image segmentation</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Salt Lake city, UT</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2018</year>). p. <fpage>8300</fpage>&#x02013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yan</surname> <given-names>Z</given-names></name> <name><surname>Yang</surname> <given-names>X</given-names></name> <name><surname>Cheng</surname> <given-names>KT</given-names></name></person-group>. <article-title>Joint segment-level and pixel-wise losses for deep learning based retinal vessel segmentation</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2018</year>) <volume>65</volume>:<fpage>1912</fpage>&#x02013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2018.2828137</pub-id><pub-id pub-id-type="pmid">29993396</pub-id></citation></ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Fu</surname> <given-names>H</given-names></name> <name><surname>Xu</surname> <given-names>Y</given-names></name> <name><surname>Lin</surname> <given-names>S</given-names></name> <name><surname>Wong</surname> <given-names>DWK</given-names></name> <name><surname>Liu</surname> <given-names>J</given-names></name></person-group>. <article-title>Deepvessel: retinal vessel segmentation via deep learning and conditional random field</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Athens</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>132</fpage>&#x02013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yan</surname> <given-names>Z</given-names></name> <name><surname>Yang</surname> <given-names>X</given-names></name> <name><surname>Cheng</surname> <given-names>KT</given-names></name></person-group>. <article-title>A three-stage deep learning model for accurate retinal vessel segmentation</article-title>. <source>Biomed Health Inf IEEE J</source>. (<year>2019</year>) <volume>23</volume>:<fpage>1427</fpage>&#x02013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2018.2872813</pub-id><pub-id pub-id-type="pmid">30281503</pub-id></citation></ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Katz</surname> <given-names>N</given-names></name> <name><surname>Goldbaum</surname> <given-names>M</given-names></name> <name><surname>Nelson</surname> <given-names>M</given-names></name> <name><surname>Chaudhuri</surname> <given-names>S</given-names></name></person-group>. <article-title>An image processing system for automatic retina diagnosis</article-title>. In: <source>Three-Dimensional Imaging and Remote Sensing Imaging. Vol. 902.</source> <publisher-loc>Los Angeles, CA</publisher-loc>: <publisher-name>International Society for Optics and Photonics</publisher-name>. (<year>1988</year>). p. <fpage>131</fpage>&#x02013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Spencer</surname> <given-names>T</given-names></name> <name><surname>Olson</surname> <given-names>JA</given-names></name> <name><surname>McHardy</surname> <given-names>KC</given-names></name> <name><surname>Sharp</surname> <given-names>PF</given-names></name> <name><surname>Forrester</surname> <given-names>JV</given-names></name></person-group>. <article-title>An image-processing strategy for the segmentation and quantification of microaneurysms in fluorescein angiograms of the ocular fundus</article-title>. <source>Comput Biomed Res</source>. (<year>1996</year>) <volume>29</volume>:<fpage>284</fpage>&#x02013;<lpage>302</lpage>. <pub-id pub-id-type="doi">10.1006/cbmr.1996.0021</pub-id><pub-id pub-id-type="pmid">8812075</pub-id></citation></ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Playout</surname> <given-names>C</given-names></name> <name><surname>Duval</surname> <given-names>R</given-names></name> <name><surname>Cheriet</surname> <given-names>F</given-names></name></person-group>. <article-title>A multitask learning architecture for simultaneous segmentation of bright and red lesions in fundus images</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Granada</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2018</year>). p. <fpage>101</fpage>&#x02013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>J</given-names></name> <name><surname>Banerjee</surname> <given-names>S</given-names></name> <name><surname>Grama</surname> <given-names>A</given-names></name> <name><surname>Scheirer</surname> <given-names>WJ</given-names></name> <name><surname>Chen</surname> <given-names>DZ</given-names></name></person-group>. <article-title>Neuron segmentation using deep complete bipartite networks</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Quebec, QC</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2017</year>). p. <fpage>21</fpage>&#x02013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-66185-8_3</pub-id></citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Z</given-names></name> <name><surname>Siddiquee</surname> <given-names>MMR</given-names></name> <name><surname>Tajbakhsh</surname> <given-names>N</given-names></name> <name><surname>Liang</surname> <given-names>J</given-names></name></person-group>. <article-title>Unet&#x0002B;&#x0002B;: A nested u-net architecture for medical image segmentation</article-title>. In: <source>Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support</source>. <publisher-loc>Granada</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2018</year>). p. <fpage>3</fpage>&#x02013;<lpage>11</lpage>. <pub-id pub-id-type="pmid">32613207</pub-id></citation></ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Bilinski</surname> <given-names>P</given-names></name> <name><surname>Prisacariu</surname> <given-names>V</given-names></name></person-group>. <article-title>Dense decoder shortcut connections for single-pass semantic segmentation</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Granada</publisher-loc> (<year>2018</year>) p. <fpage>6596</fpage>&#x02013;<lpage>605</lpage>.</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>H</given-names></name> <name><surname>Pan</surname> <given-names>Z</given-names></name> <name><surname>Cen</surname> <given-names>Q</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Chen</surname> <given-names>S</given-names></name></person-group>. <article-title>Multi-scale fully convolutional network for gland segmentation using three-class classification</article-title>. <source>Neurocomputing</source>. (<year>2020</year>) <volume>380</volume>:<fpage>150</fpage>&#x02013;<lpage>61</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2019.10.097</pub-id></citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Staal</surname> <given-names>J</given-names></name> <name><surname>Abr&#x000E0;moff</surname> <given-names>MD</given-names></name> <name><surname>Niemeijer</surname> <given-names>M</given-names></name> <name><surname>Viergever</surname> <given-names>MA</given-names></name> <name><surname>Van Ginneken</surname> <given-names>B</given-names></name></person-group>. <article-title>Ridge-based vessel segmentation in color images of the retina</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2004</year>) <volume>23</volume>:<fpage>501</fpage>&#x02013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2004.825627</pub-id><pub-id pub-id-type="pmid">15084075</pub-id></citation></ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hoover</surname> <given-names>A</given-names></name> <name><surname>Kouznetsova</surname> <given-names>V</given-names></name> <name><surname>Goldbaum</surname> <given-names>M</given-names></name></person-group>. <article-title>Locating blood vessels in retinal images by piecewise threshold probing of a matched filter response</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2000</year>) <volume>19</volume>:<fpage>203</fpage>&#x02013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1109/42.845178</pub-id><pub-id pub-id-type="pmid">10875704</pub-id></citation></ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fraz</surname> <given-names>MM</given-names></name> <name><surname>Remagnino</surname> <given-names>P</given-names></name> <name><surname>Hoppe</surname> <given-names>A</given-names></name> <name><surname>Uyyanonvara</surname> <given-names>B</given-names></name> <name><surname>Rudnicka</surname> <given-names>AR</given-names></name> <name><surname>Owen</surname> <given-names>CG</given-names></name> <etal/></person-group>. <article-title>An ensemble classification-based approach applied to retinal blood vessel segmentation</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2012</year>) <volume>59</volume>:<fpage>2538</fpage>&#x02013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2012.2205687</pub-id><pub-id pub-id-type="pmid">22736688</pub-id></citation></ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O</given-names></name> <name><surname>Fischer</surname> <given-names>P</given-names></name> <name><surname>Brox</surname> <given-names>T</given-names></name></person-group>. <source>U-Net: Convolutional Networks for Biomedical Image Segmentation</source>. <publisher-loc>Munich</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name> (<year>2015</year>).</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Niemeijer</surname> <given-names>M</given-names></name> <name><surname>Staal</surname> <given-names>J</given-names></name> <name><surname>van Ginneken</surname> <given-names>B</given-names></name> <name><surname>Loog</surname> <given-names>M</given-names></name> <name><surname>Abramoff</surname> <given-names>MD</given-names></name></person-group>. <article-title>Comparative study of retinal vessel segmentation methods on a new publicly available database</article-title>. In: <source>Medical imaging 2004: image processing. Vol. 5370. International Society for Optics and Photonics.</source> <publisher-loc>San Diego, CA</publisher-loc> (<year>2004</year>). p. <fpage>648</fpage>&#x02013;<lpage>56</lpage>.</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sinthanayothin</surname> <given-names>C</given-names></name> <name><surname>Boyce</surname> <given-names>JF</given-names></name> <name><surname>Cook</surname> <given-names>HL</given-names></name> <name><surname>Williamson</surname> <given-names>TH</given-names></name></person-group>. <article-title>Automated localisation of the optic disc, fovea, and retinal blood vessels from digital colour fundus images</article-title>. <source>Br J Ophthalmol</source>. (<year>1999</year>) <volume>83</volume>:<fpage>902</fpage>&#x02013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1136/bjo.83.8.902</pub-id><pub-id pub-id-type="pmid">10413690</pub-id></citation></ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Soares</surname> <given-names>JV</given-names></name> <name><surname>Leandro</surname> <given-names>JJ</given-names></name> <name><surname>Cesar</surname> <given-names>RM</given-names></name> <name><surname>Jelinek</surname> <given-names>HF</given-names></name> <name><surname>Cree</surname> <given-names>MJ</given-names></name></person-group>. <article-title>Retinal vessel segmentation using the 2-D Gabor wavelet and supervised classification</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2006</year>) <volume>25</volume>:<fpage>1214</fpage>&#x02013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2006.879967</pub-id><pub-id pub-id-type="pmid">16967806</pub-id></citation></ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rangayyan</surname> <given-names>RM</given-names></name> <name><surname>Ayres</surname> <given-names>FJ</given-names></name> <name><surname>Oloumi</surname> <given-names>F</given-names></name> <name><surname>Oloumi</surname> <given-names>F</given-names></name> <name><surname>Eshghzadeh-Zanjani</surname> <given-names>P</given-names></name></person-group>. <article-title>Detection of blood vessels in the retina with multiscale Gabor filters</article-title>. <source>J Electron Imaging</source>. (<year>2008</year>) <volume>17</volume>:<fpage>023018</fpage>. <pub-id pub-id-type="doi">10.1117/1.2907209</pub-id><pub-id pub-id-type="pmid">17645476</pub-id></citation></ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ricci</surname> <given-names>E</given-names></name> <name><surname>Perfetti</surname> <given-names>R</given-names></name></person-group>. <article-title>Retinal blood vessel segmentation using line operators and support vector classification</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2007</year>) <volume>26</volume>:<fpage>1357</fpage>&#x02013;<lpage>1365</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2007.898551</pub-id><pub-id pub-id-type="pmid">17948726</pub-id></citation></ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Franklin</surname> <given-names>SW</given-names></name> <name><surname>Rajan</surname> <given-names>SE</given-names></name></person-group>. <article-title>Retinal vessel segmentation employing ANN technique by Gabor and moment invariants-based features</article-title>. <source>Appl Soft Comput</source>. (<year>2014</year>) <volume>22</volume>:<fpage>94</fpage>&#x02013;<lpage>100</lpage>. <pub-id pub-id-type="doi">10.1016/j.asoc.2014.04.024</pub-id></citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Chen</surname> <given-names>Y</given-names></name> <name><surname>Bekkers</surname> <given-names>E</given-names></name> <name><surname>Wang</surname> <given-names>M</given-names></name> <name><surname>Dashtbozorg</surname> <given-names>B</given-names></name> <name><surname>ter Haar Romeny</surname> <given-names>BM</given-names></name></person-group>. <article-title>Retinal vessel delineation using a brain-inspired wavelet transform and random forest</article-title>. <source>Pattern Recognit</source>. (<year>2017</year>) <volume>69</volume>:<fpage>107</fpage>&#x02013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2017.04.008</pub-id></citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Long</surname> <given-names>J</given-names></name> <name><surname>Shelhamer</surname> <given-names>E</given-names></name> <name><surname>Darrell</surname> <given-names>T</given-names></name></person-group>. <article-title>Fully convolutional networks for semantic segmentation</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Boston, MA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2015</year>). p. <fpage>3431</fpage>&#x02013;<lpage>40</lpage>.</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>Z</given-names></name> <name><surname>Cheng</surname> <given-names>J</given-names></name> <name><surname>Fu</surname> <given-names>H</given-names></name> <name><surname>Zhou</surname> <given-names>K</given-names></name> <name><surname>Hao</surname> <given-names>H</given-names></name> <name><surname>Zhao</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>CE-Net: context encoder network for 2D medical image segmentation</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2019</year>) <volume>38</volume>:<fpage>2281</fpage>&#x02013;<lpage>2292</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2019.2903562</pub-id><pub-id pub-id-type="pmid">30843824</pub-id></citation></ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>D</given-names></name> <name><surname>Haytham</surname> <given-names>A</given-names></name> <name><surname>Pottenburgh</surname> <given-names>J</given-names></name> <name><surname>Saeedi</surname> <given-names>OJ</given-names></name> <name><surname>Tao</surname> <given-names>Y</given-names></name></person-group>. <article-title>Hard attention net for automatic retinal vessel segmentation</article-title>. <source>IEEE J Biomed Health Inf</source>. (<year>2020</year>) <volume>24</volume>:<fpage>3384</fpage>&#x02013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2020.3002985</pub-id><pub-id pub-id-type="pmid">32750941</pub-id></citation></ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>C</given-names></name> <name><surname>Szemenyei</surname> <given-names>M</given-names></name> <name><surname>Yi</surname> <given-names>Y</given-names></name> <name><surname>Zhou</surname> <given-names>W</given-names></name> <name><surname>Bian</surname> <given-names>H</given-names></name></person-group>. <article-title>Residual spatial attention network for retinal vessel segmentation</article-title>. In: <source>International Conference on Neural Information Processing</source>. <publisher-loc>San Diego, CA</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2020</year>). p. <fpage>509</fpage>&#x02013;<lpage>19</lpage>.</citation>
</ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Xu</surname> <given-names>X</given-names></name></person-group>. <article-title>Pyramid U-net for retinal vessel segmentation</article-title>. In: <source>ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source>. <publisher-loc>Toronto, ON</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2021</year>). p. <fpage>1125</fpage>&#x02013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Drozdzal</surname> <given-names>M</given-names></name> <name><surname>Vorontsov</surname> <given-names>E</given-names></name> <name><surname>Chartrand</surname> <given-names>G</given-names></name> <name><surname>Kadoury</surname> <given-names>S</given-names></name> <name><surname>Pal</surname> <given-names>C</given-names></name></person-group>. <article-title>The importance of skip connections in biomedical image segmentation</article-title>. In: <source>Deep Learning and Data Labeling for Medical Applications</source>. <publisher-loc>Athens</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>179</fpage>&#x02013;<lpage>87</lpage>.</citation>
</ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Jin</surname> <given-names>Y</given-names></name> <name><surname>Xu</surname> <given-names>J</given-names></name> <name><surname>Xu</surname> <given-names>X</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name></person-group>. <article-title>Mdu-net: Multi-scale densely connected u-net for biomedical image segmentation</article-title>. <source>arXiv preprint</source> arXiv:181200352. (<year>2018</year>).</citation>
</ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>H</given-names></name> <name><surname>Shi</surname> <given-names>J</given-names></name> <name><surname>Qi</surname> <given-names>X</given-names></name> <name><surname>Wang</surname> <given-names>X</given-names></name> <name><surname>Jia</surname> <given-names>J</given-names></name></person-group>. <article-title>Pyramid scene parsing network</article-title>. In: <source>IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Honolulu, HI</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>6230</fpage>&#x02013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>LC</given-names></name> <name><surname>Yang</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Xu</surname> <given-names>W</given-names></name> <name><surname>Yuille</surname> <given-names>AL</given-names></name></person-group>. <article-title>Attention to scale: Scale-aware semantic image segmentation</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> <publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2016</year>). p. <fpage>3640</fpage>&#x02013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y</given-names></name> <name><surname>Xia</surname> <given-names>Y</given-names></name> <name><surname>Song</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Cai</surname> <given-names>W</given-names></name></person-group>. <article-title>Multiscale network followed network model for retinal vessel segmentation</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Granada</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2018</year>). p. <fpage>119</fpage>&#x02013;<lpage>26</lpage>.</citation>
</ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Raza</surname> <given-names>SEA</given-names></name> <name><surname>Cheung</surname> <given-names>L</given-names></name> <name><surname>Epstein</surname> <given-names>D</given-names></name> <name><surname>Pelengaris</surname> <given-names>S</given-names></name> <name><surname>Khan</surname> <given-names>M</given-names></name> <name><surname>Rajpoot</surname> <given-names>NM</given-names></name></person-group>. <article-title>MIMO-Net: a multi-input multi-output convolutional neural network for cell segmentation in fluorescence microscopy images</article-title>. In: <source>2017 IEEE 14th International Symposium on Biomedical Imaging (ISBI 2017)</source>. <publisher-loc>Melbourne, VIC</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>337</fpage>&#x02013;<lpage>40</lpage>.</citation>
</ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Graham</surname> <given-names>S</given-names></name> <name><surname>Chen</surname> <given-names>H</given-names></name> <name><surname>Gamper</surname> <given-names>J</given-names></name> <name><surname>Dou</surname> <given-names>Q</given-names></name> <name><surname>Heng</surname> <given-names>PA</given-names></name> <name><surname>Snead</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>MILD-Net: Minimal information loss dilated network for gland instance segmentation in colon histology images</article-title>. <source>Med Image Anal</source>. (<year>2019</year>) <volume>52</volume>:<fpage>199</fpage>&#x02013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1016/j.media.2018.12.001</pub-id><pub-id pub-id-type="pmid">30594772</pub-id></citation></ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Ren</surname> <given-names>S</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name></person-group>. <article-title>Deep residual learning for image recognition</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2016</year>). p. <fpage>770</fpage>&#x02013;<lpage>8</lpage>.</citation></ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>J</given-names></name> <name><surname>Shen</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>G</given-names></name></person-group>. <article-title>Squeeze-and-excitation networks</article-title>. In: <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>. <publisher-loc>Salt Lake City, UT</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2018</year>). p. <fpage>7132</fpage>&#x02013;<lpage>41</lpage>.</citation>
</ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>S</given-names></name> <name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Ruan</surname> <given-names>C</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name></person-group>. <article-title>Multi-stage attention-unet for wireless capsule endoscopy image bleeding area segmentation</article-title>. In: <source>2019 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</source>. <publisher-loc>San Diego, CA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2019</year>). p. <fpage>818</fpage>&#x02013;<lpage>25</lpage>.</citation>
</ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Q</given-names></name> <name><surname>Feng</surname> <given-names>B</given-names></name> <name><surname>Xie</surname> <given-names>L</given-names></name> <name><surname>Liang</surname> <given-names>P</given-names></name> <name><surname>Zhang</surname> <given-names>H</given-names></name> <name><surname>Wang</surname> <given-names>T</given-names></name></person-group>. <article-title>A cross-modality learning approach for vessel segmentation in retinal images</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2015</year>) <volume>35</volume>:<fpage>109</fpage>&#x02013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2015.2457891</pub-id><pub-id pub-id-type="pmid">26208306</pub-id></citation></ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alom</surname> <given-names>MZ</given-names></name> <name><surname>Hasan</surname> <given-names>M</given-names></name> <name><surname>Yakopcic</surname> <given-names>C</given-names></name> <name><surname>Taha</surname> <given-names>TM</given-names></name> <name><surname>Asari</surname> <given-names>VK</given-names></name></person-group>. <article-title>Recurrent residual convolutional neural network based on u-net (r2u-net) for medical image segmentation</article-title>. <source>arXiv[Preprint].</source> arXiv:180206955. (<year>2018</year>) <pub-id pub-id-type="doi">10.1109/NAECON.2018.8556686</pub-id><pub-id pub-id-type="pmid">30944843</pub-id></citation></ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>S</given-names></name> <name><surname>Wang</surname> <given-names>K</given-names></name> <name><surname>Kang</surname> <given-names>H</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Gao</surname> <given-names>Y</given-names></name> <name><surname>Li</surname> <given-names>T</given-names></name></person-group>. <article-title>BTS-DSN: deeply supervised neural network with short connections for retinal vessel segmentation</article-title>. <source>Int J Med Inform</source>. (<year>2019</year>) <volume>126</volume>:<fpage>105</fpage>&#x02013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2019.03.015</pub-id><pub-id pub-id-type="pmid">31029251</pub-id></citation></ref>
<ref id="B49">
<label>49.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>W</given-names></name> <name><surname>Yu</surname> <given-names>S</given-names></name> <name><surname>Ma</surname> <given-names>K</given-names></name> <name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Ding</surname> <given-names>X</given-names></name> <name><surname>Zheng</surname> <given-names>Y</given-names></name></person-group>. <article-title>Multi-task neural networks with spatial activation for retinal vessel segmentation and artery/vein classification</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Shenzhen</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2019</year>). p. <fpage>769</fpage>&#x02013;<lpage>78</lpage>.</citation>
</ref>
<ref id="B50">
<label>50.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>B</given-names></name> <name><surname>Qiu</surname> <given-names>S</given-names></name> <name><surname>He</surname> <given-names>H</given-names></name></person-group>. <article-title>Dual encoding u-net for retinal vessel segmentation</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Shenzhen</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2019</year>). p. <fpage>84</fpage>&#x02013;<lpage>92</lpage>.</citation>
</ref>
<ref id="B51">
<label>51.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y</given-names></name> <name><surname>Xia</surname> <given-names>Y</given-names></name> <name><surname>Song</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>D</given-names></name> <name><surname>Liu</surname> <given-names>D</given-names></name> <name><surname>Zhang</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>Vessel-Net: retinal vessel segmentation under multi-path supervision</article-title>. In: <source>International Conference on Medical Image Computing and Computer-Assisted Intervention</source>. <publisher-loc>Shenzhen</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2019</year>). p. <fpage>264</fpage>&#x02013;<lpage>72</lpage>.</citation>
</ref>
<ref id="B52">
<label>52.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D</given-names></name> <name><surname>Dharmawan</surname> <given-names>DA</given-names></name> <name><surname>Ng</surname> <given-names>BP</given-names></name> <name><surname>Rahardja</surname> <given-names>S</given-names></name></person-group>. <article-title>Residual u-net for retinal vessel segmentation</article-title>. In: <source>2019 IEEE International Conference on Image Processing (ICIP)</source>. <publisher-loc>Taipei</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2019</year>). p. <fpage>1425</fpage>&#x02013;<lpage>9</lpage>.</citation></ref>
<ref id="B53">
<label>53.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>K</given-names></name> <name><surname>Zhang</surname> <given-names>X</given-names></name> <name><surname>Huang</surname> <given-names>S</given-names></name> <name><surname>Wang</surname> <given-names>Q</given-names></name> <name><surname>Chen</surname> <given-names>F</given-names></name></person-group>. <article-title>Ctf-net: retinal vessel segmentation via deep coarse-to-fine supervision network</article-title>. In: <source>2020 IEEE 17th International Symposium on Biomedical Imaging (ISBI)</source>. <publisher-loc>Iowa City, IA</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2020</year>). p. <fpage>1237</fpage>&#x02013;<lpage>41</lpage>.</citation>
</ref>
<ref id="B54">
<label>54.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roychowdhury</surname> <given-names>S</given-names></name> <name><surname>Koozekanani</surname> <given-names>DD</given-names></name> <name><surname>Parhi</surname> <given-names>KK</given-names></name></person-group>. <article-title>Iterative vessel segmentation of fundus images</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2015</year>) <volume>62</volume>:<fpage>1738</fpage>&#x02013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2015.2403295</pub-id><pub-id pub-id-type="pmid">25700436</pub-id></citation></ref>
<ref id="B55">
<label>55.</label>
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kassim</surname> <given-names>YM</given-names></name> <name><surname>Palaniappan</surname> <given-names>K</given-names></name></person-group>. <article-title>Extracting retinal vascular networks using deep learning architecture</article-title>. In: <source>2017 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</source>. <publisher-loc>Kansas City, MO</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>1170</fpage>&#x02013;<lpage>4</lpage>.</citation>
</ref>
<ref id="B56">
<label>56.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>Q</given-names></name> <name><surname>Meng</surname> <given-names>Z</given-names></name> <name><surname>Pham</surname> <given-names>TD</given-names></name> <name><surname>Chen</surname> <given-names>Q</given-names></name> <name><surname>Wei</surname> <given-names>L</given-names></name> <name><surname>Su</surname> <given-names>R</given-names></name></person-group>. <article-title>DUNet: A deformable network for retinal vessel segmentation</article-title>. <source>Knowl Based Syst</source>. (<year>2019</year>) <volume>178</volume>:<fpage>149</fpage>&#x02013;<lpage>62</lpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2019.04.025</pub-id></citation>
</ref>
<ref id="B57">
<label>57.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Dashtbozorg</surname> <given-names>B</given-names></name> <name><surname>Bekkers</surname> <given-names>E</given-names></name> <name><surname>Pluim</surname> <given-names>JP</given-names></name> <name><surname>Duits</surname> <given-names>R</given-names></name> <name><surname>ter Haar Romeny</surname> <given-names>BM</given-names></name></person-group>. <article-title>Robust retinal vessel segmentation via locally adaptive derivative frames in orientation scores</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2016</year>) <volume>35</volume>:<fpage>2631</fpage>&#x02013;<lpage>2644</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2016.2587062</pub-id><pub-id pub-id-type="pmid">27514039</pub-id></citation></ref>
<ref id="B58">
<label>58.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Orlando</surname> <given-names>JI</given-names></name> <name><surname>Prokofyeva</surname> <given-names>E</given-names></name> <name><surname>Blaschko</surname> <given-names>MB</given-names></name></person-group>. <article-title>A discriminatively trained fully connected conditional random field model for blood vessel segmentation in fundus images</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2016</year>) <volume>64</volume>:<fpage>16</fpage>&#x02013;<lpage>27</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2016.2535311</pub-id><pub-id pub-id-type="pmid">26930672</pub-id></citation></ref>
</ref-list>
</back>
</article>
