<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1526412</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2024.1526412</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Character-interested binary-like image learning for text image demoir&#xe9;ing</article-title>
<alt-title alt-title-type="left-running-head">Zhang et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2024.1526412">10.3389/fphy.2024.1526412</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Zhanpei</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2885521/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liang</surname>
<given-names>Beicheng</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ren</surname>
<given-names>Tingting</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fan</surname>
<given-names>Chengmiao</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Rui</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Mu</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2892703/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
</contrib-group>
<aff>
<institution>Department of Computer Science and Technology, Harbin Institute of Technology</institution>, <addr-line>Shenzhen</addr-line>, <addr-line>Guangdong</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1298136/overview">Zhiqin Zhu</ext-link>, Chongqing University of Posts and Telecommunications, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1131617/overview">Lingxiao Yang</ext-link>, Sun Yat-sen University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2893919/overview">Duo Chen</ext-link>, University of Electronic Science and Technology of China, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2894572/overview">Xixi Jia</ext-link>, Xidian University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Mu Li, <email>limu2022@hit.edu.cn</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>13</day>
<month>12</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1526412</elocation-id>
<history>
<date date-type="received">
<day>11</day>
<month>11</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>11</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Zhang, Liang, Ren, Fan, Li and Li.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Zhang, Liang, Ren, Fan, Li and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Although text image-based optical character recognition (OCR) methods have been applied to a wide range of applications, they suffer from performance degradation when the image is contaminated with moir&#xe9; patterns caused by interference between the display screen and the camera. To tackle this problem, we propose a novel network for text image demoir&#xe9;ing. Specifically, to support our study on text images, we collected a dataset of image pairs with and without moir&#xe9; patterns, dedicated to text image demoir&#xe9;ing. In addition, owing to the statistical differences in moir&#xe9; patterns across color channels, a multi-channel strategy is proposed, which roughly extracts the information associated with moir&#xe9; patterns and subsequently contributes to moir&#xe9; removal. Moreover, our goal for text images is to increase OCR accuracy, whereas the background pixels are insignificant. Instead of restoring all pixels as in natural images, a character attention module is introduced, allowing the network to pay more attention to the optical character-associated pixels and to achieve a consistent image style. As a result, characters can be more easily detected and more accurately recognized. Extensive experimental results on our dataset demonstrate the significance of our study and the superiority of our proposed method over state-of-the-art image restoration approaches. Specifically, the recall and F1-measure of recognition are increased from 56.32%/70.18% to 85.34%/89.36%.</p>
</abstract>
<kwd-group>
<kwd>multi-sensor imaging</kwd>
<kwd>deep learning</kwd>
<kwd>text image</kwd>
<kwd>demoir&#xe9;ing</kwd>
<kwd>multi-channel</kwd>
<kwd>moir&#xe9; pattern</kwd>
<kwd>optical character recognition</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Radiation Detectors and Imaging</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Owing to the huge number of text images, automatic text recognition from a given image has become quite necessary in recent years. Thanks to the techniques of optical character recognition (OCR) [<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B3">3</xref>], image-based text detection [<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>] and recognition [<xref ref-type="bibr" rid="B6">6</xref>] have been effectively improved and are widely applied to many applications, such as ID card recognition [<xref ref-type="bibr" rid="B7">7</xref>], table recognition [<xref ref-type="bibr" rid="B8">8</xref>], and license plate recognition [<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B10">10</xref>]. Although these methods have achieved satisfactory performance, they are highly sensitive to image quality. As displayed in <xref ref-type="fig" rid="F1">Figure 1</xref>, it is a common and inevitable phenomenon that the captured image is corrupted with diverse moir&#xe9; patterns due to interference between the display screen and the camera, resulting in significant performance degradation in both character detection and recognition. Thus, in this paper, we focus on removing moir&#xe9; patterns from text images for OCR.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>OCR on the images with <bold>(A)</bold> or without <bold>(B)</bold> moir&#xe9; patterns. Characters in green denote accurate recognition, and characters in red denote inaccurate recognition. Comparing <bold>(A)</bold> and <bold>(B)</bold>, the recognition accuracy on the text image is significantly influenced by the moir&#xe9; patterns.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g001.tif"/>
</fig>
<p>It is particularly challenging to remove moir&#xe9; patterns from photographs. Different from other corruptions, such as noise [<xref ref-type="bibr" rid="B11">11</xref>, <xref ref-type="bibr" rid="B12">12</xref>], rain [<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>], and haze [<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>], the moir&#xe9; pattern exhibits a diverse range of characteristics. Specifically, as shown in <xref ref-type="fig" rid="F1">Figure 1B</xref>, its colors, thickness, and shapes (stripes or ripples) vary even across different areas of a single photograph, and the frequency-domain analysis in [<xref ref-type="bibr" rid="B17">17</xref>] further demonstrates its complexity.</p>
<p>To restore the image, [<xref ref-type="bibr" rid="B18">18</xref>] proposed a convolutional neural network (CNN), in which a multi-resolution strategy is adopted to remove moir&#xe9; patterns across a wide range of frequencies. Inspired by this work, other studies [<xref ref-type="bibr" rid="B17">17</xref>, <xref ref-type="bibr" rid="B19">19</xref>&#x2013;<xref ref-type="bibr" rid="B22">22</xref>] have been proposed for image demoir&#xe9;ing. Although these works effectively obtain a moir&#xe9;-free image from the input, they are tailored to natural images, whose structures differ significantly from those of text images. Compared with natural images, the key information in text images is the optical characters. In other words, the purpose of text image demoir&#xe9;ing is to improve the accuracy of text recognition after restoration, which encourages us to pay more attention to the optical character-associated pixels. Thus, not only should the moir&#xe9; patterns be removed from the raw image, but the semantic structures of the optical characters should also be enhanced.</p>
<p>To achieve this goal, we propose the text image demoir&#xe9;ing network (TIDNet). Considering that the moir&#xe9; pattern in the G (green) channel is statistically weaker than that in the R (red) and B (blue) channels [<xref ref-type="bibr" rid="B17">17</xref>], moir&#xe9;-associated patterns are roughly but adaptively extracted by our rough moir&#xe9; pattern extraction module, regardless of whether the value scales of the R, G, and B channels are different or similar. Furthermore, we also propose a character attention module, allowing the network to pay particular attention to the optical characters for our OCR application. In detail, as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, under different viewpoints and capturing distances, the colors of moir&#xe9;-contaminated images captured from the same source image differ significantly, making complete recovery more difficult. In addition, if an image is covered by watermarks (<xref ref-type="fig" rid="F2">Figure 2A</xref>), it seems impossible to restore it from the contaminated images (<xref ref-type="fig" rid="F2">Figures 2B&#x2013;D</xref>) due to the information missing during image collection. Consequently, inaccurate background pixel estimation may even degrade performance. In fact, what we need is to improve the recognition accuracy: the greater the difference between the foreground and the background, the easier it is to detect and recognize the text. Thus, apart from image demoir&#xe9;ing, we further transform diverse image styles into a consistent version, in which the background pixels are white and the foreground characters are black. Thanks to this strategy, not only is the estimation of the complex background avoided, but the difference between the characters and the background pixels is also enlarged, contributing to both character detection and recognition. In addition, a mask strategy and a semantic measurement are jointly introduced, allowing our model to pay much more attention to the character-associated pixels.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>
<bold>(A)</bold> Image without moir&#xe9; patterns. <bold>(B&#x2013;D)</bold> Images with moir&#xe9; patterns, which are captured under different viewpoints and distances.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g002.tif"/>
</fig>
<p>In order to achieve moir&#xe9; pattern removal, a dataset is necessary. Therefore, we create a text image dataset named HITSZ-TID, which is composed of 3,739 pairs of images. Each pair consists of an image contaminated with moir&#xe9; patterns and its associated reference image without moir&#xe9; patterns. In particular, we capture the contaminated images with multiple devices, viewpoints, and distances, ensuring the diversity and generalization of our collected dataset.</p>
<p>The main contributions of this paper are as follows:<list list-type="simple">
<list-item>
<p>&#x2022; A text image demoir&#xe9;ing network (TIDNet) is specifically designed for text image demoir&#xe9;ing. Thanks to our proposed method, the recognition accuracy on text images contaminated with moir&#xe9; patterns is greatly improved: the recall and F1-measure of recognition are increased from 56.32%/70.18% to 85.34%/89.36%.</p>
</list-item>
<list-item>
<p>&#x2022; The rough moir&#xe9; pattern extraction module and the character attention module are jointly introduced into our TIDNet. Owing to the differences in moir&#xe9; patterns across channels, the moir&#xe9; is first removed roughly. Furthermore, textural and semantic character cues are also exploited, which are specifically tailored to text image moir&#xe9; removal.</p>
</list-item>
<list-item>
<p>&#x2022; A dataset, HITSZ-TID, dedicated to text image demoir&#xe9;ing is created. It consists of 3,739 image pairs, where each pair contains an image contaminated with moir&#xe9; patterns and its corresponding reference image free from moir&#xe9; patterns. This dataset fills the gap between OCR and image demoir&#xe9;ing, contributing to research in both fields.</p>
</list-item>
</list>
</p>
<p>The rest of this paper is organized as follows. In <xref ref-type="sec" rid="s2">Section 2</xref>, related works on image demoir&#xe9;ing and text image processing are briefly described. Our created dataset and proposed TIDNet are then introduced in <xref ref-type="sec" rid="s3">Section 3</xref> and <xref ref-type="sec" rid="s4">Section 4</xref>, respectively. To demonstrate the significance of text image demoir&#xe9;ing for OCR and the effectiveness of our proposed method, we conduct experiments in <xref ref-type="sec" rid="s5">Section 5</xref>, followed by the conclusion in <xref ref-type="sec" rid="s6">Section 6</xref>.</p>
</sec>
<sec id="s2">
<title>2 Related works</title>
<p>In this section, we briefly introduce the related works on image demoir&#xe9;ing and text image processing.</p>
<sec id="s2-1">
<title>2.1 Image demoir&#xe9;ing</title>
<p>Owing to the interference of different repetitive patterns, images contaminated with moir&#xe9; patterns are an inevitable phenomenon. In recent years, various methods have been proposed for moir&#xe9; pattern removal. By exploiting the prior assumption that moir&#xe9; patterns are dissimilar to textures, a low-rank and sparse matrix decomposition method [<xref ref-type="bibr" rid="B23">23</xref>] was developed to achieve demoir&#xe9;ing on high-frequency textures. Different from this hand-crafted feature-based method, [<xref ref-type="bibr" rid="B18">18</xref>] first utilized a CNN for moir&#xe9; image restoration. Considering that moir&#xe9; patterns span a wide range of resolution scales, multiple resolutions were jointly exploited in [<xref ref-type="bibr" rid="B18">18</xref>]. Following this work, [<xref ref-type="bibr" rid="B21">21</xref>] also presented a multi-scale feature enhancement network for moir&#xe9; image restoration. In addition, a coarse-to-fine strategy was presented in [<xref ref-type="bibr" rid="B24">24</xref>], which introduced another fine-scale network to refine the demoir&#xe9;d image obtained from the coarse-scale network. Moreover, instead of relying solely on real captured images like [<xref ref-type="bibr" rid="B18">18</xref>], [<xref ref-type="bibr" rid="B24">24</xref>] modeled the formation of moir&#xe9; patterns and generated a large-scale synthetic dataset. Furthermore, [<xref ref-type="bibr" rid="B20">20</xref>] proposed a learnable bandpass filter and a two-step tone-mapping strategy for moir&#xe9; pattern removal and color restoration, respectively. [<xref ref-type="bibr" rid="B25">25</xref>] constructed a moir&#xe9; removal and brightness improvement (MRBI) database using aligned moir&#xe9;-free and moir&#xe9; images and proposed a CNN with additive and multiplicative modules to transfer the low-light moir&#xe9; image to a bright moir&#xe9;-free image. Considering that the moir&#xe9; patterns are mainly located in the high-frequency domain, the wavelet transform was embedded into the network [<xref ref-type="bibr" rid="B26">26</xref>], in which the features represented by the wavelet transformation were then processed. To compensate for the domain gap between the training and testing sets, a domain adaptation mechanism was further exploited to fine-tune the output. Similarly, [<xref ref-type="bibr" rid="B27">27</xref>] also introduced a wavelet-based dual-branch network to separate the frequencies of moir&#xe9; patterns from the image content. By exploiting progressive feature fusion and channel-wise attention, the attentive fractal network was proposed in [<xref ref-type="bibr" rid="B28">28</xref>]. In addition, [<xref ref-type="bibr" rid="B29">29</xref>] proposed another attention network named C3Net, which focuses on channel, color, and concatenation. Different from these single-image demoir&#xe9;ing methods, multi-frame-based image demoir&#xe9;ing was also studied in [<xref ref-type="bibr" rid="B19">19</xref>].</p>
<p>Although a number of deep learning-based approaches have been proposed for moir&#xe9;-free image restoration, almost all of them are designed for natural images and are thus not particularly suitable for text images.</p>
</sec>
<sec id="s2-2">
<title>2.2 Text image processing</title>
<p>The quality of the text image has a key influence on the accuracy of OCR. To this end, several works on text image processing have been studied. For instance, several artificial filters were compared on low-resolution text images [<xref ref-type="bibr" rid="B30">30</xref>]. Subsequently, SRCNN [<xref ref-type="bibr" rid="B31">31</xref>] was applied to text image super-resolution [<xref ref-type="bibr" rid="B32">32</xref>]. To achieve scene text image super-resolution, [<xref ref-type="bibr" rid="B33">33</xref>] designed a text-oriented network, in which the sequential information and character boundaries were enhanced. In addition, in [<xref ref-type="bibr" rid="B34">34</xref>], the image was decomposed into the text, foreground, and background, which were beneficial for text boundary recovery and color restoration. Considering the text-specific properties, [<xref ref-type="bibr" rid="B35">35</xref>] utilized text-level layouts and character-level details for text image super-resolution. Apart from this super-resolution application, some deblurring approaches [<xref ref-type="bibr" rid="B36">36</xref>&#x2013;<xref ref-type="bibr" rid="B42">42</xref>] have also been proposed for text images. Specifically, [<xref ref-type="bibr" rid="B38">38</xref>] introduced a two-tone prior to estimate the kernel for image deblurring, and a deep neural network with sequential highway connections was exploited to restore the blurry image to a clear one. Furthermore, by constructing a text-specific hybrid dictionary, powerful contextual information was extracted for blind text image deblurring [<xref ref-type="bibr" rid="B39">39</xref>, <xref ref-type="bibr" rid="B43">43</xref>, <xref ref-type="bibr" rid="B44">44</xref>]. For text image detection and recognition, [<xref ref-type="bibr" rid="B45">45</xref>] proposed a mathematical model based on the Riesz fractional operator to enhance the details of edge information in license plate images, thereby improving performance. In addition, a method [<xref ref-type="bibr" rid="B46">46</xref>] for predicting hidden (masked) text parts was proposed to fill the gaps of non-transcribable parts in unstructured document OCR.</p>
<p>Although these methods were studied for text image super-resolution or deblurring, they are not suitable for the demoir&#xe9;ing application because of the much more complex distributions and structures of the moir&#xe9; patterns. Therefore, it is quite significant to propose a specific network for text image demoir&#xe9;ing. A related work named MsMa-Net [<xref ref-type="bibr" rid="B47">47</xref>] was proposed for moir&#xe9; removal in document images. However, our proposed method is quite different from that of [<xref ref-type="bibr" rid="B47">47</xref>]. In terms of the dataset, only 80 images were used for the construction of their dataset, whereas 551 images were used in ours, resulting in 3,739 pairs. Furthermore, we further take text priors, e.g., gradient, channel, and semantic information, into account, which contribute to our performance improvement in detection and recognition. In addition, although MsMa-Net also mentioned binarization of the output, it still first enforced the output to be the same as the reference image in the color version, followed by threshold processing to achieve binarization. By contrast, our proposed TIDNet directly transforms various inputs into a binary-like ground truth without any reference estimation, making it easier to remove the influences of diverse backgrounds and contributing to image reconstruction. Thus, this work will considerably benefit future research on text image processing and OCR.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Dataset</title>
<p>In this study, for training and testing purposes, we collect 3,739 pairs of moir&#xe9;-contaminated images and uncontaminated reference images to serve as a text image benchmark for moir&#xe9; pattern removal. Specifically, we download reference text images in Chinese or English from the internet, which are then used for capturing the contaminated images.</p>
<sec id="s3-1">
<title>3.1 Image capture</title>
<p>Similar to [<xref ref-type="bibr" rid="B18">18</xref>], each reference image is surrounded by a black border for alignment, which will be analyzed in Subsection 3.2. As displayed in <xref ref-type="fig" rid="F3">Figure 3</xref>, the image is first located in the center of the display screen, which is then captured using a mobile phone. Notably, the black border is always completely captured, and each photo is taken from a random distance or viewpoint, guaranteeing the diversity of the moir&#xe9; patterns.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Image alignment. The reference image and the captured image contaminated with moir&#xe9; patterns are aligned according to their corresponding corners: <bold>(A)</bold> reference image, <bold>(B)</bold> captured image, and <bold>(C)</bold> aligned image.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g003.tif"/>
</fig>
<p>To further enhance the diversity of our created dataset, we use a variety of mobile phones and monitor screens. <xref ref-type="table" rid="T1">Table 1</xref> lists the detailed information of the mobile phones and display screens used. Specifically, eight types of mobile phones and seven types of display screens are used for capturing images. Taking the other aforementioned variables, such as distances and viewpoints, into account, 3,739 pairs of images are obtained in total.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Detailed information of mobile phones and display screens for capturing images.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Mobile phone</th>
<th colspan="2" align="center">Display screen</th>
</tr>
<tr>
<th align="center">Model</th>
<th align="center">Model</th>
<th align="center">Resolution</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Huawei Mate 30 Pro</td>
<td align="center">AIDU LJ240S</td>
<td align="center">
<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mn>1920</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1080</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Redmi Note 11 Pro</td>
<td align="center">Redmi RMMNT238NF</td>
<td align="center">
<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mn>1920</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1080</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">iPhone 8 Plus</td>
<td align="center">Hanpon E2206</td>
<td align="center">
<inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mn>1920</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1080</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">VIVO X21S</td>
<td align="center">ThinkPad E450</td>
<td align="center">
<inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mn>1366</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>768</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Redmi MAX3</td>
<td align="center">ThinkPad E14</td>
<td align="center">
<inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mn>1920</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1080</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">iPhone 8</td>
<td align="center">ThinkPad E490</td>
<td align="center">
<inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mn>1920</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1080</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">Huawei Nova 5</td>
<td align="center">&#x2014;</td>
<td align="center">&#x2014;</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-2">
<title>3.2 Image alignment</title>
<p>To achieve the training phase in an end-to-end way, the contaminated image should be aligned with its corresponding reference image at the pixel level. Although [<xref ref-type="bibr" rid="B18">18</xref>, <xref ref-type="bibr" rid="B25">25</xref>] proposed corner or patch matching algorithms for image alignment, these automatic strategies still suffer from slight misalignment. Different from natural images, a misalignment of even several pixels would have a great influence on text image restoration. Thus, we manually detect the corresponding corners for text image alignment. As shown in <xref ref-type="fig" rid="F3">Figures 3A, B</xref>, four corners in the reference image and the contaminated image are detected, respectively, through which the geometric transformation between these two images is estimated. Finally, we obtain the aligned image with moir&#xe9; patterns, as displayed in <xref ref-type="fig" rid="F3">Figure 3C</xref>.</p>
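<p>As a concrete illustration, the warping step can be realized with a projective (homography) transformation estimated from the four corresponding corner pairs. The following Python sketch uses OpenCV; the function name and argument layout are illustrative assumptions, since the paper only states that the corners are annotated manually and a geometric transformation is then estimated.</p>
<preformat>
# A minimal alignment sketch, assuming OpenCV; the corner coordinates
# would come from the manual annotation described above.
import cv2
import numpy as np

def warp_to_reference(captured_bgr, cap_corners, ref_corners, ref_size):
    """Warp a captured moire-contaminated photo onto the reference image grid.

    cap_corners, ref_corners: 4x2 corner coordinates in the same order
    (e.g. top-left, top-right, bottom-right, bottom-left) of the black border.
    ref_size: (width, height) of the reference image.
    """
    src = np.asarray(cap_corners, dtype=np.float32)
    dst = np.asarray(ref_corners, dtype=np.float32)
    H = cv2.getPerspectiveTransform(src, dst)   # 3x3 projective transform
    return cv2.warpPerspective(captured_bgr, H, ref_size)
</preformat>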
</sec>
</sec>
<sec id="s4">
<title>4 Proposed method</title>
<p>The pipeline of our proposed method is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. There are two branches for moir&#xe9;-free image generation. From bottom to top, our proposed rough moir&#xe9; pattern extraction module and the three-channel network are first exploited to remove the moir&#xe9; pattern in a rough way. By combining the feature maps from this branch with the backbone network and introducing the character attention module, a more accurate moir&#xe9;-free image is generated. Notably, we follow [<xref ref-type="bibr" rid="B48">48</xref>] for the backbone, in which the original resolution subnetwork (ORS-Net), channel attention block (CAB), and supervised attention module (SAM) are utilized.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Pipeline of our proposed TIDNet. The rough moir&#xe9; pattern extraction module in the first branch is introduced to estimate the moir&#xe9;-free image in a simple but efficient way. By feeding feature maps from this branch into the second branch and taking the character attention module into account, a more accurate image without moir&#xe9; patterns is finally obtained. Notably, all blocks in our proposed method have different weights.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g004.tif"/>
</fig>
<sec id="s4-1">
<title>4.1 Rough moir&#xe9; pattern extraction module</title>
<p>According to [<xref ref-type="bibr" rid="B17">17</xref>], moir&#xe9; patterns are mainly shaped as curves and stripes, which exhibit specific properties. Obviously, extracting these properties, which differ from those of the reference image, would help to remove the moir&#xe9; patterns. Fortunately, similar to [<xref ref-type="bibr" rid="B17">17</xref>], we statistically find that by decomposing the contaminated image into R, G, and B (red, green, and blue) channels, the G channel suffers from much weaker moir&#xe9; patterns than the R and B channels, as displayed in <xref ref-type="fig" rid="F5">Figure 5</xref>. Of course, subtraction between the G channel and the R/B channel is a simple way to roughly obtain moir&#xe9;-associated information for image restoration. However, although different channels suffer from different moir&#xe9; patterns, they also exhibit different scales of values. In other words, one channel may have much larger or smaller values than the remaining one or two channels, rendering the aforementioned channel subtraction strategy useless. To tackle this problem, in this study, we introduce a learnable strategy through which the differences in value scales are adaptively alleviated.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Display of the transformed channels and their subtraction. Images in the second row denote the transformed R <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, G <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and B <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> channels, respectively. In the first row, &#x201c;R-G&#x201d; <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the subtraction between R and G, and &#x201c;B-G&#x201d; <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> denotes the subtraction between B and G.
</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g005.tif"/>
</fig>
<p>Mathematically, let the contaminated image be <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the height and width of <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively. By decomposing <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into the three channels, we can obtain <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> corresponding to the R, G, and B channels, respectively. By forwarding these three inputs into their associated convolution blocks, we can obtain<disp-formula id="e1">
<mml:math id="m20">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf20">
<mml:math id="m21">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf21">
<mml:math id="m22">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf22">
<mml:math id="m23">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>Conv</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the convolution blocks and <inline-formula id="inf23">
<mml:math id="m24">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>/<inline-formula id="inf24">
<mml:math id="m25">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>/<inline-formula id="inf25">
<mml:math id="m26">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The moir&#xe9; patterns can then be roughly extracted through<disp-formula id="e2">
<mml:math id="m27">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf27">
<mml:math id="m29">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are both extracted features associated with the moir&#xe9; patterns. In <xref ref-type="disp-formula" rid="e1">Equation 1</xref>, the value scales of the different channels are adaptively transformed into a consistent subspace through a task-driven strategy, so that the moir&#xe9; patterns can be roughly extracted and contribute to moir&#xe9;-free image generation. As shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, it is easy to observe that our presented technique indeed achieves superior extraction.</p>
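<p>To make this step concrete, the following PyTorch sketch implements Equations 1, 2: each channel passes through its own convolution block, and the transformed G channel is subtracted from the transformed R and B channels. The internal layer widths and kernel sizes are assumptions, as the paper only specifies one convolution block per channel with separate weights.</p>
<preformat>
# A minimal sketch of the rough moire pattern extraction (Equations 1-2).
import torch
import torch.nn as nn

class RoughMoireExtraction(nn.Module):
    def __init__(self, feats=16):
        super().__init__()
        def block():
            # One convolution block per channel (Conv^r, Conv^g, Conv^b in Eq. 1);
            # the two-layer structure and width are illustrative assumptions.
            return nn.Sequential(
                nn.Conv2d(1, feats, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(feats, 1, kernel_size=3, padding=1))
        self.conv_r, self.conv_g, self.conv_b = block(), block(), block()

    def forward(self, img):
        # img: (N, 3, H, W) tensor in RGB order
        r, g, b = img[:, 0:1], img[:, 1:2], img[:, 2:3]
        ie_r = self.conv_r(r)   # I_e^r in Eq. 1
        ie_g = self.conv_g(g)   # I_e^g
        ie_b = self.conv_b(b)   # I_e^b
        m_r = ie_r - ie_g       # M^r in Eq. 2
        m_b = ie_b - ie_g       # M^b in Eq. 2
        return m_r, m_b
</preformat>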
<p>In addition, since edges provide an additional prior for moir&#xe9;-contaminated images, we further apply the Sobel operator [<xref ref-type="bibr" rid="B49">49</xref>] to enhance the edge information of the three channels, as shown in the second row of <xref ref-type="fig" rid="F6">Figure 6</xref>. Similar to <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, the edge map of the &#x201c;G&#x201d; channel is subtracted from the other two edge maps via <xref ref-type="disp-formula" rid="e3">Equation 3</xref>.<disp-formula id="e3">
<mml:math id="m30">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>Sobel</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>Sobel</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>Sobel</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
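<p>A possible implementation of the edge cues in Equation 3 is sketched below in PyTorch, applying the standard 3 x 3 Sobel kernels to each channel and subtracting the G-channel edge map; using the gradient magnitude as the edge response is an assumption, since the paper does not specify the exact form of the Sobel output.</p>
<preformat>
# A sketch of the Sobel-based edge cues in Equation 3.
import torch
import torch.nn.functional as F

def sobel_magnitude(channel):
    # channel: (N, 1, H, W) float tensor; returns the Sobel gradient magnitude.
    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]],
                      device=channel.device).view(1, 1, 3, 3)
    ky = kx.transpose(2, 3)                      # standard Sobel-y kernel
    gx = F.conv2d(channel, kx, padding=1)
    gy = F.conv2d(channel, ky, padding=1)
    return torch.sqrt(gx ** 2 + gy ** 2 + 1e-12)

def edge_moire_cues(img):
    # img: (N, 3, H, W) tensor in RGB order
    e_r = sobel_magnitude(img[:, 0:1])           # E^r
    e_g = sobel_magnitude(img[:, 1:2])           # E^g
    e_b = sobel_magnitude(img[:, 2:3])           # E^b
    return e_r - e_g, e_b - e_g                  # M_e^r, M_e^b in Equation 3
</preformat>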
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Display of the edge maps of the three channels and their subtraction. Images in the second row denote the edge maps of the R, G, and B channels, respectively. In the first row, &#x201c;Edge R-G&#x201d; denotes the subtraction between Edge R and Edge G, and &#x201c;Edge B-G&#x201d; denotes the subtraction between Edge B and Edge G.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g006.tif"/>
</fig>
<p>After obtaining <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf32">
<mml:math id="m35">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf33">
<mml:math id="m36">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf34">
<mml:math id="m37">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, we then concatenate them into a single input, which is combined with the three channel inputs. As displayed in the middle part of <xref ref-type="fig" rid="F4">Figure 4</xref>, the concatenated inputs are forwarded into their corresponding convolution block and a U-Net-like network. By performing a further concatenation and taking the raw image <inline-formula id="inf35">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into account again, the output <inline-formula id="inf36">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is finally obtained through the supervised attention module [<xref ref-type="bibr" rid="B48">48</xref>]. By introducing the Charbonnier loss [<xref ref-type="bibr" rid="B50">50</xref>], <inline-formula id="inf37">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is obtained in a supervised way, as defined in <xref ref-type="disp-formula" rid="e4">Equation 4</xref>:<disp-formula id="e4">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where the constant <inline-formula id="inf38">
<mml:math id="m42">
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is empirically set to <inline-formula id="inf39">
<mml:math id="m43">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf40">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the ground-truth image (we will analyze it in the following <xref ref-type="sec" rid="s4-2">Subsection 4.2</xref>).</p>
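<p>For reference, a minimal PyTorch sketch of the Charbonnier loss in Equation 4 is given below, with the constant set to 1e-3 as stated in the text; whether the squared L2 norm is summed or averaged over pixels is an implementation assumption, as per-pixel variants are also common in the restoration literature.</p>
<preformat>
# A minimal sketch of the Charbonnier loss in Equation 4.
import torch

def charbonnier_loss(output, target, eps=1e-3):
    """Equation 4: a single square root over the squared L2 norm plus eps^2."""
    diff = output - target
    return torch.sqrt((diff ** 2).sum() + eps ** 2)
</preformat>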
</sec>
<sec id="s4-2">
<title>4.2 Character attention module</title>
<p>Different from natural image restoration, which focuses on all pixels equally, the purpose of our task is to increase the recognition accuracy after demoir&#xe9;ing. In other words, we focus on the characters rather than the surrounding background pixels. In fact, as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, some images indeed include quite complex backgrounds, such as watermarks and diverse colors. Strictly enforcing the restored images to be identical to these reference images with complex backgrounds is impractical. Therefore, in this paper, we first transform the reference image <inline-formula id="inf41">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> into a binary-like version <inline-formula id="inf42">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, where the pixel values of characters are all close to 0, while the others are all close to 255, as displayed in <xref ref-type="fig" rid="F7">Figure 7</xref>. The more distinguishable the characters, the better the performance. Thanks to the generation of the image <inline-formula id="inf43">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, not only can we transform the contaminated images into a consistent style regardless of whether the inputs have diverse backgrounds, but we can also increase the difference between the characters and the background, contributing to more accurate text detection and recognition.</p>
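<p>One plausible way to produce such a binary-like ground truth from a reference image is a global threshold, sketched below with OpenCV; the paper does not specify the exact binarization procedure, so Otsu thresholding is used here purely as an illustrative assumption.</p>
<preformat>
# A hypothetical sketch of generating the binary-like ground truth I_gt;
# the actual procedure used for the dataset is not detailed in the paper.
import cv2

def binary_like_ground_truth(ref_bgr):
    gray = cv2.cvtColor(ref_bgr, cv2.COLOR_BGR2GRAY)
    # Characters become close to 0, background close to 255.
    _, gt = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return gt
</preformat>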
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Display of the reference image with the complex background and the ground-truth image with the binary-like background.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g007.tif"/>
</fig>
<p>By forwarding <inline-formula id="inf44">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> through the backbone network, we can formulate the residual output <inline-formula id="inf45">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> [<xref ref-type="bibr" rid="B11">11</xref>] guided by <inline-formula id="inf46">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is shown as follows:<disp-formula id="e5">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>
<xref ref-type="disp-formula" rid="e5">Equation 5</xref> is used to encourage the reconstructed image to be similar to the ground-truth at the pixel level. Notably, feature maps obtained from SAM are also introduced into this <inline-formula id="inf47">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-related branch. For <inline-formula id="inf48">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is enforced to be similar to the ground-truth image <inline-formula id="inf49">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the feature maps from SAM would be beneficial for estimating <inline-formula id="inf50">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
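<p>For illustration, a minimal PyTorch sketch of the pixel-level loss in <xref ref-type="disp-formula" rid="e5">Equation 5</xref> is given below; the per-pixel averaging and the value of the stabilizing constant are assumptions made for the sketch rather than settings reported in this paper.</p>
<preformat>
# Charbonnier-style pixel loss of Equation 5 (illustrative sketch).
import torch

def charbonnier_loss(i_m: torch.Tensor, o_res: torch.Tensor,
                     i_gt: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
    diff = i_m + o_res - i_gt        # reconstruction error against I_gt
    return torch.sqrt(diff ** 2 + eps ** 2).mean()
</preformat>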
<p>In addition, to further allow our model to pay more attention to the character-associated pixels, we regard <inline-formula id="inf51">
<mml:math id="m56">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as the mask for the text image enhancement, which can be formulated as <xref ref-type="disp-formula" rid="e6">Equation 6</xref>:<disp-formula id="e6">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2299;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3b5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
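<p>A corresponding sketch of the masked loss in <xref ref-type="disp-formula" rid="e6">Equation 6</xref> is shown below; it assumes that the binary-like ground truth is scaled to [0, 1], so that the mask weight is close to 1 on character pixels and close to 0 on the background.</p>
<preformat>
# Masked, character-focused loss of Equation 6 (illustrative sketch).
# Assumes I_gt is scaled to [0, 1]: characters are dark, so (1 - I_gt) is
# close to 1 on characters and close to 0 on the background.
import torch

def mask_loss(i_m: torch.Tensor, o_res: torch.Tensor,
              i_gt: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
    diff = i_m + o_res - i_gt
    weight = 1.0 - i_gt
    return (weight * torch.sqrt(diff ** 2 + eps ** 2)).mean()
</preformat>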
<p>Of course, images restored from the contaminated input should be easily recognized by an OCR model. Therefore, to enforce the recovered text images to exhibit their corresponding semantic priors, a text semantic loss is further introduced. Particularly, CRNN [<xref ref-type="bibr" rid="B51">51</xref>] with its pre-trained weights is exploited. In this study, we use <inline-formula id="inf52">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ocr</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to denote the semantic evaluation on the recovered image, as defined in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>:<disp-formula id="e7">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ocr</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>OCR</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mtext>CRNN</mml:mtext>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">res</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>text</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where <inline-formula id="inf53">
<mml:math id="m60">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>text</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> refers to the ground-truth text annotation. Notably, the weights of CRNN are fixed, and the gradients are back-propagated to our designed network for model learning.</p>
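<p>A hedged sketch of the term in <xref ref-type="disp-formula" rid="e7">Equation 7</xref> is given below. The CRNN weights are frozen, and a CTC criterion against the ground-truth transcription is assumed for the OCR evaluation, since CRNN is conventionally trained with CTC; the exact criterion and output layout may differ from this sketch.</p>
<preformat>
# Text semantic loss of Equation 7 (illustrative sketch). The CTC criterion
# and the (T, N, num_classes) output layout of CRNN are assumptions.
import torch
import torch.nn as nn

def ocr_loss(crnn: nn.Module, restored: torch.Tensor,
             targets: torch.Tensor, target_lengths: torch.Tensor) -> torch.Tensor:
    for p in crnn.parameters():
        p.requires_grad_(False)                 # CRNN stays fixed
    log_probs = crnn(restored).log_softmax(2)   # assumed shape (T, N, classes)
    input_lengths = torch.full((restored.size(0),), log_probs.size(0),
                               dtype=torch.long)
    # Gradients still flow back through `restored` into our network.
    return nn.functional.ctc_loss(log_probs, targets, input_lengths,
                                  target_lengths)
</preformat>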
<p>Taking the aforementioned analysis into account, the objective function of our proposed method is formulated as <xref ref-type="disp-formula" rid="e8">Equation 8</xref>:<disp-formula id="e8">
<mml:math id="m61">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ocr</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b7;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf54">
<mml:math id="m62">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf55">
<mml:math id="m63">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf56">
<mml:math id="m64">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf57">
<mml:math id="m65">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are non-zero parameters that trade off these four terms.</p>
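<p>As a concrete illustration of <xref ref-type="disp-formula" rid="e8">Equation 8</xref>, the weighted combination can be written as follows, with the default weights anticipating the values reported in <xref ref-type="sec" rid="s4-3">Subsection 4.3</xref>.</p>
<preformat>
# Weighted objective of Equation 8 (illustrative sketch). l_0 denotes the
# remaining loss term of Equation 8 and is kept abstract here.
def total_loss(l_m, l_b, l_ocr, l_0,
               gamma=0.85, beta=0.5, lam=0.001, eta=0.5):
    return gamma * l_m + beta * l_b + lam * l_ocr + eta * l_0
</preformat>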
</sec>
<sec id="s4-3">
<title>4.3 Implementation details</title>
<p>We implement our TIDNet using PyTorch [<xref ref-type="bibr" rid="B52">52</xref>]. The model runs on two NVIDIA RTX 3090 GPUs with CUDA version 11.2. Except for the OCR-related network CRNN, we optimize our network with the Adam optimizer at an initial learning rate of <inline-formula id="inf58">
<mml:math id="m66">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:msup>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. In this study, we set the maximum number of epochs to 50. The learning rate is gradually reduced following a cosine annealing schedule, and its minimum value is <inline-formula id="inf59">
<mml:math id="m67">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
<mml:msup>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. In addition, the input image is resized to <inline-formula id="inf60">
<mml:math id="m68">
<mml:mrow>
<mml:mn>256</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, and the batch size is set to 12. Empirically, we disable <inline-formula id="inf61">
<mml:math id="m69">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> during the first 40 epochs and enable it thereafter. Referring to the parameters <inline-formula id="inf62">
<mml:math id="m70">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf63">
<mml:math id="m71">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf64">
<mml:math id="m72">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf65">
<mml:math id="m73">
<mml:mrow>
<mml:mi>&#x3b7;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we empirically set them to 0.85, 0.5, 0.001, and 0.5, respectively.</p>
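<p>A minimal PyTorch sketch of this optimization setup is given below; the model is a dummy placeholder, and only the hyper-parameters stated above (the Adam optimizer with cosine annealing of the learning rate over 50 epochs, the trade-off weights, and the mask loss enabled after epoch 40) follow the text.</p>
<preformat>
# Optimization setup of Subsection 4.3 (illustrative sketch). The model is a
# stand-in; TIDNet, the data pipeline, and the individual losses are omitted.
import torch
import torch.nn as nn

model = nn.Conv2d(3, 3, 3, padding=1)          # placeholder for TIDNet
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=50, eta_min=1e-4)
gamma, beta, lam, eta = 0.85, 0.5, 0.001, 0.5  # trade-off weights of Eq. 8

for epoch in range(50):
    # One epoch over 256x256 inputs with batch size 12 would run here;
    # the mask loss term is only added to the objective once epoch >= 40.
    scheduler.step()
</preformat>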
</sec>
</sec>
<sec id="s5">
<title>5 Experiments</title>
<p>To demonstrate the significance of text image demoir&#xe9;ing and the effectiveness of our proposed TIDNet, experiments are conducted on our collected dataset. In this section, the experimental settings and evaluation metrics are first described. We then conduct ablation studies to substantiate the importance of our introduced strategies. Finally, our proposed method is compared with state-of-the-art methods to further show its superiority.</p>
<sec id="s5-1">
<title>5.1 Experimental settings and evaluation metrics</title>
<p>In this study, we divide the dataset into two subsets: one for training and the other for testing. Specifically, 3,627 pairs are regarded as the training set, and 112 contaminated images are used as the testing set. Notably, the testing images contain a total of 43,152 characters.</p>
<p>Since the final purpose of our TIDNet is to improve OCR performance, we introduce recall, precision, and F1-measure (F1-m) scores as the quantitative evaluations for both text detection and recognition. Recall is the ratio between the number of correctly predicted characters and the number of labeled characters; it indicates how many items are correctly identified. Correspondingly, precision is the ratio between the number of correctly predicted characters and the number of all predicted characters. F1-m is the harmonic mean of recall and precision: <inline-formula id="inf66">
<mml:math id="m74">
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
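<p>As a worked example of these metrics, the short snippet below computes recall, precision, and F1-m from character counts; the counts themselves are made up purely for illustration.</p>
<preformat>
# Recall, precision, and F1-measure from character counts (illustrative only).
def detection_metrics(correct: int, labeled: int, predicted: int):
    recall = correct / labeled
    precision = correct / predicted
    f1 = 2 * recall * precision / (recall + precision)
    return recall, precision, f1

print(detection_metrics(correct=900, labeled=1000, predicted=950))
# (0.9, 0.947..., 0.923...)
</preformat>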
<p>Notably, most existing methods for natural image demoir&#xe9;ing and image denoising adopt the widely used quantitative metrics of peak signal-to-noise ratio (PSNR) and structural similarity (SSIM). However, they are not suitable for our task. In text image demoir&#xe9;ing, the contaminated images are enforced to be close to the binary-like ground-truths, and these references usually have imbalanced numbers of foreground and background pixels: the background generally covers a much larger area than the foreground. Because of this imbalance, PSNR and SSIM values can remain high even when some character-related pixels are erased, as long as the background is clean; in other words, the erased pixels have little influence on PSNR or SSIM. By contrast, the detection and recognition performance on images with erased characters degrades remarkably. Thus, in this paper, recall, precision, and F1-m are more reasonable for our task.</p>
</sec>
<sec id="s5-2">
<title>5.2 Ablation study</title>
<sec id="s5-2-1">
<title>5.2.1 Is text image demoir&#xe9;ing necessary?</title>
<p>Due to the contamination of moir&#xe9; patterns, it is difficult to detect and recognize characters in the text image. As tabulated in <xref ref-type="table" rid="T2">Table 2</xref>, the recall and F1-m on the contaminated images are only (53.82% and 69.77%) for detection and (56.32% and 70.18%) for recognition. However, thanks to our proposed TIDNet, these two metrics are dramatically improved to (92.92% and 95.73%) and (85.34% and 89.36%), respectively. Obviously, text image demoir&#xe9;ing is quite significant.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Ablation studies conducted on our collected dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Task</th>
<th colspan="3" align="center">Detection</th>
<th colspan="3" align="center">Recognition</th>
</tr>
<tr>
<th align="center">Metrics</th>
<th align="center">Recall</th>
<th align="center">Precision</th>
<th align="center">F1-measure</th>
<th align="center">Recall</th>
<th align="center">Precision</th>
<th align="center">F1-measure</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Moir&#xe9;</td>
<td align="center">53.82%</td>
<td align="center">99.15%</td>
<td align="center">69.77%</td>
<td align="center">56.32%</td>
<td align="center">93.10%</td>
<td align="center">70.18%</td>
</tr>
<tr>
<td align="center">Backbone</td>
<td align="center">24.88%</td>
<td align="center">98.99%</td>
<td align="center">39.77%</td>
<td align="center">24.38%</td>
<td align="center">80.54%</td>
<td align="center">37.43%</td>
</tr>
<tr>
<td align="center">Backbone &#x2b; Mask</td>
<td align="center">58.79%</td>
<td align="center">99.01%</td>
<td align="center">73.77%</td>
<td align="center">53.85%</td>
<td align="center">80.86%</td>
<td align="center">64.65%</td>
</tr>
<tr>
<td align="center">Backbone &#x2b; Mask &#x2b; Channel</td>
<td align="center">64.13%</td>
<td align="center">99.34%</td>
<td align="center">77.94%</td>
<td align="center">57.31%</td>
<td align="center">81.05%</td>
<td align="center">67.15%</td>
</tr>
<tr>
<td align="center">Backbone &#x2b; Mask &#x2b; Channel &#x2b; RMEM</td>
<td align="center">68.93%</td>
<td align="center">98.88%</td>
<td align="center">81.23%</td>
<td align="center">62.02%</td>
<td align="center">83.02%</td>
<td align="center">71.00%</td>
</tr>
<tr>
<td align="center">Backbone &#x2b; Mask &#x2b; Channel &#x2b; RMEM &#x2b; OCR</td>
<td align="center">
<bold>92.92%</bold>
</td>
<td align="center">98.72%</td>
<td align="center">
<bold>95.73%</bold>
</td>
<td align="center">
<bold>85.34%</bold>
</td>
<td align="center">
<bold>93.78%</bold>
</td>
<td align="center">
<bold>89.36%</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x201c;Moir&#xe9;&#x201d; denotes results on the raw image without any processing. &#x201c;Backbone&#x201d; denotes results obtained by the baseline network, which is guided by <inline-formula id="inf67">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. &#x201c;Backbone &#x2b; Mask&#x201d; denotes results by adding the mask loss <inline-formula id="inf68">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. &#x201c;Backbone &#x2b; Mask &#x2b; Channel&#x201d; denotes results by additionally introducing the three-channel network. &#x201c;Backbone &#x2b; Mask &#x2b; Channel &#x2b; RMEM&#x201d; denotes results by additionally introducing the rough moir&#xe9; extraction module (RMEM). Similarly, &#x201c;Backbone &#x2b; Mask &#x2b; Channel &#x2b; RMEM &#x2b; OCR&#x201d; denotes results by further adding the OCR semantic loss <inline-formula id="inf69">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ocr</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Notably, the best performance is highlighted by &#x201c;bold.&#x201d;</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5-2-2">
<title>5.2.2 Do the rough moir&#xe9; extraction module and three-channel network work?</title>
<p>Inspired by the specific property of moir&#xe9; patterns, the rough moir&#xe9; extraction module is first introduced to extract the edge information related to the moir&#xe9; patterns, and it is then followed by our three-channel network. In detail, <xref ref-type="table" rid="T2">Table 2</xref> shows that the three-channel network leads to significant improvements in recall and F1-measure. By further taking the rough moir&#xe9; extraction module into account, the performance continues to increase.</p>
</sec>
<sec id="s5-2-3">
<title>5.2.3 Does the character attention module work?</title>
<p>To enforce the network to focus on the characters of interest, the mask loss <inline-formula id="inf70">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and OCR loss <inline-formula id="inf71">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ocr</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are introduced into our proposed method. As listed in <xref ref-type="table" rid="T2">Table 2</xref>, these two losses contribute significantly to the performance improvement on both detection and recognition, yielding an increase of approximately 20%&#x2013;35%. Thus, specifically focusing on the character-related pixels and exploiting their semantic information is quite important. Notably, when only <inline-formula id="inf72">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is utilized, the experimental results are even inferior to those obtained from the raw data. Generally, character-associated pixels cover a much smaller area than background pixels, while <inline-formula id="inf73">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> pays equal attention to each pixel. In this case, even if the character estimation is incorrect, the influence on <inline-formula id="inf74">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> may be slight, rendering this loss ineffective. Fortunately, by exploiting the mask loss and OCR loss, the importance of characters is then enhanced.</p>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> displays visualizations corresponding to <xref ref-type="table" rid="T2">Table 2</xref>. It is clear that when the backbone is applied, it removes the moir&#xe9; patterns. However, it also regards optical characters as the moir&#xe9;, which only makes the recovered image smooth and erases many character-related pixels. By contrast, thanks to our mask loss, the model highly focuses on characters, enhancing their associated pixels, as shown in the first image in the second row. Nevertheless, as character-related pixels are quite similar to some edge information, which also exists in the moir&#xe9; patterns, the reconstructed image is also contaminated by moir&#xe9; patterns. Thus, we further introduce the three-channel-based strategy followed our rough moir&#xe9; extraction module (RMEM). Obviously, not just the moir&#xe9; patterns are alleviated, but the backgrounds are also much closer to the ground-truth compared with those obtained by &#x201c;B&#x201d; and &#x201c;B &#x2b; M.&#x201d; Despite the fact that &#x201c;B &#x2b; M &#x2b; C &#x2b; RMEM&#x201d; jointly enhances characters and removes moir&#xe9; patterns, some recovered characters, as displayed in the enlarged details, encounter inaccurate semantic information. Fortunately, thanks to our introduced OCR loss, characters are further restored according to their semantics.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Visualizations of ablation studies corresponding to <xref ref-type="table" rid="T2">Table 2</xref>.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g008.tif"/>
</fig>
</sec>
<sec id="s5-2-4">
<title>5.2.4 Does the binary-like ground-truth work?</title>
<p>In our proposed method, the reference image <inline-formula id="inf75">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with diverse backgrounds is transformed to the ground-truth image <inline-formula id="inf76">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, which is binary-like. In this way, the difference between foreground and background pixels is remarkably enlarged, allowing the network to detect and recognize characters more easily. The comparison of using <inline-formula id="inf77">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf78">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as the guidance is shown in <xref ref-type="table" rid="T3">Table 3</xref>, supporting the aforementioned analysis.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Results obtained by our proposed method guided by different reference images.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Task</th>
<th colspan="3" align="center">Detection</th>
</tr>
<tr>
<th align="center">Metrics</th>
<th align="center">Recall</th>
<th align="center">Precision</th>
<th align="center">F1-measure</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">TIDNet (color)</td>
<td align="center">88.38%</td>
<td align="center">99.12%</td>
<td align="center">93.44%</td>
</tr>
<tr>
<td align="center">TIDNet</td>
<td align="center">
<bold>92.92%</bold>
</td>
<td align="center">98.72%</td>
<td align="center">
<bold>95.73%</bold>
</td>
</tr>
</tbody>
</table>
<table>
<thead valign="top">
<tr>
<th align="center">Task</th>
<th colspan="3" align="center">Recognition</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">TIDNet (color)</td>
<td align="center">83.34%</td>
<td align="center">93.52%</td>
<td align="center">88.14%</td>
</tr>
<tr>
<td align="center">TIDNet</td>
<td align="center">
<bold>85.34%</bold>
</td>
<td align="center">
<bold>93.78%</bold>
</td>
<td align="center">
<bold>89.36%</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>&#x201c;TIDNet (color)&#x201d; and &#x201c;TIDNet&#x201d; denote that our proposed method is supervised by <inline-formula id="inf79">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with diverse backgrounds and <inline-formula id="inf80">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with the consistent background, respectively.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In addition, <xref ref-type="fig" rid="F9">Figure 9</xref> further demonstrates the significance of using the binary-like ground-truth image <inline-formula id="inf81">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as the guidance instead of the reference image <inline-formula id="inf82">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Generally, <inline-formula id="inf83">
<mml:math id="m91">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is corrupted with complex backgrounds such as colors and watermarking. In addition, the contaminated image may lose information during data collection, as shown in &#x201c;Moir&#xe9;&#x201d; in <xref ref-type="fig" rid="F9">Figure 9</xref>. Strictly enforcing the input to be identical to <inline-formula id="inf84">
<mml:math id="m92">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is too strict to achieve. As displayed in &#x201c;TIDNet (color)&#x201d; in <xref ref-type="fig" rid="F9">Figure 9</xref>, not only is the background of this recovered image significantly different from <inline-formula id="inf85">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold">I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">ref</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, but it also still contains some moir&#xe9; pattern-related contamination. By contrast, owing to the consistent style of the ground-truth image, our TIDNet achieves a much better visualization under its guidance.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Displays of demoir&#xe9;d images obtained by our TIDNet when the reference image and ground-truth image are, respectively, used as the supervised guidance.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g009.tif"/>
</fig>
</sec>
</sec>
<sec id="s5-3">
<title>5.3 Comparison with state-of-the-art methods</title>
<p>To further demonstrate the effectiveness of our proposed method for moir&#xe9; pattern removal, we conduct experiments comparing it with state-of-the-art methods, including AFN [<xref ref-type="bibr" rid="B28">28</xref>], WDNet [<xref ref-type="bibr" rid="B27">27</xref>], C3Net [<xref ref-type="bibr" rid="B29">29</xref>], DnCNN [<xref ref-type="bibr" rid="B11">11</xref>], and FFDNet [<xref ref-type="bibr" rid="B53">53</xref>]. Specifically, the first three methods are designed for image demoir&#xe9;ing, and the last two are designed for image restoration. For a fair comparison, we retrain all of them on our collected dataset using their released source code.</p>
<p>The quantitative results on detection and recognition are tabulated in <xref ref-type="table" rid="T4">Table 4</xref>. Obviously, our TIDNet dramatically outperforms these state-of-the-art methods. Compared with AFN, C3Net, and DnCNN, our results are much superior: their recall and F1-measure for text detection are all below 30% and 45%, respectively, whereas TIDNet surpasses them by more than 50 percentage points. Although FFDNet is slightly better than the aforementioned methods, it is still much inferior to TIDNet. In comparison with WDNet, our proposed method also achieves a noticeable performance enhancement.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Quantitative results on our collected dataset obtained by different comparison methods and TIDNet.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Task</th>
<th colspan="3" align="center">Detection</th>
</tr>
<tr>
<th align="center">Metrics</th>
<th align="center">Recall</th>
<th align="center">Precision</th>
<th align="center">F1-measure</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">AFN</td>
<td align="center">22.88%</td>
<td align="center">99.23%</td>
<td align="center">37.19%</td>
</tr>
<tr>
<td align="center">WDNet</td>
<td align="center">73.82%</td>
<td align="center">99.10%</td>
<td align="center">84.61%</td>
</tr>
<tr>
<td align="center">C3Net</td>
<td align="center">27.97%</td>
<td align="center">99.17%</td>
<td align="center">43.64%</td>
</tr>
<tr>
<td align="center">DnCNN</td>
<td align="center">22.86%</td>
<td align="center">99.23%</td>
<td align="center">37.16%</td>
</tr>
<tr>
<td align="center">FFDNet</td>
<td align="center">43.94%</td>
<td align="center">98.97%</td>
<td align="center">60.86%</td>
</tr>
<tr>
<td align="center">TIDNet</td>
<td align="center">
<bold>92.92%</bold>
</td>
<td align="center">98.72%</td>
<td align="center">
<bold>95.73%</bold>
</td>
</tr>
</tbody>
</table>
<table>
<thead valign="top">
<tr>
<th align="center">Task</th>
<th colspan="3" align="center">Recognition</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">AFN</td>
<td align="center">23.09%</td>
<td align="center">80.13%</td>
<td align="center">35.85%</td>
</tr>
<tr>
<td align="center">WDNet</td>
<td align="center">70.79%</td>
<td align="center">91.80%</td>
<td align="center">79.94%</td>
</tr>
<tr>
<td align="center">C3Net</td>
<td align="center">26.42%</td>
<td align="center">79.11%</td>
<td align="center">39.62%</td>
</tr>
<tr>
<td align="center">DnCNN</td>
<td align="center">25.14%</td>
<td align="center">82.33%</td>
<td align="center">38.52%</td>
</tr>
<tr>
<td align="center">FFDNet</td>
<td align="center">38.10%</td>
<td align="center">79.49%</td>
<td align="center">51.51%</td>
</tr>
<tr>
<td align="center">TIDNet</td>
<td align="center">
<bold>85.34%</bold>
</td>
<td align="center">
<bold>93.78%</bold>
</td>
<td align="center">
<bold>89.36%</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The comparison visualizations in Chinese and English are shown in <xref ref-type="fig" rid="F10">Figures 10</xref>, <xref ref-type="fig" rid="F11">11</xref>, respectively. It is easy to observe that, no matter whether the text images are in Chinese or English, our presented method produces much better visual results than existing image demoir&#xe9;ing and image restoration methods. For AFN and C3Net, although moir&#xe9; patterns are removed from the contaminated images, many character-related pixels are also erased, which significantly degrades text detection and recognition. The main reason is that these two methods regard the characters as moir&#xe9; patterns, since they share similar attributes. WDNet overcomes this problem; however, its recovered images are still corrupted by residual moir&#xe9; patterns. DnCNN suffers from a problem similar to that of AFN and C3Net. Although FFDNet obtains a better visualization than DnCNN, its reconstructed characters are blurred. Different from these comparison approaches, our proposed method not only efficiently erases moir&#xe9; patterns but also restores characters that are quite similar to the ground-truth.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Displays of demoir&#xe9;d images (in Chinese) obtained by different comparison methods and TIDNet.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g010.tif"/>
</fig>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Displays of demoir&#xe9;d images (in English) obtained by different comparison methods and TIDNet.</p>
</caption>
<graphic xlink:href="fphy-12-1526412-g011.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>To fill the gap between OCR and image demoir&#xe9;ing, in this paper, a text image dataset is first collected for text image demoir&#xe9;ing, allowing for supervised study. Furthermore, we propose a novel network named TIDNet, which is particularly tailored to text image demoir&#xe9;ing. Inspired by the specific priors of moir&#xe9; patterns, a rough moir&#xe9; extraction module followed by a three-channel network is introduced so that the moir&#xe9; pattern-associated information is easily extracted. Since our purpose is to improve the detection and recognition performance, a character attention module is also proposed in our TIDNet, through which the network pays close attention to character-associated pixels and their semantic information. As a result of the aforementioned strategies, our proposed method enjoys a dramatic performance improvement in the OCR application.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>ZZ: conceptualization, data curation, formal analysis, investigation, methodology, software, validation, writing&#x2013;original draft, writing&#x2013;review and editing, and visualization. BL: conceptualization, formal analysis, investigation, methodology, software, and writing&#x2013;original draft. TR: data curation, formal analysis, investigation, methodology, and writing&#x2013;original draft. CF: data curation, investigation, software, and writing&#x2013;original draft. RL: data curation, software, and writing&#x2013;original draft. ML: funding acquisition, project administration, resources, supervision, writing&#x2013;original draft, and writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This project was supported in part by the National Natural Science Foundation of China under Grant 62472124 and by the Shenzhen Colleges and Universities Stable Support Program under Grant GXWD20220811170130002.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s12">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Mori</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Nishida</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yamada</surname>
<given-names>H</given-names>
</name>
</person-group>. <source>Optical character recognition</source>. <publisher-name>John Wiley and Sons, Inc.</publisher-name> (<year>1999</year>).</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>MD</given-names>
</name>
<name>
<surname>Ueda</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Dynamics-based motion deblurring improves the performance of optical character recognition during fast scanning of a robotic eye</article-title>. <source>IEEE/ASME Trans Mechatronics</source> (<year>2018</year>) <volume>23</volume>:<fpage>491</fpage>&#x2013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.1109/tmech.2018.2791473</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Oracle recognition of oracle network based on ant colony algorithm</article-title>. <source>Front Phys</source> (<year>2021</year>) <volume>9</volume>:<fpage>768336</fpage>. <pub-id pub-id-type="doi">10.3389/fphy.2021.768336</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Mask-most net: mask approximation based multi-oriented scene text detection network</article-title>. In: <conf-name>2019 IEEE International Conference on Multimedia and Expo (ICME)</conf-name>; <conf-date>08-12 July 2019</conf-date>; <conf-loc>Shanghai, China</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2019</year>) p. <fpage>206</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>K</given-names>
</name>
<etal/>
</person-group> <article-title>Intervoxnet: a novel dual-modal audio-text fusion network for automatic and efficient depression detection from interviews</article-title>. <source>Front Phys</source> (<year>2024</year>) <volume>12</volume>:<fpage>1430035</fpage>. <pub-id pub-id-type="doi">10.3389/fphy.2024.1430035</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhan</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Esir: end-to-end scene text recognition via iterative image rectification</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on computer vision and pattern recognition</conf-name>. <comment>16-20 June 2019; Long Beach, CA, United States: IEEE</comment> (<year>2019</year>) p. <fpage>2059</fpage>&#x2013;<lpage>68</lpage>.</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Satyawan</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Pratama</surname>
<given-names>MO</given-names>
</name>
<name>
<surname>Jannati</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Muhammad</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Fajar</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Hamzah</surname>
<given-names>H</given-names>
</name>
<etal/>
</person-group> <article-title>Citizen id card detection using image processing and optical character recognition</article-title>. In: <conf-name>Journal of physics: Conference series</conf-name>, <volume>1235</volume>. <publisher-name>Bristol, United Kingdom: IOP Publishing</publisher-name> (<year>2019</year>), <fpage>012049</fpage>.</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Schreiber</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Agne</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wolf</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Dengel</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Ahmed</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Deepdesrt: deep learning for detection and structure recognition of tables in document images</article-title>. In: <conf-name>2017 14th IAPR international conference on document analysis and recognition (ICDAR)</conf-name>; <conf-date>09-15 November 2017</conf-date>; <conf-loc>Kyoto, Japan</conf-loc>, <volume>1</volume>. <publisher-name>IEEE</publisher-name> (<year>2017</year>) p. <fpage>1162</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1109/icdar.2017.192</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhuang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zha</surname>
<given-names>ZJ</given-names>
</name>
</person-group>. <article-title>Towards human-level license plate recognition</article-title>. In: <conf-name>Proceedings of the European Conference on computer vision</conf-name>. <comment> 8-14 September 2018</comment>. <publisher-name>Munich, Germany: ECCV</publisher-name> (<year>2018</year>) p. <fpage>306</fpage>&#x2013;<lpage>21</lpage>.</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Mazur</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Brain tumor segmentation in mri with multi-modality spatial information enhancement and boundary shape correction</article-title>. <source>Pattern Recognition</source> (<year>2024</year>) <volume>153</volume>:<fpage>110553</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2024.110553</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Beyond a Gaussian denoiser: residual learning of deep cnn for image denoising</article-title>. <source>IEEE Trans image Process</source> (<year>2017</year>) <volume>26</volume>:<fpage>3142</fpage>&#x2013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2017.2662206</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Deep image denoising with adaptive priors</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2022</year>) <volume>32</volume>:<fpage>5124</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2022.3149518</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Progressive image deraining networks: a better and simpler baseline</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>15-20 June 2019</conf-date>; <conf-loc>Long Beach, CA, USA</conf-loc> (<year>2019</year>) p. <fpage>3937</fpage>&#x2013;<lpage>46</lpage>.</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Sindagi</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>VM</given-names>
</name>
</person-group>. <article-title>Image de-raining using a conditional generative adversarial network</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2020</year>) <volume>30</volume>:<fpage>3943</fpage>&#x2013;<lpage>56</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2019.2920407</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Qu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Enhanced pix2pix dehazing network</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name> (<year>2019</year>) p. <fpage>8160</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>N</given-names>
</name>
</person-group>. <article-title>Tms-gan: a twofold multi-scale generative adversarial network for single image dehazing</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2022</year>) <volume>32</volume>:<fpage>2760</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2021.3097713</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>LY</given-names>
</name>
</person-group>. <article-title>Mop moire patterns using mopnet</article-title>. <source>Proc IEEE/CVF Int Conf Comput Vis</source> (<year>2019</year>) <fpage>2424</fpage>&#x2013;<lpage>32</lpage>.</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Moir&#xe9; photo restoration using multiresolution convolutional neural networks</article-title>. <source>IEEE Trans Image Process</source> (<year>2018</year>) <volume>27</volume>:<fpage>4160</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2018.2834737</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Nan</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Zong</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Mmdm: multi-frame and multi-scale for image demoir&#xe9;ing</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops</conf-name>; <conf-date>14-19 June 2020</conf-date>; <conf-loc>Seattle, WA</conf-loc> (<year>2020</year>) p. <fpage>434</fpage>&#x2013;<lpage>5</lpage>.</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Slabaugh</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Leonardis</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Image demoireing with learnable bandpass filters</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>13-19 June 2020</conf-date>; <conf-loc>Seattle, WA</conf-loc> (<year>2020</year>) p. <fpage>3636</fpage>&#x2013;<lpage>45</lpage>.</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Moir&#xe9; pattern removal with multi-scale feature enhancing network</article-title>. In: <conf-name>2019 IEEE International Conference on Multimedia and Expo Workshops (ICMEW)</conf-name>; <conf-date>08-12 July 2019</conf-date>; <conf-loc>Shanghai, China</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2019</year>) p. <fpage>240</fpage>&#x2013;<lpage>5</lpage>.</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>A moir&#xe9; removal method based on peak filtering and image enhancement</article-title>. <source>Mathematics</source> (<year>2024</year>) <volume>12</volume>:<fpage>846</fpage>. <pub-id pub-id-type="doi">10.3390/math12060846</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yue</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Moir&#xe9; pattern removal from texture images via low-rank and sparse matrix decomposition</article-title>. In: <conf-name>2015 Visual Communications and Image Processing (VCIP)</conf-name>; <conf-date>13-16 December 2015</conf-date>; <publisher-loc>Singapore</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2015</year>) p. <fpage>1</fpage>&#x2013;<lpage>4</lpage>.</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Shu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X</given-names>
</name>
</person-group>. <source>Demoir&#xe9;ing of camera-captured screen images using deep convolutional neural network</source> (<year>2018</year>) <comment>arXiv preprint arXiv:1804.03809</comment>.</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yue</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Recaptured screen image demoir&#xe9;ing</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2020</year>) <volume>31</volume>:<fpage>49</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2020.2969984</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>Deep wavelet network with domain adaptation for single image demoireing</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops</conf-name>; <conf-date>14-19 June 2020</conf-date>; <conf-loc>Seattle, WA, USA</conf-loc> (<year>2020</year>) p. <fpage>420</fpage>&#x2013;<lpage>1</lpage>.</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Slabaugh</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Leonardis</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>W</given-names>
</name>
<etal/>
</person-group> <article-title>Wavelet-based dual-branch network for image demoir&#xe9;ing</article-title>. In: <conf-name>Computer Vision&#x2013;ECCV 2020: 16th European Conference, Glasgow, UK, August 23&#x2013;28, 2020, Proceedings, Part XIII 16</conf-name>. <publisher-name>Springer</publisher-name> (<year>2020</year>) p. <fpage>86</fpage>&#x2013;<lpage>102</lpage>.</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Chu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Moir&#xe9; pattern removal via attentive fractal network</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops</conf-name>; <conf-date>14-19 June 2020</conf-date>; <conf-loc>Seattle, WA</conf-loc> (<year>2020</year>) p. <fpage>472</fpage>&#x2013;<lpage>3</lpage>.</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Nam</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jeong</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>C3net: demoir&#xe9;ing network attentive in channel, color and concatenation</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops</conf-name>; <conf-date>14-19 June 2020</conf-date>; <conf-loc>Seattle, WA</conf-loc> (<year>2020</year>) p. <fpage>426</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Mancas-Thillou</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Mirmehdi</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>An introduction to super-resolution text</article-title>. In: <source>Digital document processing</source>. <publisher-name>Springer</publisher-name> (<year>2007</year>) p. <fpage>305</fpage>&#x2013;<lpage>27</lpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Loy</surname>
<given-names>CC</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Image super-resolution using deep convolutional networks</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2015</year>) <volume>38</volume>:<fpage>295</fpage>&#x2013;<lpage>307</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2015.2439281</pub-id>
</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Loy</surname>
<given-names>CC</given-names>
</name>
<name>
<surname>Qiao</surname>
<given-names>Y</given-names>
</name>
</person-group>. <source>Boosting optical character recognition: a super-resolution approach</source> (<year>2015</year>) <comment>arXiv preprint arXiv:1506.02211</comment>.</citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C</given-names>
</name>
<etal/>
</person-group> <article-title>Scene text image super-resolution in the wild</article-title>. In: <conf-name>Computer Vision&#x2013;ECCV 2020: 16th European Conference, Glasgow, UK, August 23&#x2013;28, 2020</conf-name>. <publisher-name>Springer</publisher-name> (<year>2020</year>) p. <fpage>650</fpage>&#x2013;<lpage>66</lpage>.</citation>
</ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>TH</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Text image super-resolution by image matting and text label supervision</article-title>. In: <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)</conf-name>; <conf-date>16-17 June 2019</conf-date>; <conf-loc>Long Beach, CA, USA</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2019</year>) p. <fpage>1722</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Scene text telescope: text-focused scene image super-resolution</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>20-25 June 2021</conf-date>; <conf-loc>Nashville, TN, USA</conf-loc> (<year>2021</year>) p. <fpage>12026</fpage>&#x2013;<lpage>35</lpage>.</citation>
</ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mei</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Qiao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Deepdeblur: text image recovery from blur to sharp</article-title>. <source>Multimedia Tools Appl</source> (<year>2019</year>) <volume>78</volume>:<fpage>18869</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-019-7251-y</pub-id>
</citation>
</ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cho</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Text image deblurring using text-specific properties</article-title>. In: <conf-name>European Conference on Computer Vision</conf-name>; <conf-date>7-13 October 2012</conf-date>; <conf-loc>Florence, Italy</conf-loc>. <publisher-name>Springer</publisher-name> (<year>2012</year>) p. <fpage>524</fpage>&#x2013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Text image deblurring via two-tone prior</article-title>. <source>Neurocomputing</source> (<year>2017</year>) <volume>242</volume>:<fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2017.01.080</pub-id>
</citation>
</ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>Blind deblurring of text images using a text-specific hybrid dictionary</article-title>. <source>IEEE Trans Image Process</source> (<year>2019</year>) <volume>29</volume>:<fpage>710</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2019.2933739</pub-id>
</citation>
</ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>F</given-names>
</name>
<etal/>
</person-group> <article-title>Drpl: deep regression pair learning for multi-focus image fusion</article-title>. <source>IEEE Trans Image Process</source> (<year>2020</year>) <volume>29</volume>:<fpage>4816</fpage>&#x2013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2020.2976190</pub-id>
</citation>
</ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>From global to local: multi-patch and multi-scale contrastive similarity learning for unsupervised defocus blur detection</article-title>. <source>IEEE Trans Image Process</source> (<year>2023</year>) <volume>32</volume>:<fpage>1158</fpage>&#x2013;<lpage>69</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2023.3240856</pub-id>
</citation>
</ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y</given-names>
</name>
<etal/>
</person-group> <article-title>Layer-output guided complementary attention learning for image defocus blur detection</article-title>. <source>IEEE Trans Image Process</source> (<year>2021</year>) <volume>30</volume>:<fpage>3748</fpage>&#x2013;<lpage>63</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2021.3065171</pub-id>
</citation>
</ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Rethinking the effectiveness of objective evaluation metrics in multi-focus image fusion: a statistic-based approach</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2024</year>) <volume>46</volume>:<fpage>5806</fpage>&#x2013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2024.3367905</pub-id>
</citation>
</ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>A deep learning framework for infrared and visible image fusion without strict registration</article-title>. <source>Int J Comput Vis</source> (<year>2024</year>) <volume>132</volume>:<fpage>1625</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-023-01948-x</pub-id>
</citation>
</ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Raghunandan</surname>
<given-names>KS</given-names>
</name>
<name>
<surname>Shivakumara</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Jalab</surname>
<given-names>HA</given-names>
</name>
<name>
<surname>Ibrahim</surname>
<given-names>RW</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>GH</given-names>
</name>
<name>
<surname>Pal</surname>
<given-names>U</given-names>
</name>
<etal/>
</person-group> <article-title>Riesz fractional based model for enhancing license plate detection and recognition</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2018</year>) <volume>28</volume>:<fpage>2276</fpage>&#x2013;<lpage>88</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2017.2713806</pub-id>
</citation>
</ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Karthikeyan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>de Herrera</surname>
<given-names>AGS</given-names>
</name>
<name>
<surname>Doctor</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Mirza</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>An ocr post-correction approach using deep learning for processing medical reports</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2022</year>) <volume>32</volume>:<fpage>2574</fpage>&#x2013;<lpage>81</lpage>. <pub-id pub-id-type="doi">10.1109/tcsvt.2021.3087641</pub-id>
</citation>
</ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Multi-scale multi-attention network for moir&#xe9; document image binarization</article-title>. <source>Signal Process Image Commun</source> (<year>2021</year>) <volume>90</volume>:<fpage>116046</fpage>. <pub-id pub-id-type="doi">10.1016/j.image.2020.116046</pub-id>
</citation>
</ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zamir</surname>
<given-names>SW</given-names>
</name>
<name>
<surname>Arora</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Hayat</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>FS</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>MH</given-names>
</name>
<etal/>
</person-group> <article-title>Multi-stage progressive image restoration</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>20-25 June 2021</conf-date>; <conf-loc>Nashville, TN, USA</conf-loc> (<year>2021</year>) p. <fpage>14821</fpage>&#x2013;<lpage>31</lpage>.</citation>
</ref>
<ref id="B49">
<label>49.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kanopoulos</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Vasanthavada</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Baker</surname>
<given-names>RL</given-names>
</name>
</person-group>. <article-title>Design of an image edge detection filter using the Sobel operator</article-title>. <source>IEEE J Solid-State Circuits</source> (<year>1988</year>) <volume>23</volume>:<fpage>358</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1109/4.996</pub-id>
</citation>
</ref>
<ref id="B50">
<label>50.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Charbonnier</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Blanc-Feraud</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Aubert</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Barlaud</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Two deterministic half-quadratic regularization algorithms for computed imaging</article-title>. In: <conf-name>Proceedings of 1st International Conference on Image Processing</conf-name>; <conf-date>13-16 November 1994</conf-date>; <conf-loc>Austin, TX, USA</conf-loc>, <volume>2</volume>. <publisher-name>IEEE</publisher-name> (<year>1994</year>) p. <fpage>168</fpage>&#x2013;<lpage>72</lpage>.</citation>
</ref>
<ref id="B51">
<label>51.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2016</year>) <volume>39</volume>:<fpage>2298</fpage>&#x2013;<lpage>304</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2016.2646371</pub-id>
</citation>
</ref>
<ref id="B52">
<label>52.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Chintala</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>E</given-names>
</name>
<name>
<surname>DeVito</surname>
<given-names>Z</given-names>
</name>
<etal/>
</person-group> <source>Automatic differentiation in pytorch</source> (<year>2017</year>).</citation>
</ref>
<ref id="B53">
<label>53.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Zuo</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Ffdnet: toward a fast and flexible solution for cnn-based image denoising</article-title>. <source>IEEE Trans Image Process</source> (<year>2018</year>) <volume>27</volume>:<fpage>4608</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1109/tip.2018.2839891</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>