<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.72980.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Research Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Spatio-temporal deep learning model for distortion classification in laparoscopic video</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: awaiting peer review]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>AlDahoul</surname>
                        <given-names>Nouar</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-5522-0033</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Abdul Karim</surname>
                        <given-names>Hezerul</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-7613-4596</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ba Wazir</surname>
                        <given-names>Abdulaziz Saleh</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-1225-1723</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Toledo Tan</surname>
                        <given-names>Myles Joshua</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ahmad Fauzi</surname>
                        <given-names>Mohammad Faizal</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-5382-6269</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Faculty of Engineering, Multimedia University, Cyberjaya, Selangor, 63100, Malaysia</aff>
                <aff id="a2">
                    <label>2</label>YO-VIVO corporation, Bacolod City, 6100, Philippines</aff>
                <aff id="a3">
                    <label>3</label>Department of Natural Sciences, University of St. La Salle, Bacolod City, 6100, Philippines</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:nouar.aldahoul@live.iium.edu.my">nouar.aldahoul@live.iium.edu.my</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>5</day>
                <month>10</month>
                <year>2021</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2021</year>
            </pub-date>
            <volume>10</volume>
            <elocation-id>1010</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>28</day>
                    <month>9</month>
                    <year>2021</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2021 AlDahoul N et al.</copyright-statement>
                <copyright-year>2021</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/10-1010/pdf"/>
            <abstract>
                <p>
                    <bold>Background: </bold>Laparoscopy is a surgery performed in the abdomen without making large incisions in the skin and with the aid of a video camera, resulting in laparoscopic videos. The laparoscopic video is prone to various distortions such as noise, smoke, uneven illumination, defocus blur, and motion blur. One of the main components in the feedback loop of video enhancement systems is distortion identification, which automatically classifies the distortions affecting the videos and selects the video enhancement algorithm accordingly. This paper aims to address the laparoscopic video distortion identification problem by developing fast and accurate multi-label distortion classification using a deep learning model. Current deep learning solutions based on convolutional neural networks (CNNs) can address laparoscopic video distortion classification, but they learn only spatial information.</p>
                <p>
                    <bold>Methods: </bold>In this paper, utilization of both spatial and temporal features in a CNN-long short-term memory (CNN-LSTM) model is proposed as a novel solution to enhance the classification. First, pre-trained ResNet50 CNN was used to extract spatial features from each video frame by transferring representation from large-scale natural images to laparoscopic images. Next, LSTM was utilized to consider the temporal relation between the features extracted from the laparoscopic video frames to produce multi-label categories. A novel laparoscopic video dataset proposed in the ICIP2020 challenge was used for training and evaluation of the proposed method.</p>
                <p>
                    <bold>Results: </bold>The experiments conducted show that the proposed CNN-LSTM outperforms the existing solutions in terms of accuracy (85%) and F1-score (94.2%). Additionally, the proposed distortion identification model is able to run in real time with a low inference time (0.15 sec).</p>
                <p>
                    <bold>Conclusions: </bold>The proposed CNN-LSTM model is a feasible solution for distortion identification in laparoscopic videos.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>distortion classification</kwd>
                <kwd>convolutional neural network</kwd>
                <kwd>laparoscopic video</kwd>
                <kwd>long short-term memory</kwd>
                <kwd>multi-label classification</kwd>
                <kwd>spatio-temporal features</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1" xlink:href="http://dx.doi.org/10.13039/100012024">
                    <funding-source>Multimedia University</funding-source>
                </award-group>
                <funding-statement>This research project was funded by Multimedia University, Malaysia.</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec id="sec1" sec-type="intro">
            <title>Introduction</title>
            <p>Video quality assessment (VQA) in the medical field is an important task to achieve satisfactory conditions for medical imaging modalities like magnetic resonance imaging (MRI), computed tomography (CT) scans, and laparoscopy. VQA is composed of two stages: distortion classification and quality score evaluation. Laparoscopic surgery videos are prone to distortions that affect a surgeon&#x2019;s visibility and degrade the vision quality for robot-assisted surgery.
                <sup>
                    <xref ref-type="bibr" rid="ref1">1</xref>
                </sup>
            </p>
            <p>Laparoscopic videos are often affected by various types of distortions like noise, smoke, uneven illumination, and blur, which are all concomitant artifacts that arise from operating the laparoscopic surgical equipment.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> To enhance the distorted laparoscopic videos, most studies propose solutions that require troubleshooting the equipment.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>,
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> However, such solutions are time consuming and cannot guarantee high-quality laparoscopy every time.</p>
            <p>Recent studies have suggested the use of image or video enhancement methods like de-smoking for laparoscopic surgery,
                <sup>
                    <xref ref-type="bibr" rid="ref4">4</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref6">6</xref>
                </sup> and joint wavelet decomposition and binocular combination for endoscopic image enhancement.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> In this case, real-time detection of the types of distortion is important to decide which enhancement methods are appropriate to apply. Real-time distortion classification is a challenging task and few recent studies have addressed it using hand-crafted features.
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup> These existing image quality assessment methods, such as BIQI,
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup> DIIVINE
                <sup>
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup> and BRISQUE,
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> were based on non-generic classification and are considered domain-dependent tasks. In addition, a distortion-specific classification approach has been demonstrated.
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> This approach used a separate traditional feature method for each type of distortion.
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup>
            </p>
            <p>On the other hand, convolutional neural networks (CNNs) overcome the previous limitations and learn features automatically with the same CNN architecture to detect all types of distortions. This paper aims to address the challenge of distortion detection and produce a generic method for distortion classification in laparoscopic videos.</p>
            <p>Artificial neural networks (ANNs) have shown significant capability in overcoming the issue of distortion classification by extracting informative features from all kinds of distortions. CNNs are powerful and efficient in several image tasks including classification,
                <sup>
                    <xref ref-type="bibr" rid="ref13">13</xref>
                </sup> segmentation,
                <sup>
                    <xref ref-type="bibr" rid="ref14">14</xref>
                </sup> enhancement,
                <sup>
                    <xref ref-type="bibr" rid="ref15">15</xref>
                </sup> and retrieval.
                <sup>
                    <xref ref-type="bibr" rid="ref16">16</xref>
                </sup> Recently, CNNs have also been used in several studies on image distortion classification for various applications.
                <sup>
                    <xref ref-type="bibr" rid="ref17">17</xref>,
                    <xref ref-type="bibr" rid="ref18">18</xref>
                </sup> However, recurrent neural networks (RNNs), and specifically, long short-term memory (LSTM)
                <sup>
                    <xref ref-type="bibr" rid="ref19">19</xref>
                </sup> have not yet been investigated for distortion classification in video datasets. This paper aims to highlight the use of CNN-LSTM
                <sup>
                    <xref ref-type="bibr" rid="ref20">20</xref>
                </sup> to improve classification accuracy.</p>
            <p>In the context of distortion classification in laparoscopic surgery videos, a recent study has proposed the use of deep CNNs, such as ResNet, for distortion ranking.
                <sup>
                    <xref ref-type="bibr" rid="ref21">21</xref>
                </sup> This method achieved ranking accuracies of 83.3%, 84.7%, and 87.3% using ResNet18, ResNet34, and ResNet50, respectively. However, that work focused only on spatial features extracted from a collection of 20,000 images for image-level distortion ranking.</p>
            <p>Another very recent work transferred learning from a pre-trained ResNet50 CNN to laparoscopic video frames.
                <sup>
                    <xref ref-type="bibr" rid="ref22">22</xref>
                </sup> The spatial features extracted from ResNet50 were applied to four support vector machine classifiers (three binary and one 5-class) utilizing decision fusion to produce the final distortion lists.
                <sup>
                    <xref ref-type="bibr" rid="ref22">22</xref>
                </sup> Hence, this paper proposes to extract spatiotemporal features using CNN-LSTM for video-level distortion classification.</p>
            <p>The key contributions of this paper are:
                <list list-type="bullet">
                    <list-item>
                        <label>&#x2022;</label>
                        <p>Utilization of an RNN model such as LSTM with time series of CNN-based features extracted from the frames. To the best of our knowledge, this is the first paper that uses CNN-LSTM for non-reference distortion classification in laparoscopic videos.</p>
                    </list-item>
                    <list-item>
                        <label>&#x2022;</label>
                        <p>An evaluation and comparison between the proposed CNN-LSTM and existing solutions presented for the ICIP2020 challenge.</p>
                    </list-item>
                </list>
            </p>
            <p>This paper is structured as follows: 
                <italic toggle="yes">Methods</italic> describes the proposed method and the experiments including the dataset and the experimental setup. In 
                <italic toggle="yes">Results and discussion</italic>, the results of the proposed solution and the comparison with existing methods are presented and discussed. 
                <italic toggle="yes">Conclusions</italic> summarizes the significance of this work and opens doors for further improvement.</p>
        </sec>
        <sec id="sec2" sec-type="methods">
            <title>Methods</title>
            <sec id="sec3">
                <title>The proposed multi-label distortion classification with CNN-LSTM</title>
                <p>In this section, we describe the proposed methodology for distortion classification in laparoscopic videos. This task is formulated as a single multi-label classification problem, which can be transformed into multiple binary classification problems. In this scenario, each label (distortion) in the dataset is assigned a separate binary classifier, resulting in five binary classifiers in total. The block diagram of the proposed model is shown in 
                    <xref ref-type="fig" rid="f1">Figure 1</xref>.</p>
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>Figure 1. </label>
                    <caption>
                        <title>Illustration of the proposed multi-label distortion classification.</title>
                        <p>CNN, convolutional neural networks; LSTM, long short-term memory; AWGN, additive white Gaussian noise.</p>
                    </caption>
                    <graphic id="gr1" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/76597/24bc2823-4e04-4b3f-a2c2-10f3d98383f0_figure1.gif"/>
                </fig>
                <p>
                    <italic toggle="yes">Transfer learning with residual network</italic>
                </p>
                <p>Usually, very deep CNNs suffer from the gradient vanishing problem, which leads to a drop in accuracy.
                    <sup>
                        <xref ref-type="bibr" rid="ref23">23</xref>
                    </sup> To address this problem, the residual network (ResNet) was developed, utilizing skip connections instead of directly stacked layers.
                    <sup>
                        <xref ref-type="bibr" rid="ref23">23</xref>
                    </sup> ResNet is a well-known deep neural network with high generalization ability used for image recognition.
                    <sup>
                        <xref ref-type="bibr" rid="ref23">23</xref>
                    </sup> Residual networks have various versions with different numbers of layers, such as ResNet50 with 50 layers and over 23 million trainable parameters.</p>
                <p>The transfer learning approach consists of training deep CNNs like ResNet on a large-scale dataset such as ImageNet
                    <sup>
                        <xref ref-type="bibr" rid="ref24">24</xref>
                    </sup> and utilizing them with a novel small-scale dataset. In this paper, ResNet50
                    <sup>
                        <xref ref-type="bibr" rid="ref23">23</xref>
                    </sup> was transferred to the laparoscopic video dataset and utilized to extract spatial features from the video&#x2019;s frames. This CNN pre-trained on ImageNet
                    <sup>
                        <xref ref-type="bibr" rid="ref24">24</xref>
                    </sup> was used after removing the top layers. The input images were resized to 224 &#x00d7; 224 and the dimension of the extracted feature vector was 2048.</p>
                <p>
                    <italic toggle="yes">Classification with LSTM</italic>
                </p>
                <p>LSTM is a special type of RNN that is used for long-range sequence modeling.
                    <sup>
                        <xref ref-type="bibr" rid="ref19">19</xref>
                    </sup> LSTM has a memory cell, which acts as an accumulator of state information, supported by control gates. The advantage of this structure is that it solves the problem of gradient vanishing.
                    <sup>
                        <xref ref-type="bibr" rid="ref19">19</xref>
                    </sup> The CNN-LSTM network was found to capture spatiotemporal correlations better than fully connected LSTM, which is only powerful for temporal correlation.
                    <sup>
                        <xref ref-type="bibr" rid="ref20">20</xref>
                    </sup>
                </p>
                <p>In this paper, the spatial feature vector extracted from ResNet50 represents one laparoscopic frame. Additionally, the series of feature vectors extracted from a series of frames in one video was applied to a set of five LSTMs. This aims to map the video to two categories in each LSTM. For example, the first LSTM checks whether smoke distortion is present in a video and produces two classes: &#x201c;yes&#x201d; and &#x201c;no.&#x201d; The already-trained CNN was utilized after replacing the top layers with five LSTM classifiers to tune the parameters of the fully connected layers. In other words, each LSTM fits the extracted features and maps them to two categories: &#x201c;yes&#x201d; and &#x201c;no.&#x201d;</p>
                <p>The architecture of each LSTM consists of the following layers:
                    <list list-type="order">
                        <list-item>
                            <label>1)</label>
                            <p>Bidirectional LSTM with 64 nodes</p>
                        </list-item>
                        <list-item>
                            <label>2)</label>
                            <p>ReLU activation function</p>
                        </list-item>
                        <list-item>
                            <label>3)</label>
                            <p>Batch normalization</p>
                        </list-item>
                        <list-item>
                            <label>4)</label>
                            <p>Dropout with 0.2</p>
                        </list-item>
                        <list-item>
                            <label>5)</label>
                            <p>Fully connected layers with 64 nodes</p>
                        </list-item>
                        <list-item>
                            <label>6)</label>
                            <p>ReLU activation function</p>
                        </list-item>
                        <list-item>
                            <label>7)</label>
                            <p>Batch normalization</p>
                        </list-item>
                        <list-item>
                            <label>8)</label>
                            <p>Dropout with 0.2</p>
                        </list-item>
                        <list-item>
                            <label>9)</label>
                            <p>Fully connected layers with two nodes</p>
                        </list-item>
                        <list-item>
                            <label>10)</label>
                            <p>Softmax activation function</p>
                        </list-item>
                    </list>
                </p>
            </sec>
            <sec id="sec4">
                <title>Experiments</title>
                <p>
                    <italic toggle="yes">Datasets and experimental setup</italic>
                </p>
                <p>The dataset used in this paper is an extended version of the Laparoscopic Video Quality (LVQ) database.
                    <sup>
                        <xref ref-type="bibr" rid="ref8">8</xref>
                    </sup> The database contains 10 reference videos, each 10 seconds in length.
                    <sup>
                        <xref ref-type="bibr" rid="ref8">8</xref>
                    </sup> Each reference video is distorted by five different types of distortions with four different levels, resulting in a total of 200 videos. These videos were extracted from the Cholec80 dataset that comprises 80 different videos of cholecystectomy surgeries.
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup> The extracted videos were selected considering multiple variations of scene content. The resolution of the videos is 512 &#x00d7; 288 with a 16:9 aspect ratio and a frame rate of 25 fps.</p>
                <p>The extended version of the LVQ dataset was issued in the ICIP2020 challenge and includes 1000 laparoscopic videos divided into 800 videos for training and 200 videos for testing. The distortions include additive white Gaussian noise (AWGN), smoke, uneven illumination, defocus blur, and motion blur. The numbers of videos for each label or distortion are not balanced (300 videos with AWGN, 320 videos with smoke, 400 videos with uneven illumination, 160 videos with defocus blur, 80 videos with motion blur). The challenge in this dataset is that each video is affected by a single distortion or by multiple distortions and thus, the problem of distortion classification is formulated as a multi-label classification problem.</p>
                <p>The training and testing of the ResNet-LSTM model were carried out using 
                    <ext-link ext-link-type="uri" xlink:href="https://opencv.org/">OpenCV</ext-link> and 
                    <ext-link ext-link-type="uri" xlink:href="https://www.tensorflow.org/">TensorFlow</ext-link> frameworks and libraries on an NVIDIA GeForce GTX 1080 Ti GPU. The learning rate used to train the LSTM model was set to 0.001, the batch size was set to 8, and the number of epochs was set to 150. The minimization of the categorical crossentropy loss function was achieved using the Adam optimizer.</p>
            </sec>
        </sec>
        <sec id="sec5" sec-type="results|discussion">
            <title>Results and discussion</title>
            <p>To the best of our knowledge, no other papers have utilized this extended version of the laparoscopic video dataset (see the 
                <ext-link ext-link-type="uri" xlink:href="https://hipernav.eu/icip-2020-challenge-session/">challenge dashboard</ext-link>) for distortion classification. For this reason, we compared our approach with the best solutions presented in the ICIP2020 challenge, as shown in 
                <xref ref-type="table" rid="T1">Table 1</xref>.</p>
            <table-wrap id="T1" orientation="portrait" position="float">
                <label>Table 1. </label>
                <caption>
                    <title>Classification accuracy and F1-score of the proposed method and various baseline models.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Solution</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">F1-score (single + multi distortions)</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">F1-score (single-distortion)</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Accuracy</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>VGG16 + many fm + fc</bold>
                                <break/>
                                <bold>(Baseline)*</bold>
                                <sup>
                                    <bold>#</bold>
                                </sup>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">94.1%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">93.3%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">81.5%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>VGG16 + 5 fc</bold>
                                <break/>
                                <bold>(Baseline)*</bold>
                                <sup>
                                    <bold>#</bold>
                                </sup>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">93.3%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">90.7%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">78.0%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>(Baseline)*</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">91.5%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">88.0%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">76.5%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>(Baseline)*</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">85.4%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>98.7%</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">58.0%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>(Baseline)*</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">83.2%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">89.3%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">57.0%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>ResNet50-LSTM</bold>
                                <break/>
                                <bold>(Proposed)</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>94.2%</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">89.3%</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>85.0%</bold>
                            </td>
                        </tr>
                    </tbody>
                </table>
                <table-wrap-foot>
                    <p>
                        <bold>Data sources: * =</bold> 
                        <ext-link ext-link-type="uri" xlink:href="https://hipernav.eu/icip-2020-challenge-session/">
                            <bold>Challenge dashboard</bold>
                        </ext-link>
                        <bold>;</bold> 
                        <sup>
                            <bold>#</bold>
                        </sup> 
                        <bold>=</bold> 
                        <ext-link ext-link-type="uri" xlink:href="https://drive.google.com/file/d/1QI4-8vzjlcq6tsBYzDv7LWDtN8PeTZCB/view">
                            <bold>Challenge presentation event</bold>
                        </ext-link>
                        <bold>.</bold>
                    </p>
                </table-wrap-foot>
            </table-wrap>
            <p>The description of the baseline solutions was given by winners in the 
                <ext-link ext-link-type="uri" xlink:href="https://drive.google.com/file/d/1QI4-8vzjlcq6tsBYzDv7LWDtN8PeTZCB/view">ICIP2020 challenge presentation event</ext-link>. One of the solutions was based on using a VGG16 CNN
                <sup>
                    <xref ref-type="bibr" rid="ref26">26</xref>
                </sup> to extract features. The feature vector was applied to the fully connected neural network that included two hidden layers with 4096 nodes, two batch normalization layers, and two dropout layers. On the other hand, another solution used a deep multi-task learning model. It included one shared VGG-based feature extraction block and five independent binary classifiers (one for each distortion type). Each classifier had two fully connected layers with 512 nodes and one node in the output layer with a sigmoid activation function 
                (<ext-link ext-link-type="uri" xlink:href="https://drive.google.com/file/d/1QI4-8vzjlcq6tsBYzDv7LWDtN8PeTZCB/view">ICIP2020 challenge presentation event</ext-link>). The description of the other baseline solutions was not presented, but their results were shown in the 
                <ext-link ext-link-type="uri" xlink:href="https://hipernav.eu/icip-2020-challenge-session/">challenge dashboard</ext-link>.</p>
            <p>The performance of the proposed methodology was evaluated in terms of classification accuracy, F1-score of single distortion, and F1-score of single and multiple distortions as shown in 
                <xref ref-type="table" rid="T1">Table 1</xref>. It can be observed that the proposed ResNet50-LSTM achieved the best accuracy of 85.0%, while the baseline methods yielded accuracies between 57.0% and 81.5%. Additionally, ResNet50-LSTM yielded the best F1-score for single and multiple distortions (94.2%), while the baseline methods yielded F1-scores between 83.2% and 94.1%. Furthermore, it is clear that the performance of our method for multiple distortions outperforms that for single distortion, which still has room for improvement.</p>
            <p>
                <xref ref-type="fig" rid="f2">Figure 2</xref> shows the confusion matrix for each distortion category produced from each LSTM. The LSTMs were able to correctly classify 58 out of 60, 46 out of 50, 94 out of 95, and 88 out of 95 videos for AWGN, defocus blur, smoke, and uneven illumination, respectively. On the other hand, the motion blur LSTM gave the worst classification performance, with 29 correctly classified videos out of 45. The reason for this drop was that motion blur has the smallest number of samples, only 80 videos. The performance of the motion blur LSTM could be improved significantly by having more samples affected by motion blur distortion. The performance metrics of the proposed method for each class are shown in 
                <xref ref-type="table" rid="T2">Table 2</xref>.</p>
            <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                <label>Figure 2. </label>
                <caption>
                    <title>Confusion matrix of a) additive white Gaussian noise, b) defocus blur, c) motion blur, d) smoke, e) uneven illumination.</title>
                </caption>
                <graphic id="gr2" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/76597/24bc2823-4e04-4b3f-a2c2-10f3d98383f0_figure2.gif"/>
            </fig>
            <table-wrap id="T2" orientation="portrait" position="float">
                <label>Table 2. </label>
                <caption>
                    <title>Performance metrics of the proposed method for each class in the laparoscopic dataset.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Distortion</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Accuracy %</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Recall %</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Precision %</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">F1-score %</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">FNR %</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">FPR %</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>AWGN noise</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">97.5</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">96.66</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">95.08</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">95.86</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">3.33</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">2.14</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>Defocus blur</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">97.0</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">92.0</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">95.83</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">93.88</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">8.0</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">1.33</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>Motion blur</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">91.0</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">64.44</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">93.55</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">76.31</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">35.56</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">1.29</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>Smoke</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">98.5</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">98.95</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">97.92</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">98.43</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">1.05</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">1.90</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>Uneven illumination</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">96.5</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">92.63</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">100</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">96.17</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">7.37</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">0</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="middle">
                                <bold>Average</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">96.1</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">88.94</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">96.48</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">92.13</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">11.06</td>
                            <td align="left" colspan="1" rowspan="1" valign="middle">1.33</td>
                        </tr>
                    </tbody>
                </table>
                <table-wrap-foot>
                    <p>AWGN, additive white Gaussian noise; FNR, false negative rate; FPR, false positive rate.</p>
                </table-wrap-foot>
            </table-wrap>
            <p>The proposed ResNet50-LSTM was able to run under real-time conditions. The inference time was 0.05 seconds to extract features from one frame using ResNet50. The features extracted from one frame were added to the features of other frames to be applied to the LSTM. The inference time for five LSTMs to produce the five distortion classes was 0.1 seconds. In summary, the proposed model updates the distortion categories every 0.15 seconds and achieves high speed performance.</p>
        </sec>
        <sec id="sec6" sec-type="conclusions">
            <title>Conclusions</title>
            <p>In this paper, a novel strategy of distortion classification was proposed. A multi-label spatiotemporal deep model, including a pre-trained deep CNN of ResNet50 and five LSTMs, was used to address the problem of single and multiple distortion classification. The proposed model was tested with a laparoscopic video dataset and the results were promising. It was found that our model outperformed existing solutions in terms of accuracy by 3.5 percentage points and yielded the best F1-score for single and multiple distortions. In future work, we intend to further enhance the performance by tuning more layers of pre-trained CNN with laparoscopic images affected by distortions to learn more informative features. This step requires collecting a large number of images to achieve promising improvements. Additionally, more recent CNN architectures such as EfficientNet
                <sup>
                    <xref ref-type="bibr" rid="ref27">27</xref>
                </sup> and DeiT (Data-efficient Image Transformers)
                <sup>
                    <xref ref-type="bibr" rid="ref28">28</xref>
                </sup> models are good candidates for extracting informative features. In this paper, the proposed solution only classifies the laparoscopic distortions into five categories. Hence, in future work, we plan to rank each category of distortion in terms of distortion intensity, which is a more challenging matter.</p>
        </sec>
        <sec id="sec7">
            <title>Data availability</title>
            <sec id="sec8">
                <title>Underlying data</title>
                <p>The datasets used in this work were used for the ICIP 2020 challenge and created by researchers from Universit&#x00e9; Sorbonne Paris Nord, France; Norwegian University of Science and Technology, Norway; and Oslo University Hospital, Norway. The datasets are publicly available under a CC-BY-NC-SA 4.0 license from 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/zakopz/icip2020-lvq-challenge">https://github.com/zakopz/icip2020-lvq-challenge</ext-link>.</p>
                <p>
                    <italic toggle="yes">This dataset was not generated nor is it owned by the authors of this article; the listed owners are Universit&#x00e9; Sorbonne Paris Nord, France; Norwegian University of Science and Technology, Norway; and Oslo University Hospital, Norway. Therefore, neither the authors nor F1000Research are responsible for the content of this dataset and cannot provide information about data collection. As this dataset contains potentially identifying images/information, caution is advised when using this dataset in future research.</italic>
                </p>
            </sec>
        </sec>
    </body>
    <back>
        <ref-list>
            <title>References</title>
            <ref id="ref1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>S&#x00e1;nchez-Gonz&#x00e1;lez</surname>
                            <given-names>P</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Laparoscopic video analysis for training and image-guided surgery.</article-title>
                    <source>

                        <italic toggle="yes">Minim Invasive Ther Allied Technol.</italic>
</source>
                    <year>2011</year>;<volume>20</volume>(<issue>6</issue>):<fpage>311</fpage>&#x2013;<lpage>320</lpage>.
                    <pub-id pub-id-type="pmid">21247251</pub-id>
                    <pub-id pub-id-type="doi">10.3109/13645706.2010.541921</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Verdaasdonk</surname>
                            <given-names>EGG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Stassen</surname>
                            <given-names>LPS</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Elst</surname>
                            <given-names>M</given-names>
                            <prefix>van der</prefix>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Problems with technical equipment during laparoscopic surgery: An observational study.</article-title>
                    <source>

                        <italic toggle="yes">Surg Endosc.</italic>
</source>
                    <year>2007</year>;<volume>21</volume>(<issue>2</issue>):<fpage>275</fpage>&#x2013;<lpage>279</lpage>.
                    <pub-id pub-id-type="pmid">17122973</pub-id>
                    <pub-id pub-id-type="doi">10.1007/s00464-006-0019-2</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Siddaiah-Subramanya</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nyandowe</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tiang</surname>
                            <given-names>KW</given-names>
                        </name>
</person-group>:
                    <article-title>Technical problems during laparoscopy: A systematic method of troubleshooting for surgeons.</article-title>
                    <source>

                        <italic toggle="yes">Innov Surg Sci.</italic>
</source>
                    <year>2017</year>;<volume>2</volume>(<issue>4</issue>):<fpage>233</fpage>&#x2013;<lpage>237</lpage>.
                    <pub-id pub-id-type="pmid">31579756</pub-id>
                    <pub-id pub-id-type="doi">10.1515/iss-2017-0031</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6754030</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>C</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cheikh</surname>
                            <given-names>FA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kaaniche</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A smoke removal method for laparoscopic images.</article-title>
                    <source>

                        <italic toggle="yes">arXiv.</italic>
</source>
                    <year>2018</year>:<fpage>6</fpage>&#x2013;<lpage>10</lpage>.</mixed-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Venkatesh</surname>
                            <given-names>V</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sharma</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Srivastava</surname>
                            <given-names>V</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Unsupervised smoke to desmoked laparoscopic surgery images using contrast driven Cyclic-DesmokeGAN.</article-title>
                    <source>

                        <italic toggle="yes">Comput Biol Med.</italic>
</source>
                    <year>2020</year>;<volume>123</volume>:<fpage>103873</fpage>.
                    <pub-id pub-id-type="pmid">32658788</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.compbiomed.2020.103873</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>C</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Mohammed</surname>
                            <given-names>AK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cheikh</surname>
                            <given-names>FA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Multiscale deep desmoking for laparoscopic surgery.</article-title>
                    <source>

                        <italic toggle="yes">SPIE Medical Imaging 2019.</italic>
</source>
                    <year>2019</year>, p.<fpage>68</fpage>.
                    <pub-id pub-id-type="doi">10.1117/12.2507822</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Sdiri</surname>
                            <given-names>B</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kaaniche</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cheikh</surname>
                            <given-names>FA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Efficient enhancement of stereo endoscopic images based on joint wavelet decomposition and binocular combination.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Transactions on Medical Imaging.</italic>
</source>
                    <year>2019</year>;<volume>38</volume>(<issue>1</issue>):<fpage>33</fpage>&#x2013;<lpage>45</lpage>.
                    <pub-id pub-id-type="doi">10.1109/TMI.2018.2853808</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Khan</surname>
                            <given-names>ZA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Towards a video quality assessment based framework for enhancement of laparoscopic videos.</article-title>
                    <source>

                        <italic toggle="yes">arXiv.</italic>
</source>
                    <year>2020</year>.
                    <pub-id pub-id-type="doi">10.1117/12.2549266</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Khan</surname>
                            <given-names>ZA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kaaniche</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Beghdadi</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Joint Statistical Models for No-Reference Stereoscopic Image Quality Assessment.</article-title>
                    <source>

                        <italic toggle="yes">Proc Euro Workshop Visual Information Processing, EUVIP.</italic>
</source>
                    <year>2018</year>, pp.<fpage>26</fpage>&#x2013;<lpage>28</lpage>.
                    <pub-id pub-id-type="doi">10.1109/EUVIP.2018.8611676</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Mittal</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Moorthy</surname>
                            <given-names>AK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bovik</surname>
                            <given-names>AC</given-names>
                        </name>
</person-group>:
                    <article-title>No-reference image quality assessment in the spatial domain.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Trans. Image Process.</italic>
</source>
                    <year>2012</year>;<volume>21</volume>(<issue>12</issue>):<fpage>4695</fpage>&#x2013;<lpage>4708</lpage>.
                    <pub-id pub-id-type="doi">10.1109/TIP.2012.2214050</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Moorthy</surname>
                            <given-names>AK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bovik</surname>
                            <given-names>AC</given-names>
                        </name>
</person-group>:
                    <article-title>A two-stage framework for blind image quality assessment.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Signal Processing Letters.</italic>
</source>
                    <year>2010</year>;<volume>17</volume>(<issue>5</issue>):<fpage>513</fpage>&#x2013;<lpage>516</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ICIP.2010.5651745</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Moorthy</surname>
                            <given-names>AK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bovik</surname>
                            <given-names>AC</given-names>
                        </name>
</person-group>:
                    <article-title>Blind image quality assessment: From natural scene statistics to perceptual quality.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Trans. Image Process.</italic>
</source>
                    <year>2011</year>;<volume>20</volume>(<issue>12</issue>):<fpage>3350</fpage>&#x2013;<lpage>3364</lpage>.
                    <pub-id pub-id-type="doi">10.1109/TIP.2011.2147325</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Jiang</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Feng</surname>
                            <given-names>X</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>F</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Multi-Spectral RGB-NIR Image Classification Using Double-Channel CNN.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2019</year>;<volume>7</volume>:<fpage>20607</fpage>&#x2013;<lpage>20613</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2896128</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kim</surname>
                            <given-names>Y</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kim</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kim</surname>
                            <given-names>T</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>CNN-based semantic segmentation using level set loss.</article-title>
                    <source>

                        <italic toggle="yes">2019 IEEE Winter Conference on Applications of Computer Vision, WACV.</italic>
</source>
                    <year>2019</year>,<volume>2019</volume>, pp.<fpage>1752</fpage>&#x2013;<lpage>1760</lpage>.
                    <pub-id pub-id-type="doi">10.1109/WACV.2019.00191</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Qiu</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wen</surname>
                            <given-names>C</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Xie</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Efficient medical image enhancement based on CNN-FBB model.</article-title>
                    <source>

                        <italic toggle="yes">IET Image Processing.</italic>
</source>
                    <year>2019</year>;<volume>13</volume>(<issue>10</issue>):<fpage>1736</fpage>&#x2013;<lpage>1744</lpage>.
                    <pub-id pub-id-type="doi">10.1049/iet-ipr.2018.6380</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Alex</surname>
                            <given-names>V</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Khened</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ayyachamy</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Medical image retrieval using Resnet-18 for clinical diagnosis.</article-title>
                    <source>

                        <italic toggle="yes">SPIE Medical Imaging.</italic>
</source>
                    <year>2019</year>;<volume>1095410</volume>:<fpage>35</fpage>.
                    <pub-id pub-id-type="doi">10.1117/12.2515588</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hossain</surname>
                            <given-names>MT</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Teng</surname>
                            <given-names>SW</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Distortion Robust Image Classification Using Deep Convolutional Neural Network with Discrete Cosine Transform.</article-title>
                    <source>

                        <italic toggle="yes">Int Conf Image Processing (ICIP).</italic>
</source>
                    <year>2019</year>, pp.<fpage>659</fpage>&#x2013;<lpage>663</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ICIP.2019.8803787</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Buczkowski</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Stasinski</surname>
                            <given-names>R</given-names>
                        </name>
</person-group>:
                    <article-title>Convolutional Neural Network-Based Image Distortion Classification.</article-title>
                    <source>

                        <italic toggle="yes">2019 Int Conf Systems Signals Image Processing (IWSSIP).</italic>
</source>
                    <year>2019</year>, pp.<fpage>275</fpage>&#x2013;<lpage>279</lpage>.
                    <pub-id pub-id-type="doi">10.1109/IWSSIP.2019.8787212</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hochreiter</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Schmidhuber</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Long Short-Term Memory.</article-title>
                    <source>

                        <italic toggle="yes">Neural Comput.</italic>
</source>
                    <year>1997</year>.
                    <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Islam</surname>
                            <given-names>MZ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Islam</surname>
                            <given-names>MM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Asraf</surname>
                            <given-names>A</given-names>
                        </name>
</person-group>:
                    <article-title>A combined deep CNN-LSTM network for the detection of novel coronavirus (COVID-19) using X-ray images.</article-title>
                    <source>

                        <italic toggle="yes">Informat. Med. Unlocked.</italic>
</source>
                    <year>2020</year>;<volume>20</volume>.
                    <pub-id pub-id-type="doi">10.1016/j.imu.2020.100412</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Khan</surname>
                            <given-names>ZA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Beghdadi</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kaaniche</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Residual Networks Based Distortion Classification and Ranking for Laparoscopic Image Quality Assessment.</article-title>
                    <source>

                        <italic toggle="yes">2020 IEEE Int Conf Image Processing (ICIP).</italic>
</source>
                    <year>2020</year>, pp.<fpage>176</fpage>&#x2013;<lpage>180</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ICIP40778.2020.9191111</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Aldahoul</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Karim</surname>
                            <given-names>HA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tan</surname>
                            <given-names>MJT</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Transfer Learning and Decision Fusion for Real Time Distortion Classification in Laparoscopic Videos.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2021</year>;<volume>9</volume>:<fpage>115006</fpage>&#x2013;<lpage>115018</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3105454</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <mixed-citation publication-type="book">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>He</surname>
                            <given-names>K</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>X</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ren</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <source>

                        <italic toggle="yes">Deep residual learning for image recognition.</italic>
</source>
                    <year>2016</year>.
                    <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Deng</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Dong</surname>
                            <given-names>W</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Socher</surname>
                            <given-names>R</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>ImageNet: A large-scale hierarchical image database.</article-title>
                    <year>2009</year>.
                    <pub-id pub-id-type="doi">10.1109/cvpr.2009.5206848</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Twinanda</surname>
                            <given-names>AP</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Shehata</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Mutter</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>EndoNet: A Deep Architecture for Recognition Tasks on Laparoscopic Videos.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Trans. Med. Imaging.</italic>
</source>
                    <year>2017</year>.
                    <pub-id pub-id-type="doi">10.1109/TMI.2016.2593957</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Simonyan</surname>
                            <given-names>K</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zisserman</surname>
                            <given-names>A</given-names>
                        </name>
</person-group>:
                    <article-title>Very deep convolutional networks for large-scale image recognition.</article-title>
                    <year>2015</year>. arXiv:<pub-id pub-id-type="arxiv">1409.1556</pub-id>.</mixed-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Tan</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Le</surname>
                            <given-names>QV</given-names>
                        </name>
</person-group>:
                    <article-title>EfficientNet: Rethinking model scaling for convolutional neural networks.</article-title>
                    <year>2019</year>. arXiv:<pub-id pub-id-type="arxiv">1905.11946</pub-id>.</mixed-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Touvron</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cord</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Douze</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Training data-efficient image transformers &amp; distillation through attention.</article-title>
                    <year>2020</year>. arXiv:<pub-id pub-id-type="arxiv">2012.12877</pub-id>.</mixed-citation>
            </ref>
        </ref-list>
    </back>
</article>
