<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="methods-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.13511.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Method Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>
                    <italic>netSmooth</italic>: Network-smoothing based imputation for single cell RNA-seq</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: awaiting peer review]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ronen</surname>
                        <given-names>Jonathan</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-3980-6469</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Akalin</surname>
                        <given-names>Altuna</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Resources</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-0468-0117</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Scientific Bioinformatics Platform, Berlin Institute for Medical Systems Biology, Max Delbr&#x00fc;ck Center for Molecular Medicine, Berlin, 13125, Germany</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:altuna.akalin@mdc-berlin.de">altuna.akalin@mdc-berlin.de</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>3</day>
                <month>1</month>
                <year>2018</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2018</year>
            </pub-date>
            <volume>7</volume>
            <elocation-id>8</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>22</day>
                    <month>12</month>
                    <year>2017</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2018 Ronen J and Akalin A</copyright-statement>
                <copyright-year>2018</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/7-8/pdf"/>
            <abstract>
                <p>Single cell RNA-seq (scRNA-seq) experiments suffer from a range of characteristic technical biases, such as dropouts (zero or near zero counts) and high variance. Current analysis methods rely on imputing missing values by various means of local averaging or regression, often amplifying biases inherent in the data. We present netSmooth, a network-diffusion based method that uses priors for the covariance structure of gene expression profiles on scRNA-seq experiments in order to smooth expression values. We demonstrate that netSmooth improves clustering results of scRNA-seq experiments from distinct cell populations, time-course experiments, and cancer genomics. We provide an R package for our method, available at: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/BIMSBbioinfo/netSmooth">https://github.com/BIMSBbioinfo/netSmooth</ext-link>.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>scRNA-seq</kwd>
                <kwd>single-cell</kwd>
                <kwd>genomics</kwd>
                <kwd>imputation</kwd>
                <kwd>networks</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1" xlink:href="http://dx.doi.org/10.13039/501100001656">
                    <funding-source>Helmholtz-Gemeinschaft</funding-source>
                </award-group>
                <funding-statement>AA and JR are funded by core funding from Max Delbr&#x00fc;ck Center, part of Helmholtz Association.</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="intro">
            <title>Introduction</title>
            <p>Single cell RNA sequencing (scRNA-seq) enables profiling of single cells&#x2019; transcriptomes at unprecedented throughput and resolution. It has enabled previously impractical studies of cell type heterogeneity, differentiation, and developmental trajectories
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>
                </sup>. However, the adaptation of RNA sequencing techniques from bulk samples to single cells did not progress without challenges. Typically, only a fraction of a cell&#x2019;s transcriptome may be captured by the experiment, leading to so-called "drop-out" events where a gene gets a false 0 (or near 0) count in some cell. The dropout rate is related to the population level expression of a gene, leading to many false zero counts for lowly expressed genes, and artificially low counts for highly expressed ones
                <sup>
                    <xref ref-type="bibr" rid="ref-2">2</xref>
                </sup>. Furthermore, the drop-out rate could be related to the biology of the cell type, as some cell types transcribe fewer genes than others, which will appear as drop-out events
                <sup>
                    <xref ref-type="bibr" rid="ref-2">2</xref>
                </sup>. When summed over many samples, transcript counts from single cells resemble those of bulk experiments
                <sup>
                    <xref ref-type="bibr" rid="ref-3">3</xref>
                </sup>, but across individual cells there is significant variation. This makes analysis more difficult than in bulk RNA sequencing experiments.</p>
            <p>Computational methods designed to deal with these issues treat dropout events as missing data points, whose values may be imputed based on non-missing data points (observed measurements). The proportion of 0 counts per gene, a proxy for its technical dropout rate, is a function of the population-wise mean expression of that gene
                <sup>
                    <xref ref-type="bibr" rid="ref-2">2</xref>,
                    <xref ref-type="bibr" rid="ref-4">4</xref>
                </sup>. This observation has led researchers to treat 0 counts as dropout candidates to be imputed.</p>
            <p>CIDR
                <sup>
                    <xref ref-type="bibr" rid="ref-5">5</xref>
                </sup> attempts to impute missing values based on the predicted mean expression of a gene, given its empirical dropout rate (0-count). scImpute
                <sup>
                    <xref ref-type="bibr" rid="ref-6">6</xref>
                </sup> estimates dropout likelihoods per gene and per sample, and assigns each gene in each sample a status as a dropout candidate. Genes might be considered likely dropouts even with nonzero expression, and 0-count genes might not be considered likely dropouts, based on their population-wide expression distributions. It then uses a regularized linear model to predict the expression of dropout genes based on the expression of likely non-dropouts in all other cells. MAGIC
                <sup>
                    <xref ref-type="bibr" rid="ref-7">7</xref>
                </sup> performs local averaging after building a topological graph of the data, updating the expression value of all genes in all cells to their local neighborhood average.</p>
            <p>All of the methods mentioned above use measured information in the data in order to impute the missing information within the same data. As such, they amplify whatever biases are present in a dataset; similar cells pre-imputation will become more similar after imputation, as expression profiles of non-dropout genes will drive similarities in imputed dropped-out genes. Further, all methods except MAGIC only impute unobserved expression events (0s or near 0s), while the dropout phenomenon actually affects the whole transcriptome. Hence, imputation methods for scRNA-seq should also adjust non-0 expression measurements in order to recover the true signal.</p>
            <p>We present a method, called 
                <italic toggle="yes">netSmooth</italic>, that uses prior knowledge to temper noisy experimental data. RNA sequencing experiments produce count data as a proxy for gene activity, which is not known a priori, especially for experiments profiling unknown cell types. However, decades of molecular biology research have taught us much about the principles of gene interaction. Interacting genes are likely to be co-expressed in cells
                <sup>
                    <xref ref-type="bibr" rid="ref-8">8</xref>,
                    <xref ref-type="bibr" rid="ref-9">9</xref>
                </sup>, and as such, protein-protein interaction (PPI) databases
                <sup>
                    <xref ref-type="bibr" rid="ref-10">10</xref>,
                    <xref ref-type="bibr" rid="ref-11">11</xref>
                </sup> describe genes&#x2019; propensity for co-expression. We developed a graph-diffusion method on PPI networks for smoothing of gene expression values. Each node in the graph (a gene) has an associated gene expression value, and the diffusion presents a weighted averaging of gene expression values among adjacent nodes in the graph, within each cell. This is done iteratively until convergence, strengthening co-expression patterns which are expected to be present. Incorporation of prior data from countless experiments in the preprocessing of scRNA-seq experiments improves resistance to noise and dropouts. Similar network based approaches have been used to extract meaningful information from sparse mutational profiles
                <sup>
                    <xref ref-type="bibr" rid="ref-12">12</xref>,
                    <xref ref-type="bibr" rid="ref-13">13</xref>
                </sup>, and indirectly on gene expression data by diffusing test statistics on the network to discover regulated gene candidates
                <sup>
                    <xref ref-type="bibr" rid="ref-14">14</xref>
                </sup>. We propose diffusion of gene expression values directly on the network as a method for data denoising and imputation. Furthermore, the parameters of this proposed method could be optimized using clustering robustness metrics. We applied our method to a variety of single cell experiments and compared its performance to other selected imputation methods, scImpute and MAGIC. These methods represent the latest and divergent ways of imputing scRNA-seq data.</p>
            <p>We also made available an R package providing the necessary functionality to use our method on other data. It is available on GitHub: 
                <ext-link ext-link-type="uri" xlink:href="https://github.com/BIMSBbioinfo/netSmooth">https://github.com/BIMSBbioinfo/netSmooth</ext-link>.</p>
        </sec>
        <sec sec-type="results">
            <title>Results</title>
            <sec>
                <title>Overview of the method</title>
                <p>The intuition behind the 
                    <italic toggle="yes">netSmooth</italic> algorithm is that gene networks encoding co-expression patterns can be used to smooth scRNA-seq data, pushing its coexpression patterns in a biologically meaningful direction. We demonstrate this using protein-protein interaction networks, which are predictive of coexpression
                    <sup>
                        <xref ref-type="bibr" rid="ref-9">9</xref>
                    </sup>. We produced a PPI graph of high-confidence interactions based on the PPI database STRING
                    <sup>
                        <xref ref-type="bibr" rid="ref-10">10</xref>
                    </sup>.</p>
                <p>There are 2 inputs to the method: (1) a gene expression matrix, 
                    <italic toggle="yes">N</italic> genes by 
                    <italic toggle="yes">M</italic> cells, and (2) a graph where genes are nodes, and edges indicate genes which are expected to be co-expressed. The edges may be weighted, indicating the strength or direction of a relationship; an edge weight of 2 indicates stronger expected co-expression than an edge weight of 1, and an edge weight of &#x2212;1 indicates negative expected co-expression, such as one gene being a repressor for another. The expression profile of each cell is then projected onto the graph, and a diffusion process is used to smooth the expression values, within each sample, of adjacent genes in the graph (
                    <xref ref-type="fig" rid="f1">Figure 1</xref>). In this way, post-smoothing values of genes represent an estimate of activity levels based on reads aligned to that gene, as well as those aligned to its neighbors in the graph. Thus, a gene with a low read count (possible technical drop-out), whose neighbors in the graph are highly expressed, will get a higher value post-smoothing. The rate at which expression values of genes diffuse to their neighbors is degree-normalized, so that genes with many edges will affect their neighbors less than genes with more specific interactions. The diffusion is done using a "random walks with restarts" (RWR) process
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>
                    </sup>, where a conceptual random walker starts in some node in the graph, and at each iteration moves to a neighboring node with a probability determined by the edge weight between the nodes, or, with some probability, restarts the walk from the original node. The 
                    <italic toggle="yes">network-smoothed</italic> value is the stationary distribution of this process. The RWR process has one free parameter, the restart rate. A low value for the restart rate allows diffusion to reach further in the graph; a high restart rate will lead to more local diffusions. For more details see the Methods section.</p>
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>Figure 1. </label>
                    <caption>
                        <title>The 
                            <italic toggle="yes">netSmooth</italic> algorithm takes a gene expression profile, and a gene network.</title>
                        <p>The expression profile of each sample is projected onto the network, where a diffusion process allows genes&#x2019; expression values to be smoothed by their neighbors&#x2019;. This is done for each cell independently of others. The end result is a network smoothed gene expression matrix.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure1.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Network smoothing improves cell type identification from single-cell RNA-seq</title>
                <p>We first assess 
                    <italic toggle="yes">netSmooth</italic> on a dataset of 1645 mouse hematopoietic stem/progenitor cells (HSPCs) assayed using flow cytometry as well as scRNA-seq
                    <sup>
                        <xref ref-type="bibr" rid="ref-15">15</xref>
                    </sup>. The cells are FACS-sorted into 12 common HSPC phenotypes. This presents an atlas of the hematopoiesis process at a single cell resolution, showing the differentiation paths taken by E-SLAM HSCs as they differentiate to E, GM, and L progenitors. The authors of this study demonstrate that upon clustering the data, some clusters correspond to cell types. However, the clusters are not noise free and do not fully recapitulate cell type identity. We obtained clusterings of the cells from the normalized counts, as well as after application of 
                    <italic toggle="yes">netSmooth</italic>, MAGIC
                    <sup>
                        <xref ref-type="bibr" rid="ref-7">7</xref>
                    </sup>, and scImpute
                    <sup>
                        <xref ref-type="bibr" rid="ref-6">6</xref>
                    </sup>, using a robust clustering procedure based on the 
                    <italic toggle="yes">clusterExperiment</italic> R package
                    <sup>
                        <xref ref-type="bibr" rid="ref-16">16</xref>
                    </sup> (See Methods). After clustering, we used the edgeR-QLF test
                    <sup>
                        <xref ref-type="bibr" rid="ref-17">17</xref>
                    </sup> to identify genes that are differentially expressed in any of the discovered clusters. 
                    <xref ref-type="fig" rid="f2">Figure 2a,b</xref> shows that after network-smoothing, we are able to identify clusters with a more pronounced differential expression profile. Further, many more of the genes identified as differentially expressed between the clusters (without smoothing) seem to have low and uninformative expression values overall. MAGIC and scImpute also improve this pattern (
                    <xref ref-type="fig" rid="f2">Figure 2c,d</xref>). MAGIC seems to do the strongest transformation to the data, as seen in lower dimension embeddings (
                    <xref ref-type="other" rid="SF2">Figure S2</xref>, 
                    <xref ref-type="other" rid="SF3">Figure S3</xref>).</p>
                <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                    <label>Figure 2. </label>
                    <caption>
                        <title>Cells were clustered using the robust clustering procedure, and the 500 most differentially expressed genes (by edgeR-QLF test adjusted P value) in any of the discovered clusters are shown in a heatmap, as well as cluster assignments and FACS-sorted cell types.</title>
                        <p>
                            <bold>A</bold>) raw (no imputation), 
                            <bold>B</bold>) after application of 
                            <italic toggle="yes">netSmooth</italic>, 
                            <bold>C</bold>) missing values imputed using MAGIC, 
                            <bold>D</bold>) missing values imputed using scImpute.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure2.gif"/>
                </fig>
                <p>As this dataset has cells with labels independent of the RNAseq (FACS-sorted phenotypes), it presents us with an opportunity to compare the gene expression levels (as measured by RNAseq), to a meaningful phenotypic variable, i.e. the cell type. The cell type discrimination of a clustering result is compared using a cluster purity metric and the adjusted mutual information (AMI). The cluster purity measures how cell-type specific clusters are by comparing homogeneity of the external labels (FACS-defined cell types), within clusters provided by scRNA-seq data. AMI is a chance-adjusted information-theoretic measure of agreement between two labellings. This method accounts for artificially high mutual information between external labels and clusters when there is a high number of clusters (See Methods for details on metrics). We also measured the number of cells in robust clusters as a quantitative metric. The robust clustering procedure allows cells to be omitted (not be assigned to a cluster) if they cannot be placed in a cluster across multiple clustering methods and/or parameters (See Methods). Only MAGIC is able to increase the proportion of cells in this dataset which fall into robust clusters (
                    <xref ref-type="fig" rid="f3">Figure 3a</xref>), but only 
                    <italic toggle="yes">netSmooth</italic> leads to more biologically meaningful clusters, in terms of purity and AMI (
                    <xref ref-type="fig" rid="f3">Figures 3b,c</xref>), demonstrating that 
                    <italic toggle="yes">netSmooth</italic> can assist in cell type identification, and outperformed both MAGIC and scImpute in this task. The higher clusterability following application of MAGIC than 
                    <italic toggle="yes">netSmooth</italic> might indicate that MAGIC was overzealous in its transformation, squeezing more cells into the same space. This might lead to more robust clusters, but less reliable cell type identification.</p>
                <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                    <label>Figure 3. </label>
                    <caption>
                        <title>Hematopoiesis clustering metrics.</title>
                        <p>
                            <bold>A</bold>) The proportion of cells which were assigned to robust clusters. 
                            <bold>B</bold>) cluster purity (proportion of dominant cell type) for the robust clusters. 
                            <italic toggle="yes">netSmooth</italic> produces the most pure clusters in terms of cell types. 
                            <bold>C</bold>) AMI of the clustering results obtained after application of each of the methods. Only 
                            <italic toggle="yes">netSmooth</italic> increases the AMI between the clustering and the cell types.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure3.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Network smoothing improves capture of developmental expression patterns</title>
                <p>Next, we test 
                    <italic toggle="yes">netSmooth</italic> on 269 isolated cells from mouse embryos at different stages of pre-implantation development between oocyte and blastocyst, as well as 5 liver cells and 10 fibroblast cells
                    <sup>
                        <xref ref-type="bibr" rid="ref-18">18</xref>
                    </sup>. The authors of this study demonstrated that lower dimension embeddings capture much of the developmental trajectory (
                    <xref ref-type="fig" rid="f4">Figure 4a</xref>, 
                    <xref ref-type="other" rid="SF4">Figure S4a</xref>, 
                    <xref ref-type="other" rid="SF5">Figure S5a</xref>). We then applied 
                    <italic toggle="yes">netSmooth</italic>, MAGIC, and scImpute. 
                    <xref ref-type="fig" rid="f4">Figure 4b</xref> shows the principal component analysis of 
                    <italic toggle="yes">netSmooth</italic>-processed data, and 
                    <xref ref-type="fig" rid="f4">Figures 4c and 4d</xref> show the PCA plot following application of MAGIC and scImpute, respectively. 
                    <italic toggle="yes">netSmooth</italic> and scImpute preserve most of the variance structure of the data, while MAGIC seems to push the data onto a completely different manifold (
                    <xref ref-type="fig" rid="f4">Figure 4</xref>, 
                    <xref ref-type="other" rid="SF5">Figure S5</xref>). We used the robust clustering procedure to obtain clusters, and computed the cluster purity and AMI metrics. 
                    <italic toggle="yes">netSmooth</italic> enabled the clustering procedure to place more of the samples into robust clusters (
                    <xref ref-type="fig" rid="f5">Figure 5a</xref>), and as in the hematopoiesis case, 
                    <italic toggle="yes">netSmooth</italic> is able to assist in identifying the developmental stage or tissue that cells belong to better than the other methods, as evidenced by the higher cluster purities (
                    <xref ref-type="fig" rid="f5">Figure 5b</xref>) and AMI (
                    <xref ref-type="fig" rid="f5">Figure 5c</xref>). Although MAGIC and scImpute reduce the 0-count genes further than 
                    <italic toggle="yes">netSmooth</italic> (
                    <xref ref-type="other" rid="SF1">Figure S1</xref>), they do not add as much clarity to the developmental stage signal inherent in the data. This shows that imputing missing counts based on data from the same experiment is not as powerful as including priors in the quasi-imputation process, as 
                    <italic toggle="yes">netSmooth</italic> does.</p>
                <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                    <label>Figure 4. </label>
                    <caption>
                        <p>2D PCA plots of the embryonic development dataset 
                            <bold>A</bold>) no preprocessing, 
                            <bold>B</bold>) after application of 
                            <italic toggle="yes">netSmooth</italic>, 
                            <bold>C</bold>) after imputing missing values with scImpute, and 
                            <bold>D</bold>) after application of MAGIC.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure4.gif"/>
                </fig>
                <fig fig-type="figure" id="f5" orientation="portrait" position="float">
                    <label>Figure 5. </label>
                    <caption>
                        <title>The Embryonic development dataset.</title>
                        <p>
                            <bold>A</bold>) The proportion of cells which were assigned to robust clusters. All three methods lead to better clusterability, with MAGIC having the strongest effect. 
                            <bold>B</bold>) cluster purity (proportion of dominant cell type) for the robust clusters. 
                            <italic toggle="yes">netSmooth</italic> produces the most pure clusters in terms of cell types. 
                            <bold>C</bold>) Adjusted mutual information of clusterings and cell types. Only 
                            <italic toggle="yes">netSmooth</italic> increases the AMI over the non-preprocessed data.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure5.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Network smoothing improves identification of glioblastoma tumors</title>
                <p>Finally, we demonstrate applicability of 
                    <italic toggle="yes">netSmooth</italic> to cancer research. Patel 
                    <italic toggle="yes">et al.</italic> generated scRNA-seq data of 800 cells from 5 glioblastoma tumors and 2 cell lines
                    <sup>
                        <xref ref-type="bibr" rid="ref-19">19</xref>
                    </sup>. Lower dimension embedding plots show that cells from different tumors or cell lines generally group together, but some are not wholly distinguishable from other tumors (
                    <xref ref-type="fig" rid="f6">Figure 6a</xref>, 
                    <xref ref-type="other" rid="SF4">Figure S4a</xref>, 
                    <xref ref-type="other" rid="SF5">Figure S5a</xref>). Further, the two cell lines group closer to each other than the other patient samples. After applying 
                    <italic toggle="yes">netSmooth</italic> to the data, tumors become easier to distinguish in a lower dimensional embedding (
                    <xref ref-type="fig" rid="f6">Figure 6b</xref>), indicating that 
                    <italic toggle="yes">netSmooth</italic> improves assignment of each cell to its tumor, cell line, or clone of origin. Again, scImpute also leads to similar reduced dimension embedding (
                    <xref ref-type="fig" rid="f6">Figure 6d</xref>), while MAGIC distorted the data more than the other methods (
                    <xref ref-type="fig" rid="f6">Figure 6c</xref>). We used the robust clustering procedure before and after 
                    <italic toggle="yes">netSmooth</italic>, MAGIC, and scImpute. Only MAGIC increases the clusterability of the data (
                    <xref ref-type="fig" rid="f7">Figure 7a</xref>), but 
                    <italic toggle="yes">netSmooth</italic> leads to the most pure clusters, in terms of tumor or cell line of origin (
                    <xref ref-type="fig" rid="f7">Figure 7b, Figure 7c</xref>).</p>
                <fig fig-type="figure" id="f6" orientation="portrait" position="float">
                    <label>Figure 6. </label>
                    <caption>
                        <p>t-SNE plots of the glioblastoma dataset 
                            <bold>A</bold>) no preprocessing, 
                            <bold>B</bold>) after application of 
                            <italic toggle="yes">netSmooth</italic>, 
                            <bold>C</bold>) using MAGIC, and 
                            <bold>D</bold>) after application of scImpute.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure6.gif"/>
                </fig>
                <fig fig-type="figure" id="f7" orientation="portrait" position="float">
                    <label>Figure 7. </label>
                    <caption>
                        <title>Imputation performance for the glioblastoma dataset.</title>
                        <p>
                            <bold>A</bold>) The proportion of cells which were assigned to robust clusters. 
                            <italic toggle="yes">netSmooth</italic>, MAGIC, and scImpute all increased the proportion of cells that are assigned to robust clusters, with MAGIC leading, 
                            <italic toggle="yes">netSmooth</italic> in second place, and scImpute in third. 
                            <bold>B</bold>) cluster purity (proportion of dominant cell type) for the robust clusters. 
                            <italic toggle="yes">netSmooth</italic> produces the most pure clusters in terms of tumor or cell line of origin. 
                            <bold>C</bold>) AMI of the clustering results obtained after application of each of the methods.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure7.gif"/>
                </fig>
                <p>Tumor or cell line of origin is an imperfect proxy for phenotypical variation in cancer cells, because some cells cluster by cell type rather than tumor of origin, demonstrating the heterogeneity in these glioblastoma tumors and similarities across origins
                    <sup>
                        <xref ref-type="bibr" rid="ref-19">19</xref>
                    </sup>. Nevertheless, we chose to compute cluster purity based on the cell origin rather than other labels which might be assigned to them, as it is the only 
                    <italic toggle="yes">ground truth</italic> variable that is independent of the RNA-seq experiment. Further, cells do group by origin (
                    <xref ref-type="fig" rid="f6">Figure 6</xref>, 
                    <xref ref-type="other" rid="SF6">Figure S6</xref>), and identification of origin is an interesting question in its own right in the field of cancer genomics, particularly for heterogeneous tumors such as these.</p>
            </sec>
            <sec>
                <title>Sensitivity to the network</title>
                <p>Next, we set out to ensure that the results are not an artifact of the network structure, i.e. that the actual links between genes that we used in the network are important. We expect 
                    <italic toggle="yes">netSmooth</italic> not to perform well when using networks with similar characteristics, but where edges do not represent real interactions. To that effect, we constructed 20 random networks by keeping the same graph structure of the real PPI graph, but shuffling the gene names.</p>
                <p>Thus, these random networks share all the characteristics of the real network (degree distribution, community structure), except for the true identity of the nodes. We then used those networks as inputs to 
                    <italic toggle="yes">netSmooth</italic> and ran the benchmarks as before on the hematopoiesis dataset. Using random networks as an input to 
                    <italic toggle="yes">netSmooth</italic> gives cluster purities distributed around a mode given by the cluster purities of the raw data, while the cluster purities given from using the real PPI network lie at the extreme edge of the distribution (
                    <xref ref-type="fig" rid="f8">Figure 8a</xref>). Further, most random networks result in fewer samples belonging to robust clusters (
                    <xref ref-type="fig" rid="f8">Figure 8b</xref>). These results demonstrate that it is indeed the information contained in the PPI graph that enables <italic toggle="yes">netSmooth</italic> to transform the gene expression matrix in a more biologically coherent direction, and that the transformation we see cannot be explained simply by the network structure.</p>
                <fig fig-type="figure" id="f8" orientation="portrait" position="float">
                    <label>Figure 8. </label>
                    <caption>
                        <title>Performance of 
                            <italic toggle="yes">netSmooth</italic> with randomized networks.</title>
                        <p>
                            <bold>A</bold>) The median cluster purity achieved with the random networks. The real network outperforms the random ones, which result in cluster purities distributed around the purity given without using 
                            <italic toggle="yes">netSmooth</italic>. 
                            <bold>B</bold>) The proportion of samples assigned to robust clusters using the random networks as well as the real one. While all networks result in fewer samples robustly clustered (in the hematopoiesis dataset), the real network outperforms most random networks.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure8.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Using other networks with netSmooth</title>
                <p>In addition to using an unweighted (where all edge weights are 1), undirected (where all edge weights are positive) network from string-db, we constructed other gene networks and used them as inputs to 
                    <italic toggle="yes">netSmooth</italic>. We created a directed gene network from only those edges in string-db which are marked as activating or inhibiting
                    <xref ref-type="fn" rid="FN1">
                        <sup>i</sup>
                    </xref>. We set the edge weights of the activating interactions to +1, and &#x2212;1 for the inhibiting interactions, allowing gene expression values to be adjusted downwards for genes whose known antagonists are highly expressed. After smoothing, we set all negative smoothed expression values to 0. We also constructed a gene network from string-db using only genes that are known to demonstrate cell-type specific expression. In order to obtain a list of genes with such cell-type specific expression patterns from the 
                    <italic toggle="yes">Expression Atlas</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref-20">20</xref>
                    </sup>, we used only the genes which show a cell-type specific expression with a mean TPM of at least 1 in some cell type, and used the subset of string-db network containing those genes as an input to 
                    <italic toggle="yes">netSmooth</italic>. Both of those modified graphs perform similarly to the undirected graph from string-db (
                    <xref ref-type="fig" rid="f9">Figure 9</xref>, 
                    <xref ref-type="other" rid="SF8">Figure S8a, S8b</xref>), demonstrating that 
                    <italic toggle="yes">netSmooth</italic> is able to use priors from different types of experiments in order to improve clustering of scRNA-seq.</p>
                <fig fig-type="figure" id="f9" orientation="portrait" position="float">
                    <label>Figure 9. </label>
                    <caption>
                        <title>Cluster purities after applying 
                            <italic toggle="yes">netSmooth</italic> with different input networks.</title>
                        <p>Raw refers to no smoothing, non-directional is the same as the results shown in previous sections. Directional refers to a gene network where inhibitory relationships have negative edge weights, and cell-type specific refers to a gene network of only genes which are known to have cell-type specific expression patterns.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure9.gif"/>
                </fig>
                <p>We also considered other sources for the gene network. We constructed a gene network from HumanNet
                    <sup>
                        <xref ref-type="bibr" rid="ref-21">21</xref>
                    </sup>, a functional gene network where edges denote interactions between two genes. We constructed a smoothing graph by taking all edges from HumanNet, and producing a graph where all edge weights are set to 1. We then used this graph as an input to 
                    <italic toggle="yes">netSmooth</italic> on the glioblastoma dataset. It performs similarly to the network from string-db (
                    <xref ref-type="fig" rid="f10">Figure 10</xref>, 
                    <xref ref-type="other" rid="SF8">Figure S8c</xref>), demonstrating that other sources for gene interactions may also be used by 
                    <italic toggle="yes">netSmooth</italic> to improve clustering results of scRNA-seq.</p>
                <fig fig-type="figure" id="f10" orientation="portrait" position="float">
                    <label>Figure 10. </label>
                    <caption>
                        <title>Cluster purities after applying 
                            <italic toggle="yes">netSmooth</italic> with different input networks.</title>
                        <p>Raw refers to no smoothing, string-db is the same as the results shown in previous sections, and HumanNet refers to a gene network constructed from the HumanNet database.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure10.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Optimizing the smoothing parameters by cluster robustness</title>
                <p>The 
                    <italic toggle="yes">netSmooth</italic> algorithm, given a gene network, has one free parameter - the restart rate of the random walker, (1 
                    <italic toggle="yes">&#x2212; &#x03b1;</italic>). Alternatively, 
                    <italic toggle="yes">&#x03b1;</italic> is the complement of the restart rate. An 
                    <italic toggle="yes">&#x03b1;</italic> = 0 indicates a perfect restart rate and consequently no smoothing; an 
                    <italic toggle="yes">&#x03b1;</italic> = 1 corresponds to a random walk without restarts. Intermediate values for 
                    <italic toggle="yes">&#x03b1;</italic> result in increasing levels of smoothing; the value of 
                    <italic toggle="yes">&#x03b1;</italic> determines how far random walks will go on the graph before restarting, or how far along the network a gene&#x2019;s influence is allowed to reach (See Methods). It is tempting to optimize 
                    <italic toggle="yes">&#x03b1;</italic> with respect to the variable the experiment sets out to measure, e.g. cluster purity. For instance, in the embryonic development dataset, we would choose 
                    <italic toggle="yes">&#x03b1;</italic> = 0.7 as the value that produces the highest cluster purity (
                    <xref ref-type="fig" rid="f11">Figure 11b</xref>). However, in many experiments the identity of the samples is not known a priori. Therefore, we propose a data-driven workflow to pick a sensible value for 
                    <italic toggle="yes">&#x03b1;</italic>.</p>
                <fig fig-type="figure" id="f11" orientation="portrait" position="float">
                    <label>Figure 11. </label>
                    <caption>
                        <title>Boxplots of cluster purity for clusters obtained by the robust clustering procedure following application of 
                            <italic toggle="yes">netSmooth</italic> with different values of 
                            <italic toggle="yes">&#x03b1;</italic>.</title>
                        <p>
                            <italic toggle="yes">&#x03b1;</italic> = 0 is equivalent to not using 
                            <italic toggle="yes">netSmooth</italic> at all. The procedure is robust to alpha, that is, most values of alpha produce more robust clusters. 
                            <bold>A</bold>) HSPCs, 
                            <bold>B</bold>) embryonic cells, 
                            <bold>C</bold>) glioblastomas.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure11.gif"/>
                </fig>
                <p>One such data-driven statistic is the proportion of samples assigned to robust clusters; following application of 
                    <italic toggle="yes">netSmooth</italic>, the robust clustering procedure is able to assign more samples to statistically robust clusters. For all three datasets, picking the 
                    <italic toggle="yes">&#x03b1;</italic> that gives the highest proportion of cells in robust clusters, also gives the clusters with the highest purity index (
                    <xref ref-type="fig" rid="f12">Figure 12</xref>). Importantly, this metric is entirely data-driven and does not require external labels, making it feasible for any scRNA-seq study. The results in the previous sections all use the value of 
                    <italic toggle="yes">&#x03b1;</italic> picked to optimize proportion in robust clusters.</p>
                <fig fig-type="figure" id="f12" orientation="portrait" position="float">
                    <label>Figure 12. </label>
                    <caption>
                        <title>The proportion of cells in robust clusters, and cluster purity for those robust clusters, for a range of alpha values, shows that picking the alpha with the highest proportion in robust clusters also picks the alpha with the highest cluster purity.</title>
                        <p>
                            <bold>A</bold>) hematopoietic stem/progenitor cells, 
                            <bold>B</bold>) embryonic cells, 
                            <bold>C</bold>) glioblastomas.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/14669/c84db29f-c15d-4c56-9070-e17ff28b9885_figure12.gif"/>
                </fig>
            </sec>
        </sec>
        <sec sec-type="discussion">
            <title>Discussion</title>
            <p>Single cell RNA sequencing technology provides whole-genome transcriptional profiles at unprecedented throughput and resolution. However, high variance and dropout events that happen in all current scRNA-seq platforms complicate the interpretation of the data. Methods that treat 0 counts as missing values and impute them based on nonzero values in the data may amplify biases in the data.</p>
            <p>We presented 
                <italic toggle="yes">netSmooth</italic> as a preprocessing step for scRNA-seq experiments, overcoming these challenges by the use of prior information derived from protein-protein interactions or other molecular interaction networks. We demonstrated that network smoothing assists in several standard analyses that are common in scRNA-seq studies. This procedure enhances cell type identification in hematopoiesis; it elucidates time series data and assists identification of the developmental stage of single cells. Finally, it is also applicable in cancer, improving identification of tumor of origin for glioblastomas. In addition, we showed that the network smoothing parameter can be optimized by cluster robustness metrics, providing a workflow when there are no other external labels to distinguish cells. We demonstrated that 
                <italic toggle="yes">netSmooth</italic> can use prior information from different sources in order to achieve this. We compared 
                <italic toggle="yes">netSmooth</italic> with scImpute, a statistical genome-wide imputation method, and MAGIC, a genome-wide data smoothing algorithm, and demonstrated that while scImpute and MAGIC reduce the drop-out phenomenon more than 
                <italic toggle="yes">netSmooth</italic> does, 
                <italic toggle="yes">netSmooth</italic> outperforms them in amplifying the biological/technical variability ratio. 
                <italic toggle="yes">netSmooth</italic> provides clusters that are more homogeneous and have higher adjusted mutual information (AMI) with respect to cell types. Although in some cases data processed by MAGIC produces more robust clusters, the clusters returned after MAGIC processing do not have higher AMI or cluster purity. Higher robustness achieved by MAGIC processing might be due to the fact that the algorithm reinforces local structures in the data too much, producing artificially similar expression profiles between cells.</p>
            <p>Finally, 
                <italic toggle="yes">netSmooth</italic> is a versatile algorithm that may be incorporated in any analysis pipeline for any experiment where the organism in question has a high quality PPI network available. Although not shown, the algorithm is applicable to any omics data set that can be constructed as a genes-by-samples matrix, such as proteomics, SNPs and copy number variation. In addition, most of the computational load of network smoothing can be done "off-line". As such it scales well with the number of cells, which is likely to increase in future scRNA-seq experiments.</p>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <sec>
                <title>The random walks with restarts process</title>
                <p>The 
                    <italic toggle="yes">netSmooth</italic> algorithm takes a graph 
                    <italic toggle="yes">G</italic> = {
                    <italic toggle="yes">V</italic>, 
                    <italic toggle="yes">E</italic>} where 
                    <italic toggle="yes">V</italic> = {
                    <italic toggle="yes">gene</italic>
                    <sub>
                        <italic toggle="yes">i</italic>
                    </sub>} is the set of genes, and 
                    <italic toggle="yes">E</italic> = {(
                    <italic toggle="yes">i  &#x2192;  j</italic>)} is the set of edges between genes. The edge weights are degree-normalized, so that each gene&#x2019;s outgoing edges&#x2019; weights sum to 1. We then define a process of random walk with restarts as in 
                    <xref ref-type="bibr" rid="ref-13">13</xref>, on the PPI graph, where a conceptual random walker starts on a node in the graph (a gene/protein) and at each step walks to an adjacent node with a probability determined by 
                    <italic toggle="yes">&#x03b1;</italic> times the edge weight. Further, at each step, there is a probability of (1 
                    <italic toggle="yes">&#x2212; &#x03b1;</italic>) that the walker restarts to its original node.</p>
                <p>Mathematically, given a graph defined by an adjacency matrix 
                    <italic toggle="yes">A</italic>
                    <sub>[
                        <italic toggle="yes">MxM</italic>]</sub>, where 
                    <italic toggle="yes">A</italic>
                    <sub>
                        <italic toggle="yes">i j</italic>
                    </sub> is the edge weight between gene 
                    <italic toggle="yes">i</italic> and gene 
                    <italic toggle="yes">j</italic> (and 0 for unconnected genes), and a vector 
                    <italic toggle="yes">f</italic>
                    <sub>[
                        <italic toggle="yes">Mx</italic>1]</sub>, where 
                    <inline-formula>
                        <mml:math display="inline" id="M">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mi>f</mml:mi>
                                    <mml:mi>i</mml:mi>
                                    <mml:mi>t</mml:mi>
                                </mml:msubsup>
                            </mml:mrow>
                        </mml:math>
                    </inline-formula> is the probability that the walker is at node 
                    <italic toggle="yes">i</italic> at step 
                    <italic toggle="yes">t</italic>, the process is defined by</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="math1">
                            <mml:msup>
                                <mml:mi>f</mml:mi>
                                <mml:mrow>
                                    <mml:mi>t</mml:mi>
                                    <mml:mo>+</mml:mo>
                                    <mml:mn>1</mml:mn>
                                </mml:mrow>
                            </mml:msup>
                            <mml:mo>=</mml:mo>
                            <mml:mi>&#x03b1;</mml:mi>
                            <mml:mi>A</mml:mi>
                            <mml:msup>
                                <mml:mi>f</mml:mi>
                                <mml:mi>t</mml:mi>
                            </mml:msup>
                            <mml:mo>+</mml:mo>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>1</mml:mn>
                            <mml:mo>&#x2212;</mml:mo>
                            <mml:mi>&#x03b1;</mml:mi>
                            <mml:mo stretchy="false">)</mml:mo>
                            <mml:msup>
                                <mml:mi>f</mml:mi>
                                <mml:mn>0</mml:mn>
                            </mml:msup>
                            <mml:mo>.</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>This process is convergent, and the stationary distribution is given by</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="math2">
                            <mml:mrow>
                                <mml:msup>
                                    <mml:mi>f</mml:mi>
                                    <mml:mi>&#x221e;</mml:mi>
                                </mml:msup>
                                <mml:mo>=</mml:mo>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mn>1</mml:mn>
                                <mml:mo>&#x2212;</mml:mo>
                                <mml:mi>&#x03b1;</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:msup>
                                    <mml:mrow>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mi>I</mml:mi>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mi>&#x03b1;</mml:mi>
                                        <mml:mi>A</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mn>1</mml:mn>
                                    </mml:mrow>
                                </mml:msup>
                                <mml:msup>
                                    <mml:mi>f</mml:mi>
                                    <mml:mn>0</mml:mn>
                                </mml:msup>
                                <mml:mo>.</mml:mo>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Hence, the random walk with restarts process is a diffusion process defined on the PPI graph, or through the diffusion kernel (smoothing kernel)</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="math3">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mi>K</mml:mi>
                                    <mml:mi>A</mml:mi>
                                    <mml:mi>&#x03b1;</mml:mi>
                                </mml:msubsup>
                                <mml:mo>=</mml:mo>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mn>1</mml:mn>
                                <mml:mo>&#x2212;</mml:mo>
                                <mml:mi>&#x03b1;</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:msup>
                                    <mml:mrow>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mi>I</mml:mi>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mi>&#x03b1;</mml:mi>
                                        <mml:mi>A</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mn>1</mml:mn>
                                    </mml:mrow>
                                </mml:msup>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>where (1 
                    <italic toggle="yes">&#x2212; &#x03b1;</italic>) is the restart probability, and 
                    <italic toggle="yes">A</italic> is the (column normalized) adjacency matrix of the PPI graph. Consequently, we define the 
                    <italic toggle="yes">network-smoothed</italic> expression profile</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="math4">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>E</mml:mi>
                                    <mml:mrow>
                                        <mml:mi>s</mml:mi>
                                        <mml:mi>m</mml:mi>
                                    </mml:mrow>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:msubsup>
                                    <mml:mi>K</mml:mi>
                                    <mml:mi>A</mml:mi>
                                    <mml:mi>&#x03b1;</mml:mi>
                                </mml:msubsup>
                                <mml:mi>E</mml:mi>
                                <mml:mo>,</mml:mo>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>where 
                    <italic toggle="yes">E</italic>
                    <sub>[
                        <italic toggle="yes">MxN</italic>]</sub> is the normalized count values of the 
                    <italic toggle="yes">M</italic> genes in the 
                    <italic toggle="yes">N</italic> cells.</p>
            </sec>
            <sec>
                <title>The clustering procedure</title>
                <p>Clustering analysis features prominently in scRNA-seq analyses; whether recapitulating known results or discovering new cell types, clustering cells by their gene expression profiles is commonly used to identify distinct populations. While some approaches directly take into account the zero-inflation of scRNA-seq data
                    <sup>
                        <xref ref-type="bibr" rid="ref-5">5</xref>
                    </sup>, other studies use traditional methods
                    <sup>
                        <xref ref-type="bibr" rid="ref-18">18</xref>
                    </sup>. There is no standard method for clustering single-cell RNA-seq data, as different studies produce data with different topologies, which respond differently to the various clustering algorithms.</p>
                <p>In order to avoid optimizing different clustering routines for the different datasets we benchmark on, we have implemented a robust clustering routine based on 
                    <italic toggle="yes">clusterExperiment</italic>
                    <xref ref-type="fn" rid="FN2">
                        <sup>ii</sup>
                    </xref>
                    <sup>
                        <xref ref-type="bibr" rid="ref-16">16</xref>
                    </sup>, a framework for robust clustering based on consensus clustering of clustering assignments obtained from different clustering algorithms, different parameters for these algorithms, and different views of the data. The different views are different reduced dimensionality projections of the data based on different techniques. Thus, no single clustering result will dominate the data, and only cluster structures which are robust to different analyses will prevail. The procedure we implemented using the framework is as follows:</p>
                <list list-type="bullet">
                    <list-item>
                        <label>1. </label>
                        <p>Perform different dimensionality reduction techniques on the data</p>
                        <list id="L2" list-type="bullet">
                            <list-item>
                                <label>&#x2022; </label>
                                <p>PCA on the 500 most variable genes</p>
                                <list list-type="bullet">
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>with 5 components</p>
                                    </list-item>
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>with 15 components</p>
                                    </list-item>
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>with 50 components</p>
                                    </list-item>
                                </list>
                            </list-item>
                            <list-item>
                                <label>&#x2022; </label>
                                <p>Alternatively to PCA, t-SNE on the 500 most variable genes</p>
                                <list list-type="bullet">
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>with 2 dimensions</p>
                                    </list-item>
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>with 3 dimensions</p>
                                    </list-item>
                                </list>
                            </list-item>
                            <list-item>
                                <label>&#x2022; </label>
                                <p>Select the most variable genes</p>
                                <list list-type="bullet">
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>100 most variable genes</p>
                                    </list-item>
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>500 most variable genes</p>
                                    </list-item>
                                    <list-item>
                                        <label>&#x2013; </label>
                                        <p>1000 most variable genes</p>
                                    </list-item>
                                </list>
                            </list-item>
                        </list>
                    </list-item>
                    <list-item>
                        <label>2. </label>
                        <p>On each reduced dimension view of the data, perform PAM clustering with K ranging from 5 to 10</p>
                    </list-item>
                    <list-item>
                        <label>3. </label>
                        <p>Calculate the co-clustering index for each pair of samples (the proportion of times the samples are clustered together, in the different clustering results based on the different reduced dimensions and clustering parameters above)</p>
                    </list-item>
                    <list-item>
                        <label>4. </label>
                        <p>Find a consensus clustering from the co-clustering matrix. This is done by constructing a dendrogram using average linkage and traversing down the tree until a block with a self-similarity of at least 0.6 and a minimum size of 20 samples emerges (instead of using 
                            <monospace>cutree</monospace>).</p>
                    </list-item>
                    <list-item>
                        <label>5. </label>
                        <p>Perform hierarchical clustering of the cluster medoids, with similarities based on expression of the 500 most variable genes</p>
                    </list-item>
                    <list-item>
                        <label>6. </label>
                        <p>Perform a DE analysis between clusters that are adjacent in the hierarchy from (5), and merge them if the proportion of genes that are found to be significantly differentially expressed between them (adjP 
                            <italic toggle="yes">&lt;</italic> 0.05) is less than 0.1.</p>
                    </list-item>
                </list>
                <p>Using only the 500 most variable genes ensures the biological variation will dominate the technical variation, and enhances the reproducibility of t-SNE
                    <sup>
                        <xref ref-type="bibr" rid="ref-22">22</xref>
                    </sup>.</p>
                <p>Importantly, samples that at step (4) do not have a high enough affinity to any emerging cluster will not be assigned to any cluster. The clustering is performed using the 
                    <monospace>clusterExperiment::clusterSingle</monospace> and 
                    <monospace>clusterExperiment::clusterMany</monospace> functions, the consensus clustering is obtained using the 
                    <monospace>clusterExperiment::combineMany</monospace> function, and the cluster merging (steps 5 and 6) using the 
                    <monospace>clusterExperiment::makeDendrogram</monospace> and 
                    <monospace>clusterExperiment::mergeClusters</monospace> functions. For more details, see 
                    <xref ref-type="bibr" rid="ref-16">16</xref>.</p>
            </sec>
            <sec>
                <title>Choice of dimensionality reduction technique in the clustering procedure</title>
                <p>In step (1) above, we cluster cells in a lower dimension embedding using either PCA
                    <sup>
                        <xref ref-type="bibr" rid="ref-23">23</xref>
                    </sup> or t-SNE
                    <sup>
                        <xref ref-type="bibr" rid="ref-24">24</xref>
                    </sup>, in a dataset-dependent manner. Different single cell datasets respond better to different dimensionality reduction techniques which are better able to tease out the biological cluster structure of the data. In order to pick the right technique algorithmically, we compute the entropy in a 2D embedding. We obtained 2D embeddings from the 500 most variable genes using either PCA or t-SNE, binned them in a 20x20 grid, and computed the entropy using the 
                    <monospace>discretize</monospace> and 
                    <monospace>entropy</monospace> functions in the 
                    <italic toggle="yes">entropy</italic> R package
                    <xref ref-type="fn" rid="FN3">
                        <sup>iii</sup>
                    </xref>
                    <sup>
                        <xref ref-type="bibr" rid="ref-25">25</xref>
                    </sup>. The entropy in the 2D embedding is a measure of the information captured by it. For the clustering procedure, we pick the embedding with the highest information content. For the hematopoiesis and glioblastoma datasets, this is t-SNE, while for the embryonic development dataset it is PCA (
                    <xref ref-type="table" rid="T2">Table 2</xref>). This method may be used to pick any dimensionality reduction technique other than the ones mentioned here, which might be more suitable for other analyses.</p>
                <table-wrap id="T1" orientation="portrait" position="anchor">
                    <label>Table 1. </label>
                    <caption>
                        <title>Datasets and availability.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Dataset</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">URL</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Hematopoiesis</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">
                                    <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682">https://www.ncbi.nlm.nih.gov/geo/query/acc.</ext-link>
                                    <break/>
                                    <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682">cgi?acc=GSE81682</ext-link>
                                </td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Embryonic cells</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">
                                    <ext-link ext-link-type="uri" xlink:href="http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE45719.rds">http://imlspenticton.uzh.ch/robinson_lab/</ext-link>
                                    <break/>
                                    <ext-link ext-link-type="uri" xlink:href="http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE45719.rds">conquer/data-mae/GSE45719.rds</ext-link>
                                </td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Glioblastoma</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">
                                    <ext-link ext-link-type="uri" xlink:href="http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE57872.rds">http://imlspenticton.uzh.ch/robinson_lab/</ext-link>
                                    <break/>
                                    <ext-link ext-link-type="uri" xlink:href="http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE57872.rds">conquer/data-mae/GSE57872.rds</ext-link>
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <table-wrap id="T2" orientation="portrait" position="anchor">
                    <label>Table 2. </label>
                    <caption>
                        <title>Entropy in 2D lower dimension embeddings.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Dataset</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">PCA Entropy</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">t-SNE Entropy</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Hematopoiesis</td>
                                <td align="right" colspan="1" rowspan="1" valign="top">4.96</td>
                                <td align="right" colspan="1" rowspan="1" valign="top">5.03</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Embryonic cells</td>
                                <td align="right" colspan="1" rowspan="1" valign="top">4.09</td>
                                <td align="right" colspan="1" rowspan="1" valign="top">3.94</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Glioblastoma</td>
                                <td align="right" colspan="1" rowspan="1" valign="top">4.87</td>
                                <td align="right" colspan="1" rowspan="1" valign="top">5.06</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
            <sec>
                <title>Cluster purity and adjusted mutual information</title>
                <p>The cluster purity metric displayed above refers to the proportion of the samples in a cluster which are of the dominant cell type in that cluster. The purity for cluster 
                    <italic toggle="yes">i</italic> is given by</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="math5">
                            <mml:mrow>
                                <mml:mi>P</mml:mi>
                                <mml:mi>u</mml:mi>
                                <mml:mi>r</mml:mi>
                                <mml:mi>i</mml:mi>
                                <mml:mi>t</mml:mi>
                                <mml:msub>
                                    <mml:mi>y</mml:mi>
                                    <mml:mi>i</mml:mi>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mfrac>
                                    <mml:mrow>
                                        <mml:mstyle displaystyle="true">
                                            <mml:munder>
                                                <mml:mo>&#x2211;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>j</mml:mi>
                                                    <mml:mo>&#x2208;</mml:mo>
                                                    <mml:msub>
                                                        <mml:mi>C</mml:mi>
                                                        <mml:mi>i</mml:mi>
                                                    </mml:msub>
                                                </mml:mrow>
                                            </mml:munder>
                                            <mml:mrow>
                                                <mml:mrow>
                                                    <mml:mo>{</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mtable columnalign="left">
                                                            <mml:mtr columnalign="left">
                                                                <mml:mtd columnalign="left">
                                                                    <mml:mrow>
                                                                        <mml:mn>1</mml:mn>
                                                                        <mml:mo>,</mml:mo>
                                                                    </mml:mrow>
                                                                </mml:mtd>
                                                                <mml:mtd columnalign="left">
                                                                    <mml:mrow>
                                                                        <mml:mtext>if</mml:mtext>
                                                                        <mml:mspace width=".2em"/>
                                                                        <mml:mi>l</mml:mi>
                                                                        <mml:mi>a</mml:mi>
                                                                        <mml:mi>b</mml:mi>
                                                                        <mml:mi>e</mml:mi>
                                                                        <mml:msub>
                                                                            <mml:mi>l</mml:mi>
                                                                            <mml:mi>j</mml:mi>
                                                                        </mml:msub>
                                                                        <mml:mo>=</mml:mo>
                                                                        <mml:msub>
                                                                            <mml:mrow>
                                                                                <mml:mtext>dom</mml:mtext>
                                                                            </mml:mrow>
                                                                            <mml:mi>i</mml:mi>
                                                                        </mml:msub>
                                                                    </mml:mrow>
                                                                </mml:mtd>
                                                            </mml:mtr>
                                                            <mml:mtr columnalign="left">
                                                                <mml:mtd columnalign="left">
                                                                    <mml:mrow>
                                                                        <mml:mn>0</mml:mn>
                                                                        <mml:mo>,</mml:mo>
                                                                    </mml:mrow>
                                                                </mml:mtd>
                                                                <mml:mtd columnalign="left">
                                                                    <mml:mrow>
                                                                        <mml:mtext>otherwise</mml:mtext>
                                                                    </mml:mrow>
                                                                </mml:mtd>
                                                            </mml:mtr>
                                                        </mml:mtable>
                                                    </mml:mrow>
                                                </mml:mrow>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>n</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mfrac>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>where 
                    <italic toggle="yes">C</italic>
                    <sub>
                        <italic toggle="yes">i</italic>
                    </sub> = {
                    <italic toggle="yes">j</italic>|cell
                    <sub>
                        <italic toggle="yes">j</italic>
                    </sub> &#x2208; cluster
                    <sub>
                        <italic toggle="yes">i</italic>
                    </sub>}, 
                    <italic toggle="yes">label</italic>
                    <sub>
                        <italic toggle="yes">j</italic>
                    </sub> is the cell type of 
                    <italic toggle="yes">cell</italic>
                    <sub>
                        <italic toggle="yes">j</italic>
                    </sub>, 
                    <italic toggle="yes">n</italic>
                    <sub>
                        <italic toggle="yes">i</italic>
                    </sub> = |
                    <italic toggle="yes">C</italic>
                    <sub>
                        <italic toggle="yes">i</italic>
                    </sub>| is the number of cells in cluster 
                    <italic toggle="yes">i</italic>, and</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="math6">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mrow>
                                        <mml:mtext>dom</mml:mtext>
                                    </mml:mrow>
                                    <mml:mi>i</mml:mi>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mi>arg</mml:mi>
                                <mml:mo>&#x2061;</mml:mo>
                                <mml:munder>
                                    <mml:mrow>
                                        <mml:mi>max</mml:mi>
                                        <mml:mo>&#x2061;</mml:mo>
                                    </mml:mrow>
                                    <mml:mi>l</mml:mi>
                                </mml:munder>
                                <mml:mstyle displaystyle="true">
                                    <mml:munder>
                                        <mml:mo>&#x2211;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>j</mml:mi>
                                            <mml:mo>&#x2208;</mml:mo>
                                            <mml:msub>
                                                <mml:mi>C</mml:mi>
                                                <mml:mi>i</mml:mi>
                                            </mml:msub>
                                        </mml:mrow>
                                    </mml:munder>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>{</mml:mo>
                                            <mml:mrow>
                                                <mml:mtable columnalign="left">
                                                    <mml:mtr columnalign="left">
                                                        <mml:mtd columnalign="left">
                                                            <mml:mrow>
                                                                <mml:mn>1</mml:mn>
                                                                <mml:mo>,</mml:mo>
                                                            </mml:mrow>
                                                        </mml:mtd>
                                                        <mml:mtd columnalign="left">
                                                            <mml:mrow>
                                                                <mml:mtext>if</mml:mtext>
                                                                <mml:mspace width="0.2em"/>
                                                                <mml:mi>l</mml:mi>
                                                                <mml:mi>a</mml:mi>
                                                                <mml:mi>b</mml:mi>
                                                                <mml:mi>e</mml:mi>
                                                                <mml:msub>
                                                                    <mml:mi>l</mml:mi>
                                                                    <mml:mi>j</mml:mi>
                                                                </mml:msub>
                                                                <mml:mo>=</mml:mo>
                                                                <mml:mi>l</mml:mi>
                                                            </mml:mrow>
                                                        </mml:mtd>
                                                    </mml:mtr>
                                                    <mml:mtr columnalign="left">
                                                        <mml:mtd columnalign="left">
                                                            <mml:mrow>
                                                                <mml:mn>0</mml:mn>
                                                                <mml:mo>,</mml:mo>
                                                            </mml:mrow>
                                                        </mml:mtd>
                                                        <mml:mtd columnalign="left">
                                                            <mml:mrow>
                                                                <mml:mtext>otherwise</mml:mtext>
                                                            </mml:mrow>
                                                        </mml:mtd>
                                                    </mml:mtr>
                                                </mml:mtable>
                                            </mml:mrow>
                                        </mml:mrow>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>is the dominant cell type in cluster 
                    <italic toggle="yes">C</italic>
                    <sub>
                        <italic toggle="yes">i</italic>
                    </sub>.</p>
                <p>In addition to the cluster purity metric, we computed the Adjusted Mutual Information (AMI)
                    <sup>
                        <xref ref-type="bibr" rid="ref-26">26</xref>
                    </sup>, an information theoretic measure of clustering accuracy which accounts for true positives (two cells of the same type in the same cluster) being caused by chance. The AMI between a clustering 
                    <italic toggle="yes">C</italic> and the true labels 
                    <italic toggle="yes">L</italic> is given by</p>
                <p>
                    <disp-formula id="e2">
                        <mml:math display="block" id="math7">
                            <mml:mrow>
                                <mml:mi>A</mml:mi>
                                <mml:mi>M</mml:mi>
                                <mml:mi>I</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>L</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>C</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mfrac>
                                    <mml:mrow>
                                        <mml:mi>M</mml:mi>
                                        <mml:mi>I</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>L</mml:mi>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>C</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mi>E</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>[</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>M</mml:mi>
                                                <mml:mi>I</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>L</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>C</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>]</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mtext mathvariant="italic">max</mml:mtext>
                                        <mml:mo>&#x2061;</mml:mo>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>H</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mi>L</mml:mi>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>H</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mi>C</mml:mi>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mi>E</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>[</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>M</mml:mi>
                                                <mml:mi>I</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>L</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>C</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>]</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                </mml:mfrac>
                                <mml:mo>,</mml:mo>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>where 
                    <italic toggle="yes">MI</italic>(
                    <italic toggle="yes">a</italic>, 
                    <italic toggle="yes">b</italic>) is the mutual information between labellings 
                    <italic toggle="yes">a</italic> and 
                    <italic toggle="yes">b</italic>, 
                    <italic toggle="yes">H</italic>(
                    <italic toggle="yes">a</italic>) is the entropy of clustering 
                    <italic toggle="yes">a</italic>, and 
                    <italic toggle="yes">E</italic>[
                    <italic toggle="yes">&#x00b7;</italic>] denotes the expectation.</p>
                <p>We do not compare the clusterings using the Rand index, as that measure penalizes for so-called 
                    <italic toggle="yes">false negatives</italic> (two cells of the same cell type but in different clusters), which is undesirable as cells from the same cell type might be rightly split into several clusters when a novel cell type is identified.</p>
            </sec>
            <sec>
                <title>Construction of the smoothing kernel</title>
                <p>The PPI graph from which the diffusion kernel was derived was constructed using data from string-db
                    <sup>
                        <xref ref-type="bibr" rid="ref-10">10</xref>
                    </sup>. For each pair of proteins, string-db provides a 
                    <italic toggle="yes">combined interaction score</italic>, which is a score indicating how confident we can be in the interaction between the proteins, given the different kinds of evidence string-db collates. We subset the links to only those above the 90th percentile of combined interaction scores, keeping only the 10% most confident interactions. For mouse, that is 1,020,816 interactions among 17,013 genes; for human, 852,722 interactions among 17,467 genes.</p>
            </sec>
            <sec>
                <title>MAGIC and scImpute parameters</title>
                <p>For all the results presented in this paper, scImpute was run using the default parameters (
                    <monospace>drop_thre = 0.5</monospace>). For MAGIC, we tested several values of the diffusion time parameter (
                    <italic toggle="yes">T</italic> = {
                    <italic toggle="yes">1, 2, 4, 8, 16</italic>}). Unlike 
                    <italic toggle="yes">netSmooth</italic>, for MAGIC the proportion of samples in robust clusters and the cluster purities were anti-correlated; thus we picked the T value that gave the best cluster purities as the best MAGIC parameter. The chosen T values are given in 
                    <xref ref-type="table" rid="T3">Table 3</xref>. We used MAGIC version 0.1
                    <xref ref-type="fn" rid="FN4">
                        <sup>iv</sup>
                    </xref> and scImpute version 0.0.2
                    <xref ref-type="fn" rid="FN5">
                        <sup>v</sup>
                    </xref>.</p>
                <table-wrap id="T3" orientation="portrait" position="anchor">
                    <label>Table 3. </label>
                    <caption>
                        <title>Optimal diffusion time values for MAGIC.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1">Dataset</th>
                                <th align="left" colspan="1" rowspan="1">Optimal T</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1">Hematopoiesis</td>
                                <td align="right" colspan="1" rowspan="1">1</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1">Embryonic cells</td>
                                <td align="right" colspan="1" rowspan="1">4</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1">Glioblastoma</td>
                                <td align="right" colspan="1" rowspan="1">2</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
        </sec>
        <sec>
            <title>Data and software availability</title>
            <p>The hematopoiesis dataset
                <sup>
                    <xref ref-type="bibr" rid="ref-15">15</xref>
                </sup> was obtained from the Gene Expression Omnibus
                <sup>
                    <xref ref-type="bibr" rid="ref-27">27</xref>
                </sup>. The embryonic
                <sup>
                    <xref ref-type="bibr" rid="ref-18">18</xref>
                </sup> and glioblastoma
                <sup>
                    <xref ref-type="bibr" rid="ref-19">19</xref>
                </sup> datasets were obtained from 
                <italic toggle="yes">conquer</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref-28">28</xref>
                </sup>, a repository of uniformly processed scRNA-seq datasets. The datasets are available publicly, see 
                <xref ref-type="table" rid="T1">Table 1</xref>. The analysis for this paper was done using the companion 
                <italic toggle="yes">netSmooth</italic> R-package, which is available online under Artistic-2.0 license: 
                <ext-link ext-link-type="uri" xlink:href="https://github.com/BIMSBbioinfo/netSmooth">https://github.com/BIMSBbioinfo/netSmooth</ext-link>. The 
                <italic toggle="yes">netSmooth</italic> R package was developed and tested under R version 3.4.2.</p>
            <p>Archived source code of netSmooth at time of publication: 
                <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.1119064">https://doi.org/10.5281/zenodo.1119064</ext-link>
                <sup>
                    <xref ref-type="bibr" rid="ref-29">29</xref>
                </sup>
            </p>
            <p>License: GNU 3.0</p>
        </sec>
    </body>
    <back>
        <ack>
            <title>Acknowledgements</title>
            <p>We would like to thank Vedran Franke, Bora Uyar and Brendan Osberg for valuable comments and input for the development of this manuscript.</p>
        </ack>
        <sec id="SM1" sec-type="supplementary-material">
            <title>Supplementary material</title>
            <p id="SF1">
                <bold>Supplementary Figure 1: PCA plots of the HSPC dataset.</bold> A) no preprocessing, B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/99f282f4-0c11-4751-9885-bb37b4d70a44.pdf">Click here to access the data</ext-link>.</p>
            <p id="SF2">
                <bold>Supplementary Figure 2: t-SNE plots of the HSPC dataset.</bold> A) no preprocessing, B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/d6bf6edf-0f65-42ea-9b1f-25807e54fde0.pdf">Click here to access the data</ext-link>.</p>
            <p id="SF3">
                <bold>Supplementary Figure 3: Single cells from the embryonic development dataset were clustered using the robust clustering procedure, and the 500 most differentially expressed genes (by edgeR-QLF test adjusted P value) in any of the discovered clusters are shown in a heatmap, as well as cluster assignments and cell types.</bold> A) raw (no imputation), B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) missing values imputed using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/079301c6-ed58-4d24-b549-187c82d86785.png">Click here to access the data</ext-link>.</p>
            <p id="SF4">
                <bold>Supplementary Figure 4: t-SNE plots of the embryonic development dataset.</bold> A) no preprocessing, B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/9f4b95b6-3841-4e4b-8772-5df61057b51f.pdf">Click here to access the data</ext-link>.</p>
            <p id="SF5">
                <bold>Supplementary Figure 5: The proportion of genes with 0 counts is a proxy for technical dropouts.</bold> A) no preprocessing, B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/c0e19181-e7a2-4afa-af64-2e326f7b24b3.pdf">Click here to access the data</ext-link>.</p>
            <p id="SF6">
                <bold>Supplementary Figure 6: Single cells from the glioblastoma dataset were clustered using the robust clustering procedure, and the 500 most differentially expressed genes (by edgeR-QLF test adjusted P value) in any of the discovered clusters are shown in a heatmap, as well as cluster assignments and cell types.</bold> A) raw (no imputation), B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) missing values imputed using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/8b2dffda-2b11-492c-a7fa-3da806075aa4.png">Click here to access the data</ext-link>.</p>
            <p id="SF7">
                <bold>Supplementary Figure 7: PCA plots of the glioblastoma dataset.</bold> A) no preprocessing, B) after application of 
                <italic toggle="yes">netSmooth</italic>, C) using scImpute, and D) after application of MAGIC.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/ac1e741c-72cd-4ea6-9f8e-1644a5b6c60b.pdf">Click here to access the data</ext-link>.</p>
            <p id="SF8">
                <bold>Supplementary Figure 8: Cluster purity by smoothing parameter.</bold> A) For the hematopoiesis dataset with a directional (signed) graph, where inhibitory interactions have a negative edge weight. B) For the hematopoiesis dataset using a gene network with only genes that have a cell-type specific expression in any cell type. C) In the glioblastoma dataset using a gene network from HumanNet.</p>
            <p>
                <ext-link ext-link-type="uri" xlink:href="https://f1000researchdata.s3.amazonaws.com/supplementary/13511/e56b6af9-6409-40c5-9837-9f08d08da5eb.pdf">Click here to access the data</ext-link>.</p>
        </sec>
        <fn-group>
            <fn id="FN1">
                <label>i</label>
                <p>Most interactions in string-db do not specify the direction or nature of the interaction.</p>
            </fn>
            <fn id="FN2">
                <label>ii</label>
                <p>Version 1.4.0, available from Bioconductor: 
                    <ext-link ext-link-type="uri" xlink:href="https://bioconductor.org/packages/release/bioc/html/clusterExperiment.html">https://bioconductor.org/packages/release/bioc/html/clusterExperiment.html</ext-link>
                </p>
            </fn>
            <fn id="FN3">
                <label>iii</label>
                <p>Version 1.2.1, available from CRAN: 
                    <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/entropy/index.html">https://cran.r-project.org/web/packages/entropy/index.html</ext-link>
                </p>
            </fn>
            <fn id="FN4">
                <label>iv</label>
                <p>Available from GitHub: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/pkathail/magic">https://github.com/pkathail/magic</ext-link>.</p>
            </fn>
            <fn id="FN5">
                <label>v</label>
                <p>Available from GitHub: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/Vivianstats/scImpute">https://github.com/Vivianstats/scImpute</ext-link>.</p>
            </fn>
        </fn-group>
        <ref-list>
            <ref id="ref-1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wagner</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Regev</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Yosef</surname>
                            <given-names>N</given-names>
                        </name>
</person-group>:
                    <article-title>Revealing the vectors of cellular identity with single-cell genomics.</article-title>
                    <source>

                        <italic toggle="yes">Nat Biotechnol.</italic>
</source>
                    <year>2016</year>;<volume>34</volume>(<issue>11</issue>):<fpage>1145</fpage>&#x2013;<lpage>1160</lpage>.
                    <pub-id pub-id-type="pmid">27824854</pub-id>
                    <pub-id pub-id-type="doi">10.1038/nbt.3711</pub-id>
                    <pub-id pub-id-type="pmcid">5465644</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kharchenko</surname>
                            <given-names>PV</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Silberstein</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Scadden</surname>
                            <given-names>DT</given-names>
                        </name>
</person-group>:
                    <article-title>Bayesian approach to single-cell differential expression analysis.</article-title>
                    <source>

                        <italic toggle="yes">Nat Methods.</italic>
</source>
                    <year>2014</year>;<volume>11</volume>(<issue>7</issue>):<fpage>740</fpage>&#x2013;<lpage>742</lpage>.
                    <pub-id pub-id-type="pmid">24836921</pub-id>
                    <pub-id pub-id-type="doi">10.1038/nmeth.2967</pub-id>
                    <pub-id pub-id-type="pmcid">4112276</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wu</surname>
                            <given-names>AR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Neff</surname>
                            <given-names>NF</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kalisky</surname>
                            <given-names>T</given-names>
                        </name>
</person-group>:
                    <article-title>Quantitative assessment of single-cell RNA-sequencing methods.</article-title>
                    <source>

                        <italic toggle="yes">Nat Methods.</italic>
</source>
                    <year>2014</year>;<volume>11</volume>(<issue>1</issue>):<fpage>41</fpage>&#x2013;<lpage>46</lpage>.
                    <pub-id pub-id-type="pmid">24141493</pub-id>
                    <pub-id pub-id-type="doi">10.1038/nmeth.2694</pub-id>
                    <pub-id pub-id-type="pmcid">4022966</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Pierson</surname>
                            <given-names>E</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Yau</surname>
                            <given-names>C</given-names>
                        </name>
</person-group>:
                    <article-title>ZIFA: Dimensionality reduction for zero-inflated single-cell gene expression analysis.</article-title>
                    <source>

                        <italic toggle="yes">Genome Biol.</italic>
</source>
                    <year>2015</year>;<volume>16</volume>:<fpage>241</fpage>.
                    <pub-id pub-id-type="pmid">26527291</pub-id>
                    <pub-id pub-id-type="doi">10.1186/s13059-015-0805-z</pub-id>
                    <pub-id pub-id-type="pmcid">4630968</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lin</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Troup</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ho</surname>
                            <given-names>JW</given-names>
                        </name>
</person-group>:
                    <article-title>CIDR: Ultrafast and accurate clustering through imputation for single-cell RNA-seq data.</article-title>
                    <source>

                        <italic toggle="yes">Genome Biol.</italic>
</source>
                    <year>2017</year>;<volume>18</volume>(<issue>1</issue>):<fpage>59</fpage>.
                    <pub-id pub-id-type="pmid">28351406</pub-id>
                    <pub-id pub-id-type="doi">10.1186/s13059-017-1188-0</pub-id>
                    <pub-id pub-id-type="pmcid">5371246</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>WV</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>JJ</given-names>
                        </name>
</person-group>:
                    <article-title>scImpute: Accurate and robust imputation for single cell RNA-seq data.</article-title>
                    <source>

                        <italic toggle="yes">bioRxiv.</italic>
</source>
                    <year>2017</year>.
                    <pub-id pub-id-type="doi">10.1101/141598</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>van Dijk</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nainys</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sharma</surname>
                            <given-names>R</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>MAGIC: A diffusion-based imputation method reveals gene-gene interactions in single-cell RNA-sequencing data.</article-title>
                    <source>

                        <italic toggle="yes">bioRxiv.</italic>
</source>
                    <year>2017</year>.
                    <pub-id pub-id-type="doi">10.1101/111591</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Bhardwaj</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lu</surname>
                            <given-names>H</given-names>
                        </name>
</person-group>:
                    <article-title>Correlation between gene expression profiles and protein-protein interactions within and across genomes.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2005</year>;<volume>21</volume>(<issue>11</issue>):<fpage>2730</fpage>&#x2013;<lpage>2738</lpage>.
                    <pub-id pub-id-type="pmid">15797912</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/bti398</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Fraser</surname>
                            <given-names>HB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hirsh</surname>
                            <given-names>AE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wall</surname>
                            <given-names>DP</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Coevolution of gene expression among interacting proteins.</article-title>
                    <source>

                        <italic toggle="yes">Proc Natl Acad Sci U S A.</italic>
</source>
                    <year>2004</year>;<volume>101</volume>(<issue>24</issue>):<fpage>9033</fpage>&#x2013;<lpage>9038</lpage>.
                    <pub-id pub-id-type="pmid">15175431</pub-id>
                    <pub-id pub-id-type="doi">10.1073/pnas.0402591101</pub-id>
                    <pub-id pub-id-type="pmcid">439012</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Szklarczyk</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Morris</surname>
                            <given-names>JH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cook</surname>
                            <given-names>H</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The STRING database in 2017: quality-controlled protein-protein association networks, made broadly accessible.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2017</year>;<volume>45</volume>(<issue>D1</issue>):<fpage>D362</fpage>&#x2013;<lpage>D368</lpage>.
                    <pub-id pub-id-type="pmid">27924014</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkw937</pub-id>
                    <pub-id pub-id-type="pmcid">5210637</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>I</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Blom</surname>
                            <given-names>UM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>PI</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Prioritizing candidate disease genes by network-based boosting of genome-wide association data.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2011</year>;<volume>21</volume>(<issue>7</issue>):<fpage>1109</fpage>&#x2013;<lpage>1121</lpage>.
                    <pub-id pub-id-type="pmid">21536720</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.118992.110</pub-id>
                    <pub-id pub-id-type="pmcid">3129253</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hofree</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Shen</surname>
                            <given-names>JP</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Carter</surname>
                            <given-names>H</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Network-based stratification of tumor mutations.</article-title>
                    <source>

                        <italic toggle="yes">Nat Methods.</italic>
</source>
                    <year>2013</year>;<volume>10</volume>(<issue>11</issue>):<fpage>1108</fpage>&#x2013;<lpage>1115</lpage>.
                    <pub-id pub-id-type="pmid">24037242</pub-id>
                    <pub-id pub-id-type="doi">10.1038/nmeth.2651</pub-id>
                    <pub-id pub-id-type="pmcid">3866081</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vandin</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Upfal</surname>
                            <given-names>E</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Raphael</surname>
                            <given-names>BJ</given-names>
                        </name>
</person-group>:
                    <article-title>Algorithms for detecting significantly mutated pathways in cancer.</article-title>
                    <source>

                        <italic toggle="yes">J Comput Biol.</italic>
</source>
                    <year>2011</year>;<volume>18</volume>(<issue>3</issue>):<fpage>507</fpage>&#x2013;<lpage>522</lpage>.
                    <pub-id pub-id-type="pmid">21385051</pub-id>
                    <pub-id pub-id-type="doi">10.1089/cmb.2010.0265</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>D&#x00f8;rum</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Snipen</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Solheim</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Smoothing gene expression data with network information improves consistency of regulated genes.</article-title>
                    <source>

                        <italic toggle="yes">Stat Appl Genet Mol Biol.</italic>
</source>
                    <year>2011</year>;<volume>10</volume>(<issue>1</issue>): pii: /j/sagmb.2011.10.issue-1/sagmb.2011.10.1.1618/sagmb.2011.10.1.1618.xml.
                    <pub-id pub-id-type="pmid">23089828</pub-id>
                    <pub-id pub-id-type="doi">10.2202/1544-6115.1618</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Nestorowa</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hamey</surname>
                            <given-names>FK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pijuan Sala</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation.</article-title>
                    <source>

                        <italic toggle="yes">Blood.</italic>
</source>
                    <year>2016</year>;<volume>128</volume>(<issue>8</issue>):<fpage>e20</fpage>&#x2013;<lpage>e31</lpage>.
                    <pub-id pub-id-type="pmid">27365425</pub-id>
                    <pub-id pub-id-type="doi">10.1182/blood-2016-05-716480</pub-id>
                    <pub-id pub-id-type="pmcid">5305050</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Purdom</surname>
                            <given-names>E</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Risso</surname>
                            <given-names>D</given-names>
                        </name>
</person-group>:
                    <article-title>clusterExperiment: Compare Clusterings for Single-Cell Sequencing</article-title>. <year>2017</year>; R package version 1.2.0.
                    <ext-link ext-link-type="uri" xlink:href="https://bioconductor.statistik.tu-dortmund.de/packages/3.5/bioc/manuals/clusterExperiment/man/clusterExperiment.pdf">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Robinson</surname>
                            <given-names>MD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>McCarthy</surname>
                            <given-names>DJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Smyth</surname>
                            <given-names>GK</given-names>
                        </name>
</person-group>:
                    <article-title>edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2010</year>;<volume>26</volume>(<issue>1</issue>):<fpage>139</fpage>&#x2013;<lpage>140</lpage>.
                    <pub-id pub-id-type="pmid">19910308</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/btp616</pub-id>
                    <pub-id pub-id-type="pmcid">2796818</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Deng</surname>
                            <given-names>Q</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ramsk&#x00f6;ld</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Reinius</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells.</article-title>
                    <source>

                        <italic toggle="yes">Science.</italic>
</source>
                    <year>2014</year>;<volume>343</volume>(<issue>6167</issue>):<fpage>193</fpage>&#x2013;<lpage>196</lpage>.
                    <pub-id pub-id-type="pmid">24408435</pub-id>
                    <pub-id pub-id-type="doi">10.1126/science.1245316</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Patel</surname>
                            <given-names>AP</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tirosh</surname>
                            <given-names>I</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Trombetta</surname>
                            <given-names>JJ</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Single-cell RNA-seq highlights intratumoral heterogeneity in primary glioblastoma.</article-title>
                    <source>

                        <italic toggle="yes">Science.</italic>
</source>
                    <year>2014</year>;<volume>344</volume>(<issue>6190</issue>):<fpage>1396</fpage>&#x2013;<lpage>1401</lpage>.
                    <pub-id pub-id-type="pmid">24925914</pub-id>
                    <pub-id pub-id-type="doi">10.1126/science.1254257</pub-id>
                    <pub-id pub-id-type="pmcid">4123637</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Petryszak</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Keays</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tang</surname>
                            <given-names>YA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Expression Atlas update&#x2014;an integrated database of gene and protein expression in humans, animals and plants.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2016</year>;<volume>44</volume>(<issue>D1</issue>):<fpage>D746</fpage>&#x2013;<lpage>D752</lpage>.
                    <pub-id pub-id-type="pmid">26481351</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkv1045</pub-id>
                    <pub-id pub-id-type="pmcid">4702781</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>I</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Blom</surname>
                            <given-names>UM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>PI</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Prioritizing candidate disease genes by network-based boosting of genome-wide association data.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2011</year>;<volume>21</volume>(<issue>7</issue>):<fpage>1109</fpage>&#x2013;<lpage>1121</lpage>.
                    <pub-id pub-id-type="pmid">21536720</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.118992.110</pub-id>
                    <pub-id pub-id-type="pmcid">3129253</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-22">
                <label>22</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>McCarthy</surname>
                            <given-names>DJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Campbell</surname>
                            <given-names>KR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lun</surname>
                            <given-names>AT</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Scater: pre-processing, quality control, normalization and visualization of single-cell RNA-seq data in R.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2017</year>;<volume>33</volume>(<issue>8</issue>):<fpage>1179</fpage>&#x2013;<lpage>1186</lpage>.
                    <pub-id pub-id-type="pmid">28088763</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/btw777</pub-id>
                    <pub-id pub-id-type="pmcid">5408845</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-23">
                <label>23</label>
                <mixed-citation publication-type="book">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hastie</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tibshirani</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Friedman</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>The Elements of Statistical Learning</article-title>. <year>2001</year>; Springer Series in Statistics. Springer New York Inc., New York, NY USA.
                    <ext-link ext-link-type="uri" xlink:href="https://books.google.co.in/books?id=VRzITwgNV2UC&amp;pg=frontcover">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-24">
                <label>24</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>van der Maaten</surname>
                            <given-names>LJP</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hinton</surname>
                            <given-names>GE</given-names>
                        </name>
</person-group>:
                    <article-title>Visualizing high-dimensional data using t-SNE.</article-title>
                    <source>

                        <italic toggle="yes">J Mach Learn Res.</italic>
</source>
                    <year>2008</year>;<volume>9</volume>:<fpage>2579</fpage>&#x2013;<lpage>2605</lpage>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-25">
                <label>25</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hausser</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Strimmer</surname>
                            <given-names>K</given-names>
                        </name>
</person-group>:
                    <article-title>entropy: Estimation of Entropy, Mutual Information and Related Quantities</article-title>. <year>2014</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/entropy/entropy.pdf">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-26">
                <label>26</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vinh</surname>
                            <given-names>NX</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Epps</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bailey</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance.</article-title>
                    <source>

                        <italic toggle="yes">J Mach Learn Res.</italic>
</source>
                    <year>2010</year>;<volume>11</volume>:<fpage>2837</fpage>&#x2013;<lpage>2854</lpage>.
                    <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/ft_gateway.cfm?id=1953024&amp;ftid=927721&amp;dwn=1&amp;CFID=1019407233&amp;CFTOKEN=50901951">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-27">
                <label>27</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Edgar</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Domrachev</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lash</surname>
                            <given-names>AE</given-names>
                        </name>
</person-group>:
                    <article-title>Gene Expression Omnibus: NCBI gene expression and hybridization array data repository.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2002</year>;<volume>30</volume>(<issue>1</issue>):<fpage>207</fpage>&#x2013;<lpage>210</lpage>.
                    <pub-id pub-id-type="pmid">11752295</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/30.1.207</pub-id>
                    <pub-id pub-id-type="pmcid">99122</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-28">
                <label>28</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Soneson</surname>
                            <given-names>C</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Robinson</surname>
                            <given-names>MD</given-names>
                        </name>
</person-group>:
                    <article-title>Bias, robustness and scalability in differential expression analysis of single-cell RNA-seq data.</article-title>
                    <source>

                        <italic toggle="yes">bioRxiv.</italic>
</source>
                    <year>2017</year>.
                    <pub-id pub-id-type="doi">10.1101/143289</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-29">
                <label>29</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ronen</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Akalin</surname>
                            <given-names>A</given-names>
                        </name>
</person-group>:
                    <article-title>BIMSBbioinfo/netSmooth: first release for zenodo (Version v0.1.0).</article-title>
                    <source>

                        <italic toggle="yes">Zenodo.</italic>
</source>
                    <year>2017</year>.
                    <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.5281/zenodo.1119064">Data Source</ext-link>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
</article>
