<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="other" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.22292.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Software Tool Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>ccbmlib &#x2013; a Python package for modeling Tanimoto similarity value distributions</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: 2 approved]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Vogt</surname>
                        <given-names>Martin</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-3931-9516</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Bajorath</surname>
                        <given-names>J&#x00fc;rgen</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-0557-5714</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Department of Life Science Informatics, B-IT, University of Bonn, Endenicher Allee 19c, Bonn, NRW, 53115, Germany</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:bajorath@bit.uni-bonn.de">bajorath@bit.uni-bonn.de</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>10</day>
                <month>2</month>
                <year>2020</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2020</year>
            </pub-date>
            <volume>9</volume>
            <elocation-id>Chem Inf Sci-100</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>4</day>
                    <month>2</month>
                    <year>2020</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2020 Vogt M and Bajorath J</copyright-statement>
                <copyright-year>2020</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/9-100/pdf"/>
            <abstract>
                <p>The ccbmlib Python package is a collection of modules for modeling similarity value distributions based on Tanimoto coefficients for fingerprints available in RDKit. It can be used to assess the statistical significance of Tanimoto coefficients and evaluate how molecular similarity is reflected when different fingerprint representations are used. Significance measures derived from 
                    <italic toggle="yes">p</italic>-values allow a quantitative comparison of similarity scores obtained from different fingerprint representations that might have very different value ranges. Furthermore, the package models conditional distributions of similarity coefficients for a given reference compound. The conditional significance score estimates where a test compound would be ranked in a similarity search. The models are based on the statistical analysis of feature distributions and feature correlations of fingerprints of a reference database. The resulting models have been evaluated for 11 RDKit fingerprints, taking a collection of ChEMBL compounds as a reference data set. For most fingerprints, highly accurate models were obtained, with differences of 1% or less for Tanimoto coefficients indicating high similarity.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>Bernoulli model</kwd>
                <kwd>fingerprints</kwd>
                <kwd>p-value</kwd>
                <kwd>similarity value distributions</kwd>
                <kwd>Tanimoto coefficient</kwd>
            </kwd-group>
            <funding-group>
                <funding-statement>The author(s) declared that no grants were involved in supporting this work.</funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="intro">
            <title>Introduction</title>
            <p>The quantitative assessment of molecular similarity is a central concept in chemoinformatics
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref-4">4</xref>
                </sup>. It forms the basis of similarity searching and ligand-based virtual screening to identify novel molecules in large databases with biological properties similar to given reference compounds
                <sup>
                    <xref ref-type="bibr" rid="ref-5">5</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref-7">7</xref>
                </sup>. Assessment of molecular similarity plays a central role in chemical space analysis and the study of activity landscapes where chemical space projections onto low-dimensional representations are based on quantified similarities
                <sup>
                    <xref ref-type="bibr" rid="ref-8">8</xref>,
                    <xref ref-type="bibr" rid="ref-9">9</xref>
                </sup>.</p>
            <p>The use of fingerprints and the Tanimoto coefficient
                <sup>
                    <xref ref-type="bibr" rid="ref-10">10</xref>
                </sup> (Tc), also known as the Jaccard index
                <sup>
                    <xref ref-type="bibr" rid="ref-11">11</xref>
                </sup>, represents one of the most popular methods for quantifying molecular similarity
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref-4">4</xref>
                </sup>. Fingerprints encode structural features of a molecule in a binary vector format and the Tc quantifies the overlap of features of two molecules as the ratio of the number of common features to the total number of features present in at least one of the two fingerprints. The Tc has the value range 0 to 1 and can be interpreted as the percentage of features shared by two molecules. However, whether a given percentage of overlap should be considered a significant similarity of two molecules depends on the fingerprint design and the global frequency of encoded features. Fingerprint designs might be categorized as dense or sparse. Dense fingerprints have a relatively small dimensionality of at most a few thousand features, but a significant fraction of these might be present in any given molecule. On the other hand, sparse fingerprints can have a theoretically infinite set of features (typical integer encodings allow up to 4 billion features). However, only tens or hundreds of these features might be found in a single molecule. Consequently, sparse fingerprint representations generally lead to smaller Tc values than dense fingerprints.</p>
            <p>While it is not meaningful to compare Tc values of different fingerprint designs directly, statistical approaches can be applied to assess the significance of Tc values with respect to a reference data set. By using the distribution of Tc values obtained from comparing random compounds as a reference, Tc value significance can be determined by calculating the probability of obtaining a given Tc or higher value by chance. In statistical terms, the reference distribution corresponds to a null hypothesis and the significance measure is known as 
                <italic toggle="yes">p</italic>-value or 
                <italic toggle="yes">p</italic>-score. This score has the range 0 to 1 and indicates the probability that a given Tc or higher value would be obtained by chance. Thus, smaller 
                <italic toggle="yes">p</italic>-values indicate higher significance. Here, we will use the measure 1 &#x2013; (
                <italic toggle="yes">p</italic>-value) to assess significance. Although it is in principle possible to obtain Tc distributions by random sampling, this process is time consuming. Instead, the ccbmlib package presented here provides methods for the generation of Tc distribution models that are based on the statistical analysis of feature frequencies and feature correlations between fingerprints for a reference data set. Some mathematical models of Tc-value distributions
                <sup>
                    <xref ref-type="bibr" rid="ref-12">12</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref-14">14</xref>
                </sup> have been introduced in the past. The ccbmlib implementation makes use of the conditional correlated Bernoulli model (CCBM) that has been shown to accurately model Tc distributions for a variety of fingerprint designs
                <sup>
                    <xref ref-type="bibr" rid="ref-13">13</xref>,
                    <xref ref-type="bibr" rid="ref-14">14</xref>
                </sup>. An unconditional distribution model accounts for Tc distributions of fingerprints of randomly selected compounds. However, it is of particular interest to model distributions where one compound fingerprint is used as a reference, which forms the basis of similarity searching. 
                <italic toggle="yes">P</italic>-values obtained from such conditional distribution models efficiently estimate how high a test compound would be ranked in a similarity search with respect to a given reference compound. Hence, conditional models can be used to predict similarity search performance
                <sup>
                    <xref ref-type="bibr" rid="ref-13">13</xref>,
                    <xref ref-type="bibr" rid="ref-14">14</xref>
                </sup>.</p>
            <p>The implementation presented here is based on RDKit
                <sup>
                    <xref ref-type="bibr" rid="ref-15">15</xref>
                </sup> and provides methods for statistically analyzing fingerprint feature distributions and building models for fingerprints implemented in RDKit. Methods are provided for calculating significance from Tc values, which enable a meaningful comparison of Tc values calculated using fingerprints of different design. The CCBM requires knowledge of the frequency of individual features as well as their pairwise covariances. This statistical analysis needs to be carried out once for each reference data set and fingerprint design. This step can be time consuming for large data sets. The ccbmlib implementation stores resulting statistics permanently to avoid redundant calculations. For our reference implementation and evaluation, compounds from ChEMBL (release 25)
                <sup>
                    <xref ref-type="bibr" rid="ref-16">16</xref>
                </sup> were selected as a representative sample of bioactive chemical space.</p>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <sec>
                <title>Fingerprint representations</title>
                <p>
                    <ext-link ext-link-type="uri" xlink:href="https://www.rdkit.org/">RDKit</ext-link> provides implementations for a variety of fingerprints. Available fingerprints are reported in 
                    <xref ref-type="table" rid="T1">Table 1</xref>. The atom pair fingerprint encodes typed pairs of atoms and their bond distance and is based on the description given by Carhart and Smith
                    <sup>
                        <xref ref-type="bibr" rid="ref-17">17</xref>
                    </sup>, representing a sparse fingerprint. The Avalon fingerprint
                    <sup>
                        <xref ref-type="bibr" rid="ref-18">18</xref>
                    </sup> is a hashed fingerprint enumerating paths and feature classes. MACCS (Molecular ACCess System) keys record the presence or absence of each of 166 substructural features from a predefined dictionary
                    <sup>
                        <xref ref-type="bibr" rid="ref-19">19</xref>
                    </sup>. Morgan fingerprints are an RDKit implementation of extended connectivity fingerprints (ECFPs)
                    <sup>
                        <xref ref-type="bibr" rid="ref-20">20</xref>
                    </sup> and enumerate atom environments up to a selected radius. We calculated Morgan fingerprints for radius 1 and 2 corresponding to ECFP with diameter 2 and 4, respectively. The topological torsion fingerprints encode sequences of four bonded atoms in a sparse fingerprint
                    <sup>
                        <xref ref-type="bibr" rid="ref-21">21</xref>
                    </sup>. The RDKit fingerprint is a hashed substructure/path fingerprint similar to the Daylight fingerprints
                    <sup>
                        <xref ref-type="bibr" rid="ref-22">22</xref>
                    </sup>. Atom pairs, Morgan fingerprints, and the topological torsion fingerprint result in sparse vector representations whose dimensions are only limited by the underlying numerical representation. Hashing is often used to yield a dense fingerprint representation of constant length. We evaluated our models using the sparse and hashed versions with a default size of 2048 bits.</p>
                <table-wrap id="T1" orientation="portrait" position="anchor">
                    <label>Table 1. </label>
                    <caption>
                        <title>Fingerprints available in RDKit.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Fingerprint</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Dimension</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Description</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">
                                    <italic toggle="yes">&#x03bc;</italic>(
                                    <italic toggle="yes">FC</italic>)</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">
                                    <italic toggle="yes">&#x03c3;</italic>(
                                    <italic toggle="yes">FC</italic>)</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Atom pairs</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">sparse</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">typed atom pairs</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">199.8</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">155.9</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Atom pairs &#x2013; hashed</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2048</td>
                                <td align="left" colspan="1" rowspan="1" valign="top"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">186.3</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">126.4</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Avalon</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">512</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">path-based</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">206.3</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">78.9</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">MACCS keys</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">166</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">substructures</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">52.1</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">13.5</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 1</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">sparse</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">atom environments</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">30.5</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">8.4</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 1 &#x2013; hashed</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2048</td>
                                <td align="left" colspan="1" rowspan="1" valign="top"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">30.1</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">8.2</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 2</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">sparse</td>
                                <td align="left" colspan="1" rowspan="1" valign="top"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">51.0</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">15.3</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 2 &#x2013; hashed</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2048</td>
                                <td align="left" colspan="1" rowspan="1" valign="top"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">50.3</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">14.9</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Topological torsions</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">sparse</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">4-atom-paths</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">34.7</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">13.8</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Topological torsions &#x2013; hashed</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2048</td>
                                <td align="left" colspan="1" rowspan="1" valign="top"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">34.2</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">13.4</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">RDKit</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2048</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">path-based</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">877.5</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">324.0</td>
                            </tr>
                        </tbody>
                    </table>
                    <table-wrap-foot>
                        <fn>
                            <p>
                                <italic toggle="yes">&#x03bc;</italic>(
                                <italic toggle="yes">FC</italic>) and 
                                <italic toggle="yes">&#x03c3;</italic>(
                                <italic toggle="yes">FC</italic>) are the average number and standard deviation of the number of features per fingerprint for ChEMBL compounds, respectively.</p>
                        </fn>
                    </table-wrap-foot>
                </table-wrap>
                <p>For the following mathematical description of the models, we will use lowercase bold letters to indicate bit vector representations and uppercase italic symbols to denote the corresponding feature set representations:</p>
                <p>
                    <disp-formula id="e1">
                        <mml:math display="block" id="M1">
                            <mml:mtable>
                                <mml:mtr>
                                    <mml:mtd>
                                        <mml:mi mathvariant="bold">a</mml:mi>
                                        <mml:mo>=</mml:mo>
                                        <mml:mo>(</mml:mo>
                                        <mml:mrow>
                                            <mml:msub>
                                                <mml:mi>a</mml:mi>
                                                <mml:mn>1</mml:mn>
                                            </mml:msub>
                                            <mml:mo>,</mml:mo>
                                            <mml:msub>
                                                <mml:mi>a</mml:mi>
                                                <mml:mn>2</mml:mn>
                                            </mml:msub>
                                            <mml:mo>,</mml:mo>
                                            <mml:mo>&#x2026;</mml:mo>
                                            <mml:mo>,</mml:mo>
                                            <mml:msub>
                                                <mml:mi>a</mml:mi>
                                                <mml:mi>d</mml:mi>
                                            </mml:msub>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                        <mml:mspace width="0.2em"/>
                                        <mml:mtext>where</mml:mtext>
                                        <mml:mspace width="0.2em"/>
                                        <mml:msub>
                                            <mml:mi>a</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2208;</mml:mo>
                                        <mml:mrow>
                                            <mml:mo>{</mml:mo>
                                            <mml:mrow>
                                                <mml:mn>0</mml:mn>
                                                <mml:mo>,</mml:mo>
                                                <mml:mn>1</mml:mn>
                                            </mml:mrow>
                                            <mml:mo>}</mml:mo>
                                        </mml:mrow>
                                        <mml:mo>,</mml:mo>
                                        <mml:mn>1</mml:mn>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2264;</mml:mo>
                                        <mml:mi>i</mml:mi>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2264;</mml:mo>
                                        <mml:mi>d</mml:mi>
                                    </mml:mtd>
                                </mml:mtr>
                                <mml:mtr>
                                    <mml:mtd>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mi>A</mml:mi>
                                        <mml:mo>=</mml:mo>
                                        <mml:mrow>
                                            <mml:mo>{</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>i</mml:mi>
                                                <mml:mo>|</mml:mo>
                                                <mml:msub>
                                                    <mml:mi>a</mml:mi>
                                                    <mml:mi>i</mml:mi>
                                                </mml:msub>
                                                <mml:mo>=</mml:mo>
                                                <mml:mn>1</mml:mn>
                                                <mml:mo>,</mml:mo>
                                                <mml:mn>1</mml:mn>
                                                <mml:mspace width="0.1em"/>
                                                <mml:mo>&#x2264;</mml:mo>
                                                <mml:mi>i</mml:mi>
                                                <mml:mspace width="0.1em"/>
                                                <mml:mo>&#x2264;</mml:mo>
                                                <mml:mi>d</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>}</mml:mo>
                                        </mml:mrow>
                                    </mml:mtd>
                                </mml:mtr>
                            </mml:mtable>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>1</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Here, 
                    <italic toggle="yes">d</italic> &#x2208; &#x2115; is the dimension of the fingerprint.</p>
            </sec>
            <sec>
                <title>Fingerprint similarity</title>
                <p>Similarity of fingerprints is most often assessed on the basis of the set of features common to two fingerprints. The Tanimoto coefficient
                    <sup>
                        <xref ref-type="bibr" rid="ref-10">10</xref>,
                        <xref ref-type="bibr" rid="ref-11">11</xref>
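                    </sup> is defined as the ratio of the number of features common to two fingerprints <italic toggle="yes">A</italic> and <italic toggle="yes">B</italic> to the total number of features present in either <italic toggle="yes">A</italic> or <italic toggle="yes">B</italic>:</p>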
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M2">
                            <mml:mrow>
                                <mml:mi>T</mml:mi>
                                <mml:mi>c</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>A</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mi>B</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mspace width="0.2em"/>
                                <mml:mfrac>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>A</mml:mi>
                                                <mml:mo>&#x2229;</mml:mo>
                                                <mml:mi>B</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>A</mml:mi>
                                                <mml:mo>&#x222a;</mml:mo>
                                                <mml:mi>B</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                </mml:mfrac>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.2em"/>
                                <mml:mfrac>
                                    <mml:mrow>
                                        <mml:mi>I</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>A</mml:mi>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>B</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mi>U</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>A</mml:mi>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>B</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                </mml:mfrac>
                            </mml:mrow>
                            <mml:mspace width="19em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>2</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>where 
                    <italic toggle="yes">I</italic>(
                    <italic toggle="yes">A, B</italic>) = |
                    <italic toggle="yes">A</italic> &#x2229; 
                    <italic toggle="yes">B</italic>| and 
                    <italic toggle="yes">U</italic>(
                    <italic toggle="yes">A, B</italic>) = |
                    <italic toggle="yes">A</italic> &#x222a; 
                    <italic toggle="yes">B</italic>| are the cardinalities of the intersection and union of 
                    <italic toggle="yes">A</italic> and 
                    <italic toggle="yes">B</italic>, respectively.</p>
            </sec>
            <sec>
                <title>Modeling similarity value distributions</title>
                <p>The distribution of Tc values depends on the fingerprints of a reference compound data set. The resulting 
                    <italic toggle="yes">p</italic>-values must be interpreted with respect to the reference data set.</p>
                <p>As indicated in 
                    <xref ref-type="other" rid="e1">Equation 1</xref>, fingerprints can be represented as sets of features and similarity metrics like the Tc depend on the cardinalities of the intersection and union of sets. Each of the 
                    <italic toggle="yes">d</italic> features 
                    <italic toggle="yes">X
                        <sub>i</sub>
                    </italic> of a fingerprint can be modeled as a Bernoulli variable that occurs with a certain probability 
                    <italic toggle="yes">p
                        <sub>i</sub>
                    </italic>. Given a reference data set of 
                    <italic toggle="yes">N</italic> compounds and their fingerprints 
                    <italic toggle="yes">A</italic> = {
                    <bold>a
                        <sub>
                            <italic toggle="yes">k</italic>
                        </sub>
                    </bold>|1 &#x2264; 
                    <italic toggle="yes">k</italic> &#x2264; 
                    <italic toggle="yes">N</italic>} where 
                    <bold>a</bold>
                    <sub>
                        <italic toggle="yes">k</italic>
                    </sub> = (
                    <italic toggle="yes">a</italic>
                    <sub>
                        <italic toggle="yes">k</italic>1</sub>
                    <italic toggle="yes">, a</italic>
                    <sub>
                        <italic toggle="yes">k</italic>2</sub>
                    <italic toggle="yes">,</italic> &#x2026;, 
                    <italic toggle="yes">a
                        <sub>kd</sub>
                    </italic>), the probabilities can be estimated from the relative frequencies:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M3">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>p</mml:mi>
                                    <mml:mi>i</mml:mi>
                                </mml:msub>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>X</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mfrac>
                                    <mml:mn>1</mml:mn>
                                    <mml:mi>N</mml:mi>
                                </mml:mfrac>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>k</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>N</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>a</mml:mi>
                                            <mml:mrow>
                                                <mml:mi>k</mml:mi>
                                                <mml:mi>i</mml:mi>
                                            </mml:mrow>
                                        </mml:msub>
                                        <mml:mspace width="0.2em"/>
                                        <mml:mo>,</mml:mo>
                                        <mml:mn>1</mml:mn>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2264;</mml:mo>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mi>i</mml:mi>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2264;</mml:mo>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mi>d</mml:mi>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="16em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>3</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>The cardinality of a fingerprint itself, of the intersection, and of the union can then be modeled as a sum of non-identically distributed Bernoulli variables. In the case of independent variables, the sum follows a Poisson binomial distribution with mean</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M4">
                            <mml:mrow>
                                <mml:mi>&#x03bc;</mml:mi>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="28em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>4</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>and variance</p>
                <p>
                    <disp-formula id="e5">
                        <mml:math display="block" id="M5">
                            <mml:mrow>
                                <mml:msup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msup>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mn>1</mml:mn>
                                                <mml:mspace width="0.1em"/>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:mspace width="0.1em"/>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>i</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="24em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>5</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>and can be approximated by a normal distribution. Because the cardinalities of the intersection and union of two sets are not independent, the Tc is then modeled as the ratio of two correlated, normally distributed random variables, for which approximations exist
                    <sup>
                        <xref ref-type="bibr" rid="ref-23">23</xref>,
                        <xref ref-type="bibr" rid="ref-24">24</xref>
                    </sup>.</p>
                <p>Fingerprint features are often correlated. Ignoring these correlations leads to a significant underestimation of the variance (
                    <xref ref-type="other" rid="e5">Equation 5</xref>)
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>,
                        <xref ref-type="bibr" rid="ref-14">14</xref>
                    </sup>. While the equation for the mean 
                    <italic toggle="yes">&#x03bc;</italic> remains valid for correlated random variables, the formula for the variance 
                    <italic toggle="yes">&#x03c3;</italic>
                    <sup>2</sup> requires taking the pairwise covariances 
                    <italic toggle="yes">c
                        <sub>ij</sub>
                    </italic> = cov(
                    <italic toggle="yes">X
                        <sub>i</sub>,X
                        <sub>j</sub>
                    </italic>) between the different features into account. These can also be estimated from the reference set:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M6">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>c</mml:mi>
                                    <mml:mrow>
                                        <mml:mi>i</mml:mi>
                                        <mml:mi>j</mml:mi>
                                    </mml:mrow>
                                </mml:msub>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>X</mml:mi>
                                                    <mml:mi>i</mml:mi>
                                                </mml:msub>
                                                <mml:mspace width="0.1em"/>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:mspace width="0.1em"/>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>i</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>X</mml:mi>
                                                    <mml:mi>j</mml:mi>
                                                </mml:msub>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>j</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>X</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                        <mml:msub>
                                            <mml:mi>X</mml:mi>
                                            <mml:mi>j</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>&#x2212;</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:msub>
                                    <mml:mi>p</mml:mi>
                                    <mml:mi>i</mml:mi>
                                </mml:msub>
                                <mml:msub>
                                    <mml:mi>p</mml:mi>
                                    <mml:mi>j</mml:mi>
                                </mml:msub>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mfrac>
                                    <mml:mn>1</mml:mn>
                                    <mml:mi>N</mml:mi>
                                </mml:mfrac>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>k</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>N</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>a</mml:mi>
                                            <mml:mrow>
                                                <mml:mi>k</mml:mi>
                                                <mml:mi>i</mml:mi>
                                            </mml:mrow>
                                        </mml:msub>
                                        <mml:msub>
                                            <mml:mi>a</mml:mi>
                                            <mml:mrow>
                                                <mml:mi>k</mml:mi>
                                                <mml:mi>j</mml:mi>
                                            </mml:mrow>
                                        </mml:msub>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>j</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="1em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>6</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Accordingly, the value 
                    <italic toggle="yes">c
                        <sub>ii</sub>
                    </italic> = 
                    <italic toggle="yes">p
                        <sub>i</sub>
                    </italic> (1 &#x2212; 
                    <italic toggle="yes">p
                        <sub>i</sub>
                    </italic>) denotes the variance of 
                    <italic toggle="yes">X
                        <sub>i</sub>
                    </italic>.</p>
                <p>Based on these estimates, the average cardinality of a fingerprint itself, of the intersection, and of the union of two unknown fingerprints can be determined:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M7">
                            <mml:mrow>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mi>X</mml:mi>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="26em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>7</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M8">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>&#x03bc;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>I</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>X</mml:mi>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>Y</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msubsup>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                            <mml:mn>2</mml:mn>
                                        </mml:msubsup>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="22em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>8</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M9">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>&#x03bc;</mml:mi>
                                    <mml:mi>U</mml:mi>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>U</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>X</mml:mi>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>Y</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mi>X</mml:mi>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>+</mml:mo>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mi>Y</mml:mi>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mi>I</mml:mi>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:mi>X</mml:mi>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>Y</mml:mi>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mn>2</mml:mn>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>&#x2212;</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:msubsup>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                            <mml:mn>2</mml:mn>
                                        </mml:msubsup>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="4em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>9</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>For the respective variances, one obtains:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M10">
                            <mml:mrow>
                                <mml:mtext>Var</mml:mtext>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mi>X</mml:mi>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mo>=</mml:mo>
                                <mml:mspace width="0.1em"/>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mo>=</mml:mo>
                                            <mml:mspace width="0.1em"/>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mspace width="0.1em"/>
                                        <mml:mstyle displaystyle="true">
                                            <mml:msubsup>
                                                <mml:mo>&#x03a3;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>j</mml:mi>
                                                    <mml:mspace width="0.1em"/>
                                                    <mml:mo>=</mml:mo>
                                                    <mml:mspace width="0.1em"/>
                                                    <mml:mn>1</mml:mn>
                                                </mml:mrow>
                                                <mml:mi>d</mml:mi>
                                            </mml:msubsup>
                                            <mml:mrow>
                                                <mml:mspace width="0.1em"/>
                                                <mml:msub>
                                                    <mml:mi>c</mml:mi>
                                                    <mml:mrow>
                                                        <mml:mi>i</mml:mi>
                                                        <mml:mi>j</mml:mi>
                                                    </mml:mrow>
                                                </mml:msub>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="22em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>10</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M11">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msubsup>
                                <mml:mo>=</mml:mo>
                                <mml:mtext>Var</mml:mtext>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>I</mml:mi>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mi>X</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>Y</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>=</mml:mo>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mstyle displaystyle="true">
                                            <mml:msubsup>
                                                <mml:mo>&#x03a3;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>j</mml:mi>
                                                    <mml:mo>=</mml:mo>
                                                    <mml:mn>1</mml:mn>
                                                </mml:mrow>
                                                <mml:mi>d</mml:mi>
                                            </mml:msubsup>
                                            <mml:mrow>
                                                <mml:mo stretchy="false">(</mml:mo>
                                                <mml:msubsup>
                                                    <mml:mi>c</mml:mi>
                                                    <mml:mrow>
                                                        <mml:mi>i</mml:mi>
                                                        <mml:mi>j</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mn>2</mml:mn>
                                                </mml:msubsup>
                                                <mml:mo>+</mml:mo>
                                                <mml:mn>2</mml:mn>
                                                <mml:msub>
                                                    <mml:mi>c</mml:mi>
                                                    <mml:mrow>
                                                        <mml:mi>i</mml:mi>
                                                        <mml:mi>j</mml:mi>
                                                    </mml:mrow>
                                                </mml:msub>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>i</mml:mi>
                                                </mml:msub>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>j</mml:mi>
                                                </mml:msub>
                                                <mml:mo stretchy="false">)</mml:mo>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="14em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>11</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M12">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mi>U</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msubsup>
                                <mml:mo>=</mml:mo>
                                <mml:mtext>Var</mml:mtext>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>U</mml:mi>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mi>X</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>Y</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>=</mml:mo>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mstyle displaystyle="true">
                                            <mml:msubsup>
                                                <mml:mo>&#x03a3;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>j</mml:mi>
                                                    <mml:mo>=</mml:mo>
                                                    <mml:mn>1</mml:mn>
                                                </mml:mrow>
                                                <mml:mi>d</mml:mi>
                                            </mml:msubsup>
                                            <mml:mrow>
                                                <mml:mn>2</mml:mn>
                                                <mml:msub>
                                                    <mml:mi>c</mml:mi>
                                                    <mml:mrow>
                                                        <mml:mi>i</mml:mi>
                                                        <mml:mi>j</mml:mi>
                                                    </mml:mrow>
                                                </mml:msub>
                                                <mml:mo stretchy="false">(</mml:mo>
                                                <mml:mn>1</mml:mn>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:mn>2</mml:mn>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>j</mml:mi>
                                                </mml:msub>
                                                <mml:mo stretchy="false">)</mml:mo>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                </mml:mstyle>
                                <mml:mo>+</mml:mo>
                                <mml:msubsup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msubsup>
                            </mml:mrow>
                            <mml:mspace width="13em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>12</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>The covariance between the cardinalities of the intersection and the union is given by:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M13">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mrow>
                                        <mml:mtext>cov</mml:mtext>
                                        <mml:mo>&#x2061;</mml:mo>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mi>I</mml:mi>
                                        <mml:mi>U</mml:mi>
                                    </mml:mrow>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mtext>Cov</mml:mtext>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>I</mml:mi>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mi>X</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>Y</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>U</mml:mi>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mi>X</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>Y</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msubsup>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>=</mml:mo>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>d</mml:mi>
                                    </mml:msubsup>
                                    <mml:mrow>
                                        <mml:mstyle displaystyle="true">
                                            <mml:msubsup>
                                                <mml:mo>&#x03a3;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>j</mml:mi>
                                                    <mml:mo>=</mml:mo>
                                                    <mml:mn>1</mml:mn>
                                                </mml:mrow>
                                                <mml:mi>d</mml:mi>
                                            </mml:msubsup>
                                            <mml:mrow>
                                                <mml:mn>2</mml:mn>
                                                <mml:msub>
                                                    <mml:mi>c</mml:mi>
                                                    <mml:mrow>
                                                        <mml:mi>i</mml:mi>
                                                        <mml:mi>j</mml:mi>
                                                    </mml:mrow>
                                                </mml:msub>
                                                <mml:msub>
                                                    <mml:mi>p</mml:mi>
                                                    <mml:mi>j</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                </mml:mstyle>
                                <mml:mo>&#x2212;</mml:mo>
                                <mml:msubsup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msubsup>
                            </mml:mrow>
                            <mml:mspace width="12em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>13</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Normal distributions are fully defined by their mean and standard deviation and can thus be parameterized from the estimated means and variances. However, because the underlying features are not independent, the suitability of normal distributions as approximations cannot be guaranteed from a theoretical point of view. Nevertheless, as has been previously shown
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>,
                        <xref ref-type="bibr" rid="ref-14">14</xref>
                    </sup>, and as can be seen from our current evaluation (
                    <italic toggle="yes">vide infra</italic>), practical applications of the model yield good performance for a variety of different fingerprint designs. Under the assumption of normality, the following models are obtained:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M14">
                            <mml:mrow>
                                <mml:mi>I</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mi>X</mml:mi>
                                <mml:mo>,</mml:mo>
                                <mml:mi>Y</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>&#x2248;</mml:mo>
                                <mml:mi>N</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:msub>
                                    <mml:mi>&#x03bc;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                </mml:msub>
                                <mml:mo>,</mml:mo>
                                <mml:msubsup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msubsup>
                                <mml:mo stretchy="false">)</mml:mo>
                            </mml:mrow>
                            <mml:mspace width="20em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>14</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M15">
                            <mml:mrow>
                                <mml:mi>U</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mi>X</mml:mi>
                                <mml:mo>,</mml:mo>
                                <mml:mi>Y</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>&#x2248;</mml:mo>
                                <mml:mi>N</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:msub>
                                    <mml:mi>&#x03bc;</mml:mi>
                                    <mml:mi>U</mml:mi>
                                </mml:msub>
                                <mml:mo>,</mml:mo>
                                <mml:msubsup>
                                    <mml:mi>&#x03c3;</mml:mi>
                                    <mml:mi>U</mml:mi>
                                    <mml:mn>2</mml:mn>
                                </mml:msubsup>
                                <mml:mo stretchy="false">)</mml:mo>
                            </mml:mrow>
                            <mml:mspace width="20em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>15</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>where 
                    <italic toggle="yes">N</italic>(
                    <italic toggle="yes">&#x03bc;,&#x03c3;</italic>
                    <sup>2</sup>) is the normal distribution with mean 
                    <italic toggle="yes">&#x03bc;</italic> and standard deviation 
                    <italic toggle="yes">&#x03c3;</italic>. The Tc distribution is then modeled as the distribution of the ratio of these two correlated, normally distributed random variables. An analytical form of the probability distribution function exists
                    <sup>
                        <xref ref-type="bibr" rid="ref-23">23</xref>
                    </sup>; however, for determining 
                    <italic toggle="yes">p</italic>-values and the significance, the following approximation of the cumulative distribution function (CDF) is used
                    <sup>
                        <xref ref-type="bibr" rid="ref-24">24</xref>
                    </sup>:</p>
                <p>
                    <disp-formula id="e16">
                        <mml:math display="block" id="M16">
                            <mml:mrow>
                                <mml:mi>F</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mi>t</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>&#x2248;</mml:mo>
                                <mml:mo>&#x03a6;</mml:mo>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mfrac>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>&#x03bc;</mml:mi>
                                                    <mml:mi>U</mml:mi>
                                                </mml:msub>
                                                <mml:mi>t</mml:mi>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:msub>
                                                    <mml:mi>&#x03bc;</mml:mi>
                                                    <mml:mi>I</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>I</mml:mi>
                                                </mml:msub>
                                                <mml:msub>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>U</mml:mi>
                                                </mml:msub>
                                                <mml:mi>a</mml:mi>
                                                <mml:mo stretchy="false">(</mml:mo>
                                                <mml:mi>t</mml:mi>
                                                <mml:mo stretchy="false">)</mml:mo>
                                            </mml:mrow>
                                        </mml:mfrac>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mspace width="0.1em"/>
                                <mml:mtext>where</mml:mtext>
                                <mml:mspace width="0.2em"/>
                                <mml:mi>a</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mi>t</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>=</mml:mo>
                                <mml:msqrt>
                                    <mml:mrow>
                                        <mml:mfrac>
                                            <mml:mrow>
                                                <mml:msup>
                                                    <mml:mi>t</mml:mi>
                                                    <mml:mn>2</mml:mn>
                                                </mml:msup>
                                            </mml:mrow>
                                            <mml:mrow>
                                                <mml:msubsup>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>I</mml:mi>
                                                    <mml:mn>2</mml:mn>
                                                </mml:msubsup>
                                            </mml:mrow>
                                        </mml:mfrac>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mfrac>
                                            <mml:mrow>
                                                <mml:mn>2</mml:mn>
                                                <mml:mi>&#x03c1;</mml:mi>
                                                <mml:mi>t</mml:mi>
                                            </mml:mrow>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>I</mml:mi>
                                                </mml:msub>
                                                <mml:msub>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>U</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                        </mml:mfrac>
                                        <mml:mo>+</mml:mo>
                                        <mml:mfrac>
                                            <mml:mn>1</mml:mn>
                                            <mml:mrow>
                                                <mml:msubsup>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>U</mml:mi>
                                                    <mml:mn>2</mml:mn>
                                                </mml:msubsup>
                                            </mml:mrow>
                                        </mml:mfrac>
                                    </mml:mrow>
                                </mml:msqrt>
                            </mml:mrow>
                            <mml:mspace width="11em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>16</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Here, 
                    <italic toggle="yes">&#x03c1;</italic> = cov
                    <sub>
                        <italic toggle="yes">IU</italic>
                    </sub> / (
                    <italic toggle="yes">&#x03c3;
                        <sub>I</sub>&#x03c3;
                        <sub>U</sub>
                    </italic>) is the correlation between intersection and union and &#x03a6; is the CDF of the standard normal distribution:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M17">
                            <mml:mrow>
                                <mml:mo>&#x03a6;</mml:mo>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mi>u</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>=</mml:mo>
                                <mml:mfrac>
                                    <mml:mn>1</mml:mn>
                                    <mml:mrow>
                                        <mml:msqrt>
                                            <mml:mrow>
                                                <mml:mn>2</mml:mn>
                                                <mml:mi>&#x03c0;</mml:mi>
                                            </mml:mrow>
                                        </mml:msqrt>
                                    </mml:mrow>
                                </mml:mfrac>
                                <mml:mstyle displaystyle="true">
                                    <mml:mrow>
                                        <mml:msubsup>
                                            <mml:mo>&#x222b;</mml:mo>
                                            <mml:mrow>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:mi>&#x221e;</mml:mi>
                                            </mml:mrow>
                                            <mml:mi>u</mml:mi>
                                        </mml:msubsup>
                                        <mml:mrow>
                                            <mml:mi>exp</mml:mi>
                                            <mml:mo>&#x2061;</mml:mo>
                                            <mml:mrow>
                                                <mml:mo>(</mml:mo>
                                                <mml:mrow>
                                                    <mml:mo>&#x2212;</mml:mo>
                                                    <mml:mfrac>
                                                        <mml:mrow>
                                                            <mml:msup>
                                                                <mml:mi>x</mml:mi>
                                                                <mml:mn>2</mml:mn>
                                                            </mml:msup>
                                                        </mml:mrow>
                                                        <mml:mn>2</mml:mn>
                                                    </mml:mfrac>
                                                </mml:mrow>
                                                <mml:mo>)</mml:mo>
                                            </mml:mrow>
                                        </mml:mrow>
                                    </mml:mrow>
                                </mml:mstyle>
                                <mml:mtext>d</mml:mtext>
                                <mml:mi>x</mml:mi>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>17</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>The 
                    <italic toggle="yes">p</italic>-value can then be determined as:</p>
                <p>
                    <disp-formula>
                        <mml:math display="block" id="M18">
                            <mml:mrow>
                                <mml:mi>p</mml:mi>
                                <mml:mo>=</mml:mo>
                                <mml:mn>1</mml:mn>
                                <mml:mo>&#x2212;</mml:mo>
                                <mml:mi>F</mml:mi>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mi>t</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>=</mml:mo>
                                <mml:mtext>Pr</mml:mtext>
                                <mml:mspace width="0.2em"/>
                                <mml:mo>&#x2061;</mml:mo>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:mtext>Tc</mml:mtext>
                                <mml:mo>&gt;</mml:mo>
                                <mml:mi>t</mml:mi>
                                <mml:mo stretchy="false">)</mml:mo>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>18</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>For model evaluation, we use 
                    <italic toggle="yes">F</italic>(
                    <italic toggle="yes">t</italic>) = Pr (Tc &#x2264; 
                    <italic toggle="yes">t</italic>) directly as an indication of significance.</p>
            </sec>
            <sec>
                <title>Modeling conditional value distributions</title>
                <p>For similarity searching, reference compounds are used and Tc values of database compounds are calculated relative to the references. As has been shown
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>
                    </sup>, distributions of Tc values can vary greatly depending on the reference fingerprint. In this case, the significance of Tc values should be considered for a given reference compound. Mathematically, this corresponds to determining the conditional distributions when one fingerprint is given. As in the unconditional case, the distributions are based on sums of correlated Bernoulli variables that are modeled as normal distributions based on the conditional means and variances:</p>
                <p>
                    <disp-formula id="e19">
                        <mml:math display="block" id="math19">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mi>&#x03bc;</mml:mi>
                                    <mml:mi>I</mml:mi>
                                    <mml:mi>A</mml:mi>
                                </mml:msubsup>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mrow>
                                                <mml:mi>I</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>A</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>X</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mi>A</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msub>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>&#x2208;</mml:mo>
                                            <mml:mi>A</mml:mi>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>19</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula id="e20">
                        <mml:math display="block" id="math20">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mi>&#x03bc;</mml:mi>
                                    <mml:mi>U</mml:mi>
                                    <mml:mi>A</mml:mi>
                                </mml:msubsup>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mi>U</mml:mi>
                                        <mml:mrow>
                                            <mml:mrow>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>A</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>X</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mi>A</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mi>E</mml:mi>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>|</mml:mo>
                                            <mml:mi>A</mml:mi>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mo>+</mml:mo>
                                        <mml:mstyle displaystyle="true">
                                            <mml:msub>
                                                <mml:mo>&#x03a3;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>i</mml:mi>
                                                    <mml:mo>&#x2209;</mml:mo>
                                                    <mml:mi>A</mml:mi>
                                                </mml:mrow>
                                            </mml:msub>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>X</mml:mi>
                                                    <mml:mi>i</mml:mi>
                                                </mml:msub>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mrow>
                                    <mml:mo>|</mml:mo>
                                    <mml:mi>A</mml:mi>
                                    <mml:mo>|</mml:mo>
                                </mml:mrow>
                                <mml:mo>+</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msub>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>&#x2209;</mml:mo>
                                            <mml:mi>A</mml:mi>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>p</mml:mi>
                                            <mml:mi>i</mml:mi>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>20</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula id="e21">
                        <mml:math display="block" id="math21">
                            <mml:mrow>
                                <mml:msup>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:msubsup>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>I</mml:mi>
                                                    <mml:mi>A</mml:mi>
                                                </mml:msubsup>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mn>2</mml:mn>
                                </mml:msup>
                                <mml:mo>=</mml:mo>
                                <mml:mtext>Var</mml:mtext>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mrow>
                                                <mml:mi>I</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>A</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>X</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mi>A</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msub>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>,</mml:mo>
                                            <mml:mi>j</mml:mi>
                                            <mml:mo>&#x2208;</mml:mo>
                                            <mml:mi>A</mml:mi>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>c</mml:mi>
                                            <mml:mrow>
                                                <mml:mi>i</mml:mi>
                                                <mml:mi>j</mml:mi>
                                            </mml:mrow>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>21</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula id="e22">
                        <mml:math display="block" id="math22">
                            <mml:mrow>
                                <mml:msup>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mo>(</mml:mo>
                                            <mml:mrow>
                                                <mml:msubsup>
                                                    <mml:mi>&#x03c3;</mml:mi>
                                                    <mml:mi>U</mml:mi>
                                                    <mml:mi>A</mml:mi>
                                                </mml:msubsup>
                                            </mml:mrow>
                                            <mml:mo>)</mml:mo>
                                        </mml:mrow>
                                    </mml:mrow>
                                    <mml:mn>2</mml:mn>
                                </mml:msup>
                                <mml:mo>=</mml:mo>
                                <mml:mtext>Var</mml:mtext>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mrow>
                                                <mml:mi>U</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>A</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>X</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mi>A</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msub>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>,</mml:mo>
                                            <mml:mi>j</mml:mi>
                                            <mml:mo>&#x2209;</mml:mo>
                                            <mml:mi>A</mml:mi>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:msub>
                                            <mml:mi>c</mml:mi>
                                            <mml:mrow>
                                                <mml:mi>i</mml:mi>
                                                <mml:mi>j</mml:mi>
                                            </mml:mrow>
                                        </mml:msub>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>22</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>
                    <disp-formula id="e23">
                        <mml:math display="block" id="math23">
                            <mml:mrow>
                                <mml:msubsup>
                                    <mml:mrow>
                                        <mml:mi>cov</mml:mi>
                                        <mml:mo>&#x2061;</mml:mo>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mtext>IU</mml:mtext>
                                    </mml:mrow>
                                    <mml:mi>A</mml:mi>
                                </mml:msubsup>
                                <mml:mo>=</mml:mo>
                                <mml:mi>cov</mml:mi>
                                <mml:mo>&#x2061;</mml:mo>
                                <mml:mrow>
                                    <mml:mo>(</mml:mo>
                                    <mml:mrow>
                                        <mml:mrow>
                                            <mml:mrow>
                                                <mml:mi>I</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>A</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>X</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                                <mml:mo>,</mml:mo>
                                                <mml:mi>U</mml:mi>
                                                <mml:mrow>
                                                    <mml:mo>(</mml:mo>
                                                    <mml:mrow>
                                                        <mml:mi>A</mml:mi>
                                                        <mml:mo>,</mml:mo>
                                                        <mml:mi>X</mml:mi>
                                                    </mml:mrow>
                                                    <mml:mo>)</mml:mo>
                                                </mml:mrow>
                                            </mml:mrow>
                                            <mml:mo>|</mml:mo>
                                        </mml:mrow>
                                        <mml:mi>A</mml:mi>
                                    </mml:mrow>
                                    <mml:mo>)</mml:mo>
                                </mml:mrow>
                                <mml:mo>=</mml:mo>
                                <mml:mstyle displaystyle="true">
                                    <mml:msub>
                                        <mml:mo>&#x03a3;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>&#x2208;</mml:mo>
                                            <mml:mi>A</mml:mi>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:mstyle displaystyle="true">
                                            <mml:msub>
                                                <mml:mo>&#x03a3;</mml:mo>
                                                <mml:mrow>
                                                    <mml:mi>j</mml:mi>
                                                    <mml:mo>&#x2209;</mml:mo>
                                                    <mml:mi>A</mml:mi>
                                                </mml:mrow>
                                            </mml:msub>
                                            <mml:mrow>
                                                <mml:msub>
                                                    <mml:mi>c</mml:mi>
                                                    <mml:mrow>
                                                        <mml:mi>i</mml:mi>
                                                        <mml:mi>j</mml:mi>
                                                    </mml:mrow>
                                                </mml:msub>
                                            </mml:mrow>
                                        </mml:mstyle>
                                    </mml:mrow>
                                </mml:mstyle>
                            </mml:mrow>
                            <mml:mspace width="10em"/>
                            <mml:mo stretchy="false">(</mml:mo>
                            <mml:mn>23</mml:mn>
                            <mml:mo stretchy="false">)</mml:mo>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>The conditional model is obtained by applying these parameters in 
                    <xref ref-type="other" rid="e16">Equation 16</xref>.</p>
                <p>A derivation of the formulas presented here for the CCBM can be found in the original publications
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>,
                        <xref ref-type="bibr" rid="ref-14">14</xref>
                    </sup>.</p>
            </sec>
            <sec>
                <title>Sparse fingerprints</title>
                <p>Sparse fingerprints like ECFPs or the Morgan fingerprint might result in hundreds of thousands of different features present in large data sets. Most of these will occur with very small probabilities 
                    <italic toggle="yes">p
                        <sub>i</sub>
                    </italic> and only have a small influence on the estimated means and variances. It is computationally unproblematic to handle these individual probability estimates; however, determining pairwise covariances of all possible features becomes infeasible for more than a few thousand features. To address this issue, the complete covariance matrix is only determined for the most frequent features of a sparse fingerprint (by default, the 2048 most frequent features are selected). Covariances involving rare features are not estimated. Given that feature probabilities of combinatorial fingerprints usually show pseudo-exponential drop-offs for rare features, their contributions to the covariance estimates have negligible influence on the final estimates and are ignored in the current implementation.</p>
            </sec>
            <sec>
                <title>Data sets</title>
                <p>As a reference data set, ChEMBL compounds were selected. SMILES representations of 1,870,461 compounds were downloaded and standardized using a previously published protocol included in the ccbmlib package
                    <sup>
                        <xref ref-type="bibr" rid="ref-25">25</xref>
                    </sup>. Additionally, stereochemical information was removed since most fingerprints implemented in RDKit do not account for stereochemistry, resulting in 1,691,786 unique compounds. Fingerprint statistics are reported in 
                    <xref ref-type="table" rid="T1">Table 1</xref>.</p>
            </sec>
            <sec>
                <title>Implementation and operation</title>
                <p>The software has been implemented as a module for 
                    <ext-link ext-link-type="uri" xlink:href="https://www.python.org/">Python</ext-link> 3.7. It requires the installation of 
                    <ext-link ext-link-type="uri" xlink:href="https://www.rdkit.org/">RDKit</ext-link> and has been tested with version 2019.03.4 of RDKit. Any system (Linux, Windows, macOS) capable of running Python 3.7 and RDKit is sufficient for running our software. A 64-bit operating system with at least 8 GB RAM is recommended. After obtaining the code, it can be installed using Python&#x2019;s setup utility. The 
                    <monospace>ccbmlib</monospace> package contains three modules: 
                    <monospace>preprocessing</monospace>, 
                    <monospace>statistics</monospace>, and 
                    <monospace>models</monospace>.</p>
                <p>Module 
                    <monospace>preprocessing</monospace> consists of routines for standardizing molecules and preparing compound data sets. Standardization of molecules is a generally recommended preprocessing step, especially when compound data sets are assembled from different sources.</p>
                <p>Module 
                    <monospace>statistics</monospace> contains classes for feature statistics and distribution models. Its main classes are 
                    <monospace>PairwiseStats</monospace> and 
                    <monospace>CorrelatedNormalDistributions</monospace> for the fingerprint statistics and distribution models, respectively. Distribution models are obtained from 
                    <monospace>PairwiseStats</monospace> objects using the 
                    <monospace>get_tc_distribution</monospace> method, which is used to generate both unconditional and conditional models.</p>
                <p>The module 
                    <monospace>models</monospace> provides the main interface for the package. It offers wrapper functions for calculating RDKit fingerprints and contains the central method 
                    <monospace>get_feature_statistics</monospace> for generating or retrieving fingerprint statistics for a reference data set. Once calculated, statistics are saved and can be retrieved for later use. Exemplary applications of the module are provided in the readme file of the ccbmlib distribution.</p>
            </sec>
        </sec>
        <sec sec-type="results | discussion">
            <title>Results and discussion</title>
            <p>Fingerprint statistics were calculated on the basis of the 1,691,786 unique ChEMBL compounds and distribution models were derived. To evaluate the quality of the general model, 1,000,000 Tc values were calculated from pairs of random compounds drawn from the ChEMBL data set and empirical CDFs were determined. 
                <xref ref-type="fig" rid="f1">Figure 1</xref> compares the empirical CDFs to the modeled unconditional CDFs for the fingerprints in 
                <xref ref-type="table" rid="T1">Table 1</xref>. Overall, the modeled CDFs match the different value ranges and shapes of the empirical CDFs very well. However, to assess the usefulness of the model as a quantitative and comparative tool, the quality of the model should be assessed with a focus on Tc values indicating high significance. The insets of the figures show an enlarged section with Tc values having a significance of 0.9 or higher. The models for the atom pair fingerprints are not able to accurately model the distribution in this region. However, most other Tc distributions can be modeled very well. For the MACCS, Morgan, and topological torsion fingerprint distributions, high-quality models are obtained with small differences between the theoretical and empirical model. The hashed variants of the Morgan and topological torsion fingerprints have distributions highly similar to their sparse counterparts. This can be expected because the average feature counts reported in 
                <xref ref-type="table" rid="T1">Table 1</xref> are also very similar, indicating that most of the sparse features are hashed to unique values and only a few collisions occur between hashed values. The path-based Avalon and RDKit fingerprints still have usable, although less accurate, models. These observations are consistent with previous findings
                <sup>
                    <xref ref-type="bibr" rid="ref-13">13</xref>
                </sup>. CCBM models pharmacophore-based fingerprints only to a limited extent. This might be due to the specific nature of correlations between pharmacophore features.</p>
            <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                <label>Figure 1. </label>
                <caption>
                    <title>Empirical and modeled cumulative distribution functions.</title>
                    <p>The empirical and modeled cumulative distribution functions for the fingerprints reported in 
                        <xref ref-type="table" rid="T1">Table 1</xref> are shown in (
                        <bold>a</bold>) &#x2013; (
                        <bold>k</bold>). Blue lines indicate empirical distributions obtained from randomly sampling 1,000,000 pairs of compounds from ChEMBL. Red lines show the corresponding modeled distributions according to 
                        <xref ref-type="other" rid="e16">Equation (16)</xref>. The insets highlight the correspondence between the curves for Tc values of high significance.</p>
                </caption>
                <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/24591/e11ef1b0-c679-4f0f-954d-c813769a76e1_figure1.gif"/>
            </fig>
            <p>A quantitative summary of the observations is given in 
                <xref ref-type="table" rid="T2">Table 2</xref>. It reports the Kolmogorov-Smirnov statistic (KS)
                <sup>
                    <xref ref-type="bibr" rid="ref-26">26</xref>
                </sup>, which is defined as the maximum difference between empirical (
                <italic toggle="yes">F</italic>
                <sub>emp</sub>) and modeled (
                <italic toggle="yes">F</italic>
                <sub>model</sub>) distributions:</p>
            <p>
                <disp-formula id="e24">
                    <mml:math display="block" id="math24">
                        <mml:mrow>
                            <mml:mtext>KS</mml:mtext>
                            <mml:mrow>
                                <mml:mo>(</mml:mo>
                                <mml:mrow>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>emp</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mo>,</mml:mo>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>model</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                </mml:mrow>
                                <mml:mo>)</mml:mo>
                            </mml:mrow>
                            <mml:mo>=</mml:mo>
                            <mml:msub>
                                <mml:mrow>
                                    <mml:mi>max</mml:mi>
                                    <mml:mo>&#x2061;</mml:mo>
                                </mml:mrow>
                                <mml:mi>x</mml:mi>
                            </mml:msub>
                            <mml:mrow>
                                <mml:mo>|</mml:mo>
                                <mml:mrow>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>emp</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:mo>(</mml:mo>
                                        <mml:mi>x</mml:mi>
                                        <mml:mo>)</mml:mo>
                                    </mml:mrow>
                                    <mml:mo>&#x2212;</mml:mo>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>model</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:mo>(</mml:mo>
                                        <mml:mi>x</mml:mi>
                                        <mml:mo>)</mml:mo>
                                    </mml:mrow>
                                </mml:mrow>
                                <mml:mo>|</mml:mo>
                            </mml:mrow>
                        </mml:mrow>
                        <mml:mspace width="10em"/>
                        <mml:mo stretchy="false">(</mml:mo>
                        <mml:mn>24</mml:mn>
                        <mml:mo stretchy="false">)</mml:mo>
                    </mml:math>
                </disp-formula>
            </p>
            <table-wrap id="T2" orientation="portrait" position="anchor">
                <label>Table 2. </label>
                <caption>
                    <title>Kolmogorov-Smirnov statistics.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Fingerprint</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">KS</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">KS
                                <sub>90</sub>
                            </th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Atom pairs</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">5.47%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">4.22%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Atom pairs &#x2013; hashed</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">8.80%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">8.80%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Avalon</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">6.91%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1.04%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">MACCS</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">2.09%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.43%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 1</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">3.64%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.54%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 1 &#x2013; hashed </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">3.37%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.30%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 2</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">4.16%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1.26%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Morgan radius 2 &#x2013; hashed</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">3.80%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.83%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Topological torsions</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">9.31%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.47%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Topological torsions &#x2013; hashed</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">6.78%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.75%</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">RDKit</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">8.03%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1.70%</td>
                        </tr>
                    </tbody>
                </table>
                <table-wrap-foot>
                    <fn>
                        <p>KS reports the Kolmogorov-Smirnov statistic comparing the empirical to the modeled distributions. KS
                            <sub>90</sub> reports the Kolmogorov-Smirnov statistic limited to Tc values with an empirical significance of at least 90%.</p>
                    </fn>
                </table-wrap-foot>
            </table-wrap>
            <p>In addition, the maximum difference for the significance range beyond 90% is reported (KS
                <sub>90</sub>):</p>
            <p>
                <disp-formula id="e25">
                    <mml:math display="block" id="M25">
                        <mml:mrow>
                            <mml:msub>
                                <mml:mrow>
                                    <mml:mtext>KS</mml:mtext>
                                </mml:mrow>
                                <mml:mrow>
                                    <mml:mtext>90</mml:mtext>
                                </mml:mrow>
                            </mml:msub>
                            <mml:mrow>
                                <mml:mo>(</mml:mo>
                                <mml:mrow>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>emp</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mo>,</mml:mo>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>model</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                </mml:mrow>
                                <mml:mo>)</mml:mo>
                            </mml:mrow>
                            <mml:mo>=</mml:mo>
                            <mml:msub>
                                <mml:mrow>
                                    <mml:mi>max</mml:mi>
                                    <mml:mo>&#x2061;</mml:mo>
                                </mml:mrow>
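                                <mml:mrow>
                                    <mml:mi>x</mml:mi>
                                    <mml:mo>:</mml:mo>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>emp</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:mo>(</mml:mo>
                                        <mml:mi>x</mml:mi>
                                        <mml:mo>)</mml:mo>
                                    </mml:mrow>
                                    <mml:mo>&#x2265;</mml:mo>
                                    <mml:mn>0.9</mml:mn>
                                </mml:mrow>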
                            </mml:msub>
                            <mml:mrow>
                                <mml:mo>|</mml:mo>
                                <mml:mrow>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>emp</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:mo>(</mml:mo>
                                        <mml:mi>x</mml:mi>
                                        <mml:mo>)</mml:mo>
                                    </mml:mrow>
                                    <mml:mo>&#x2212;</mml:mo>
                                    <mml:msub>
                                        <mml:mi>F</mml:mi>
                                        <mml:mrow>
                                            <mml:mtext>model</mml:mtext>
                                        </mml:mrow>
                                    </mml:msub>
                                    <mml:mrow>
                                        <mml:mo>(</mml:mo>
                                        <mml:mi>x</mml:mi>
                                        <mml:mo>)</mml:mo>
                                    </mml:mrow>
                                </mml:mrow>
                                <mml:mo>|</mml:mo>
                            </mml:mrow>
                        </mml:mrow>
                        <mml:mspace width="10em"/>
                        <mml:mo stretchy="false">(</mml:mo>
                        <mml:mn>25</mml:mn>
                        <mml:mo stretchy="false">)</mml:mo>
                    </mml:math>
                </disp-formula>
            </p>
            <p>The maximum difference for most models is observed for common Tc values, i.e., where the slope of the CDF is steepest. However, as can be seen from the KS
                <sub>90</sub> values, the high significance range can be accurately assessed within 1% for MACCS, most Morgan, the torsion, and the Avalon fingerprints. The RDKit fingerprint still performs reasonably well with a KS
                <sub>90</sub> of 1.70%, whereas values of 4.22% and 8.80% for the atom pair fingerprint and its hashed variant indicate poor performance of the model in this region.</p>
            <p>In addition to the unconditional model, conditional distributions were investigated when a reference fingerprint was given. As each reference fingerprint will yield a different model, 100 compounds were randomly chosen as a reference and conditional models were derived and compared to empirical Tc distributions obtained by comparing the reference compound to 100,000 randomly chosen compounds. The ranges of correspondences between empirical and modeled significance values are shown in 
                <xref ref-type="fig" rid="f2">Figure 2</xref>. The MACCS and Morgan fingerprints again showed the best conditional models, all of which were close to the ideal diagonal. For most reference compounds, the topological torsion fingerprint also yielded very good models; however, a few outliers with large deviations were observed. This might be expected when reference fingerprints only contain very few features and approximations by normal distributions fail to yield accurate models.</p>
            <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                <label>Figure 2. </label>
                <caption>
                    <title>Empirical versus modeled significance values.</title>
                    <p>For the fingerprints in 
                        <xref ref-type="table" rid="T1">Table 1</xref>, each of the graphs (
                        <bold>a</bold>) &#x2013; (
                        <bold>k</bold>) shows the variation of correspondences between empirical and modeled significance values of 100 conditional distributions obtained by selecting random reference compounds. Empirical distributions for each reference compound were determined from comparisons to 100,000 randomly chosen compounds. The black line indicates the median correspondence between empirical and modeled distributions. The dark gray area shows the interquartile range and the light gray area the range from the 5
                        <sup>th</sup> to the 95
                        <sup>th</sup> percentile. The green line is the diagonal corresponding to a perfectly matching model. The insets highlight correspondences for significance values larger than 0.9.</p>
                </caption>
                <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/24591/e11ef1b0-c679-4f0f-954d-c813769a76e1_figure2.gif"/>
            </fig>
            <p>The Python code used for data generation, data analysis, and generation of the figures is available in the form of a Jupyter notebook in the GitHub repository
                <sup>
                    <xref ref-type="bibr" rid="ref-27">27</xref>
                </sup>.</p>
        </sec>
        <sec sec-type="conclusions">
            <title>Conclusions</title>
            <p>The tools provided make it possible to evaluate the significance of Tc values for a variety of fingerprints from RDKit. Users can generate distribution models for different fingerprints with respect to reference data sets. Accurate models are obtained for most RDKit fingerprints including the popular MACCS and Morgan fingerprints. Based on these models, it can be assessed to what extent molecular similarity is accounted for by fingerprints of different design and to what extent similarity between compounds sharing the same activity is reflected by similarity scores calculated on the basis of different fingerprint representations. Furthermore, the conditional models can be used to predict the suitability of fingerprints for similarity searching and ligand-based virtual screening.</p>
        </sec>
        <sec>
            <title>Data availability</title>
            <sec>
                <title>Source data</title>
                <p>The data sets used in this paper are freely available from ChEMBL: 
                    <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/chembl/">https://www.ebi.ac.uk/chembl/</ext-link>
                </p>
                <p>SMILES structure representations were retrieved on 15 Jan 2020 from: 
                    <ext-link ext-link-type="uri" xlink:href="ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_25_chemreps.txt.gz">ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_25_chemreps.txt.gz</ext-link>
                </p>
            </sec>
        </sec>
        <sec>
            <title>Software availability</title>
            <sec>
                <title>RDKit</title>
                <p>Our package depends on RDKit, which is freely available from 
                    <ext-link ext-link-type="uri" xlink:href="https://www.rdkit.org">https://www.rdkit.org</ext-link>
                </p>
            </sec>
            <sec>
                <title>ccbmlib</title>
                <p>Source code is available from: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/vogt-m/ccbmlib">https://github.com/vogt-m/ccbmlib</ext-link>
                </p>
                <p>Archived source code at time of publication: 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3634953">https://doi.org/10.5281/zenodo.3634953</ext-link>
                    <sup>
                        <xref ref-type="bibr" rid="ref-27">27</xref>
                    </sup>
                </p>
                <p>License: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/vogt-m/ccbmlib/blob/master/LICENSE.txt">MIT</ext-link>
                </p>
            </sec>
        </sec>
    </body>
    <back>
        <ref-list>
            <ref id="ref-1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Willett</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Barnard</surname>
                            <given-names>JM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Downs</surname>
                            <given-names>GM</given-names>
                        </name>
</person-group>:
                    <article-title>Chemical similarity searching.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Comput Sci.</italic>
</source>
                    <year>1998</year>;<volume>38</volume>(<issue>6</issue>):<fpage>983</fpage>&#x2013;<lpage>996</lpage>.
                    <pub-id pub-id-type="doi">10.1021/ci9800211</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Willett</surname>
                            <given-names>P</given-names>
                        </name>
</person-group>:
                    <article-title>Similarity methods in chemoinformatics.</article-title>
                    <source>

                        <italic toggle="yes">Ann Rev Inf Sci Technol.</italic>
</source>
                    <year>2009</year>;<volume>43</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>117</lpage>.
                    <pub-id pub-id-type="doi">10.1002/aris.2009.1440430108</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Maggiora</surname>
                            <given-names>GM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Shanmugasundaram</surname>
                            <given-names>V</given-names>
                        </name>
</person-group>:
                    <article-title>Molecular similarity measures.</article-title> In
                    <italic toggle="yes">Chemoinformatics and computational chemical biology</italic>. Humana Press, Totowa, NJ.
                    <source>

                        <italic toggle="yes">Methods Mol Biol.</italic>
</source>
                    <year>2011</year>;<volume>672</volume>:<fpage>39</fpage>&#x2013;<lpage>100</lpage>.
                    <pub-id pub-id-type="pmid">20838964</pub-id>
                    <pub-id pub-id-type="doi">10.1007/978-1-60761-839-3_2</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Maggiora</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Vogt</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Stumpfe</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Molecular similarity in medicinal chemistry: miniperspective.</article-title>
                    <source>

                        <italic toggle="yes">J Med Chem.</italic>
</source>
                    <year>2014</year>;<volume>57</volume>(<issue>8</issue>):<fpage>3186</fpage>&#x2013;<lpage>3204</lpage>.
                    <pub-id pub-id-type="pmid">24151987</pub-id>
                    <pub-id pub-id-type="doi">10.1021/jm401411z</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Eckert</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bajorath</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Molecular similarity analysis in virtual screening: foundations, limitations and novel approaches.</article-title>
                    <source>

                        <italic toggle="yes">Drug Discov Today.</italic>
</source>
                    <year>2007</year>;<volume>12</volume>(<issue>5&#x2013;6</issue>):<fpage>225</fpage>&#x2013;<lpage>233</lpage>.
                    <pub-id pub-id-type="pmid">17331887</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.drudis.2007.01.011</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Stumpfe</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bajorath</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Similarity searching.</article-title>
                    <source>

                        <italic toggle="yes">Wiley Interdiscip Rev Comput Mol Sci.</italic>
</source>
                    <year>2011</year>;<volume>1</volume>(<issue>2</issue>):<fpage>260</fpage>&#x2013;<lpage>282</lpage>.
                    <pub-id pub-id-type="doi">10.1002/wcms.23</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Willett</surname>
                            <given-names>P</given-names>
                        </name>
</person-group>:
                    <article-title>Combination of similarity rankings using data fusion.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Model.</italic>
</source>
                    <year>2013</year>;<volume>53</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>10</lpage>.
                    <pub-id pub-id-type="pmid">23297768</pub-id>
                    <pub-id pub-id-type="doi">10.1021/ci300547g</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Maggiora</surname>
                            <given-names>GM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bajorath</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Chemical space networks: a powerful new paradigm for the description of chemical space.</article-title>
                    <source>

                        <italic toggle="yes">J Comput Aided Mol Des.</italic>
</source>
                    <year>2014</year>;<volume>28</volume>(<issue>8</issue>):<fpage>795</fpage>&#x2013;<lpage>802</lpage>.
                    <pub-id pub-id-type="pmid">24925682</pub-id>
                    <pub-id pub-id-type="doi">10.1007/s10822-014-9760-0</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Guha</surname>
                            <given-names>R</given-names>
                        </name>
</person-group>:
                    <article-title>Exploring structure&#x2013;activity data using the landscape paradigm.</article-title>
                    <source>

                        <italic toggle="yes">Wiley Interdiscip Rev Comput Mol Sci.</italic>
</source>
                    <year>2012</year>;<volume>2</volume>(<issue>6</issue>):<fpage>829</fpage>&#x2013;<lpage>841</lpage>.
                    <pub-id pub-id-type="pmid">24163705</pub-id>
                    <pub-id pub-id-type="doi">10.1002/wcms.1087</pub-id>
                    <pub-id pub-id-type="pmcid">3807878</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Rogers</surname>
                            <given-names>DJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tanimoto</surname>
                            <given-names>TT</given-names>
                        </name>
</person-group>:
                    <article-title>A computer program for classifying plants.</article-title>
                    <source>

                        <italic toggle="yes">Science.</italic>
</source>
                    <year>1960</year>;<volume>132</volume>(<issue>3434</issue>):<fpage>1115</fpage>&#x2013;<lpage>1118</lpage>.
                    <pub-id pub-id-type="pmid">17790723</pub-id>
                    <pub-id pub-id-type="doi">10.1126/science.132.3434.1115</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Jaccard</surname>
                            <given-names>P</given-names>
                        </name>
</person-group>:
                    <article-title>The distribution of the flora in the alpine zone.</article-title>
                    <source>

                        <italic toggle="yes">New Phytol.</italic>
</source>
                    <year>1912</year>;<volume>11</volume>(<issue>2</issue>):<fpage>37</fpage>&#x2013;<lpage>50</lpage>.
                    <pub-id pub-id-type="doi">10.1111/j.1469-8137.1912.tb05611.x</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Baldi</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nasr</surname>
                            <given-names>R</given-names>
                        </name>
</person-group>:
                    <article-title>When is chemical similarity significant? The statistical distribution of chemical similarity scores and its extreme values.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Model.</italic>
</source>
                    <year>2010</year>;<volume>50</volume>(<issue>7</issue>):<fpage>1205</fpage>&#x2013;<lpage>1222</lpage>.
                    <pub-id pub-id-type="pmid">20540577</pub-id>
                    <pub-id pub-id-type="doi">10.1021/ci100010v</pub-id>
                    <pub-id pub-id-type="pmcid">2914517</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vogt</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bajorath</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Introduction of the conditional correlated Bernoulli model of similarity value distributions and its application to the prospective prediction of fingerprint search performance.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Model.</italic>
</source>
                    <year>2011</year>;<volume>51</volume>(<issue>10</issue>):<fpage>2496</fpage>&#x2013;<lpage>2506</lpage>.
                    <pub-id pub-id-type="pmid">21892818</pub-id>
                    <pub-id pub-id-type="doi">10.1021/ci2003472</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vogt</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bajorath</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Modeling Tanimoto Similarity Value Distributions and Predicting Search Results.</article-title>
                    <source>

                        <italic toggle="yes">Mol Inform.</italic>
</source>
                    <year>2017</year>;<volume>36</volume>(<issue>7</issue>):<fpage>1600131</fpage>.
                    <pub-id pub-id-type="pmid">28032955</pub-id>
                    <pub-id pub-id-type="doi">10.1002/minf.201600131</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <article-title>RDKit: open-source cheminformatics software</article-title>. (accessed Jan 27, 2020).
                    <ext-link ext-link-type="uri" xlink:href="https://www.rdkit.org/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gaulton</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hersey</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nowotka</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The ChEMBL database in 2017.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2017</year>;<volume>45</volume>(<issue>D1</issue>):<fpage>D945</fpage>&#x2013;<lpage>D954</lpage>.
                    <pub-id pub-id-type="pmid">27899562</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkw1074</pub-id>
                    <pub-id pub-id-type="pmcid">5210557</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Carhart</surname>
                            <given-names>RE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Smith</surname>
                            <given-names>DH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Venkataraghavan</surname>
                            <given-names>R</given-names>
                        </name>
</person-group>:
                    <article-title>Atom pairs as molecular features in structure-activity studies: definition and applications.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Comp Sci.</italic>
</source>
                    <year>1985</year>;<volume>25</volume>(<issue>2</issue>):<fpage>64</fpage>&#x2013;<lpage>73</lpage>.
                    <pub-id pub-id-type="doi">10.1021/ci00046a002</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gedeck</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rohde</surname>
                            <given-names>B</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bartels</surname>
                            <given-names>C</given-names>
                        </name>
</person-group>:
                    <article-title>QSAR&#x2014;how good is it in practice? Comparison of descriptor sets on an unbiased cross section of corporate data sets.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Model.</italic>
</source>
                    <year>2006</year>;<volume>46</volume>(<issue>5</issue>):<fpage>1924</fpage>&#x2013;<lpage>1936</lpage>.
                    <pub-id pub-id-type="pmid">16995723</pub-id>
                    <pub-id pub-id-type="doi">10.1021/ci050413p</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <article-title>MACCS Structural Keys.</article-title> Accelrys: San Diego, CA. <year>2011</year>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.dalkescientific.com/writings/diary/archive/2014/10/17/maccs_key_44.html">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Rogers</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hahn</surname>
                            <given-names>M</given-names>
                        </name>
</person-group>:
                    <article-title>Extended-connectivity fingerprints.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Model.</italic>
</source>
                    <year>2010</year>;<volume>50</volume>(<issue>5</issue>):<fpage>742</fpage>&#x2013;<lpage>754</lpage>.
                    <pub-id pub-id-type="pmid">20426451</pub-id>
                    <pub-id pub-id-type="doi">10.1021/ci100050t</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Nilakantan</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bauman</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Dixon</surname>
                            <given-names>JS</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Topological torsion: a new molecular descriptor for SAR applications. Comparison with other descriptors.</article-title>
                    <source>

                        <italic toggle="yes">J Chem Inf Comp Sci.</italic>
</source>
                    <year>1987</year>;<volume>27</volume>(<issue>2</issue>):<fpage>82</fpage>&#x2013;<lpage>85</lpage>.
                    <pub-id pub-id-type="doi">10.1021/ci00054a008</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-22">
                <label>22</label>
                <mixed-citation publication-type="journal">
                    <article-title>Daylight Theory manual</article-title>. Daylight Chemical Information Systems, Inc.: Laguna Niguel, CA. (accessed Jan 27, 2020).
                    <ext-link ext-link-type="uri" xlink:href="https://www.daylight.com/dayhtml/doc/theory/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-23">
                <label>23</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Marsaglia</surname>
                            <given-names>G</given-names>
                        </name>
</person-group>:
                    <article-title>Ratios of normal variables and ratios of sums of uniform variables.</article-title>
                    <source>

                        <italic toggle="yes">J Am Stat Assoc.</italic>
</source>
                    <year>1965</year>;<volume>60</volume>(<issue>309</issue>):<fpage>193</fpage>&#x2013;<lpage>204</lpage>.
                    <pub-id pub-id-type="doi">10.2307/2283145</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-24">
                <label>24</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hinkley</surname>
                            <given-names>DV</given-names>
                        </name>
</person-group>:
                    <article-title>On the ratio of two correlated normal random variables.</article-title>
                    <source>

                        <italic toggle="yes">Biometrika.</italic>
</source>
                    <year>1969</year>;<volume>56</volume>(<issue>3</issue>):<fpage>635</fpage>&#x2013;<lpage>639</lpage>.
                    <pub-id pub-id-type="doi">10.2307/2334671</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-25">
                <label>25</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>de la Vega de Le&#x00f3;n</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lounkine</surname>
                            <given-names>E</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Vogt</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Design of diverse and focused compound libraries</article-title>. In:
                    <italic toggle="yes">Tutorials in Chemoinformatics.</italic> John Wiley &amp; Sons Ltd, Chichester, UK. <year>2017</year>;<fpage>83</fpage>&#x2013;<lpage>101</lpage>.
                    <pub-id pub-id-type="doi">10.1002/9781119161110.ch5</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-26">
                <label>26</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Birnbaum</surname>
                            <given-names>ZW</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tingey</surname>
                            <given-names>FH</given-names>
                        </name>
</person-group>:
                    <article-title>One-Sided Confidence Contours for Probability Distribution Functions.</article-title>
                    <source>

                        <italic toggle="yes">Ann Math Stat.</italic>
</source>
                    <year>1951</year>;<volume>22</volume>(<issue>4</issue>):<fpage>592</fpage>&#x2013;<lpage>596</lpage>.
                    <ext-link ext-link-type="uri" xlink:href="https://www.jstor.org/stable/2236929?seq=1">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-27">
                <label>27</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vogt</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bajorath</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>ccbmlib &#x2013; a Python Package for Modeling Tanimoto Coefficient Distributions for Molecular Fingerprints</article-title> (Version v1.0).
                    <source>

                        <italic toggle="yes">Zenodo.</italic>
</source>
                    <year>2020</year>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.doi.org/10.5281/zenodo.3634953">http://www.doi.org/10.5281/zenodo.3634953</ext-link>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report59805">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.24591.r59805</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Cosgrove</surname>
                        <given-names>David A.</given-names>
                    </name>
                    <xref ref-type="aff" rid="r59805a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-1293-4172</uri>
                </contrib>
                <aff id="r59805a1">
                    <label>1</label>CozChemix Limited, Macclesfield, UK</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>28</day>
                <month>2</month>
                <year>2020</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2020 Cosgrove DA</copyright-statement>
                <copyright-year>2020</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport59805" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.22292.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The authors report a method for analysing the occurrence of features in a set of fingerprints that have been generated from a reference collection of chemical structures. They use this analysis to generate models for assessing the statistical significance of the tanimoto coefficients for pairs of fingerprints in the set. Using the model, they can produce a plot of significance vs tanimoto coefficient (a CDF). In the paper, the accuracy of the model is assessed by comparing the curve so produced with those created by calculating the tanimoto coefficients for pairs of fingerprints from a large random sample of the set. The correspondence between the modelled and empirical distribution functions is high.</p>
            <p> </p>
            <p> The paper is clearly laid out and relatively easy to read, if one takes the maths at face value. It is likely that it would be possible to reproduce their analysis from the information given. However, that is not strictly necessary from a practical standpoint as the authors have made the software they have developed for the analysis available as a Python module for anyone to download and use. They are to be commended for this action, which is still rare in the field of cheminformatics. It is likely to increase the impact of the paper considerably.</p>
            <p> </p>
            <p> When I read a paper of this nature, a key question I pose myself is &#x201c;how, if at all, will this help me with my work?&#x201d; Here I fear the authors have been less successful. For example, there is an implementation in the RDKit toolkit of the Taylor-Buttina clustering method. This is a popular way of clustering fingerprints, and hence molecules, that is widely used for things like analysis of high-throughput screening results, organising the results from a virtual screen etc. A key input parameter to the algorithm is a threshold tanimoto coefficient &#x2013; all fingerprints within a cluster are guaranteed to be within this similarity of the first fingerprint placed in the cluster. The success of this method for clustering depends very strongly on the value chosen for this threshold. Too high, and one obtains an unhelpfully large number of small clusters; too low, and the clusters will be large and contain molecules without apparent similarity. It would be very useful if there were a way of taking a successful threshold for one fingerprint type and using it to decide upon a similarly successful threshold for a different type. I feel as though this paper contains a way of doing this, but it is unclear to me quite how it would be achieved with the results presented. If the authors could add to the paper an example of how one would take a CDF for one fingerprint type and use it to translate a useful tanimoto coefficient threshold for it into an equally useful threshold for a different fingerprint type, that would, in my opinion, make the paper much more valuable.</p>
            <p>Are the conclusions about the tool and its performance adequately supported by the findings presented in the article?</p>
            <p>Yes</p>
            <p>Is the rationale for developing the new software tool clearly explained?</p>
            <p>Partly</p>
            <p>Is the description of the software tool technically sound?</p>
            <p>Yes</p>
            <p>Are sufficient details of the code, methods and analysis (if applicable) provided to allow replication of the software development and its use by others?</p>
            <p>Yes</p>
            <p>Is sufficient information provided to allow interpretation of the expected output datasets and any results generated using the tool?</p>
            <p>Partly</p>
            <p>Reviewer Expertise:</p>
            <p>Cheminformatics software development within the pharmaceutical industry.</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
        <sub-article article-type="response" id="comment5269-59805">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Bajorath</surname>
                            <given-names>J&#x00fc;rgen</given-names>
                        </name>
                        <aff>University of Bonn, Germany</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>29</day>
                    <month>2</month>
                    <year>2020</year>
                </pub-date>
            </front-stub>
            <body>
                <p>Thank you for your comments and your suggestion. Indeed, a potential application of the methodology is establishing correspondences between Tc values of different fingerprints according to their statistical significance. Therefore, a paragraph has been added to the manuscript explaining how modeled distributions can be used to identify corresponding Tanimoto coefficients (Tc values) for fingerprints of different design. In addition, a figure has been added displaying the relationship between MACCS Tc values and Tc values of other fingerprints. The software and Jupyter notebook have been updated accordingly.</p>
            </body>
        </sub-article>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report59806">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.24591.r59806</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Goldman</surname>
                        <given-names>Brian</given-names>
                    </name>
                    <xref ref-type="aff" rid="r59806a1">1</xref>
                    <role>Referee</role>
                </contrib>
                <aff id="r59806a1">
                    <label>1</label>Modeling &amp; Informatics, Vertex Pharmaceuticals, Boston, MA, USA</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>28</day>
                <month>2</month>
                <year>2020</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2020 Goldman B</copyright-statement>
                <copyright-year>2020</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport59806" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.22292.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The article &#x2018;ccbmlib: a Python package for modeling Tanimoto similarity value distributions&#x2019;, by Vogt and Bajorath, is clearly written and concretely describes a method for determining the significance of Tanimoto similarity scores. The statistical technique detailed in the paper outlines a mathematical method for converting Tanimoto similarity scores from various binary molecular fingerprints into significance (p) values. Consequently, the method provides a way of normalizing similarity scores so that comparisons between results of searches utilizing different fingerprinting methods can be conducted easily. The paper also outlines a &#x2018;conditional method&#x2019; that provides a technique for estimating the distributions of similarity scores for a given reference compound. This allows one to estimate how well a test compound would rank in a large-scale similarity search.</p>
            <p>The explanations and mathematical equations in the paper are easy to follow. The graphs in the results section clearly support the findings of the study. I would recommend this paper to be indexed in its current form.</p>
            <p>Are the conclusions about the tool and its performance adequately supported by the findings presented in the article?</p>
            <p>Yes</p>
            <p>Is the rationale for developing the new software tool clearly explained?</p>
            <p>Yes</p>
            <p>Is the description of the software tool technically sound?</p>
            <p>Yes</p>
            <p>Are sufficient details of the code, methods and analysis (if applicable) provided to allow replication of the software development and its use by others?</p>
            <p>Yes</p>
            <p>Is sufficient information provided to allow interpretation of the expected output datasets and any results generated using the tool?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Machine learning for computational chemistry, statistics.</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
        <sub-article article-type="response" id="comment5270-59806">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Bajorath</surname>
                            <given-names>J&#x00fc;rgen</given-names>
                        </name>
                        <aff>University of Bonn, Germany</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>29</day>
                    <month>2</month>
                    <year>2020</year>
                </pub-date>
            </front-stub>
            <body>
                <p>Thank you for your instructive comments on the manuscript.</p>
            </body>
        </sub-article>
    </sub-article>
</article>
