<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="other" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.25877.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Software Tool Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Umpire 2.0: Simulating realistic, mixed-type, clinical data for machine learning</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: 1 approved with reservations]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Coombes</surname>
                        <given-names>Caitlin E.</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Abrams</surname>
                        <given-names>Zachary B.</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Nakayiza</surname>
                        <given-names>Samantha</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Brock</surname>
                        <given-names>Guy</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Coombes</surname>
                        <given-names>Kevin R.</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-7630-2123</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>College of Medicine, The Ohio State University, Columbus, OH, 43210, USA</aff>
                <aff id="a2">
                    <label>2</label>Biomedical Informatics, The Ohio State University, Columbus, OH, 43210, USA</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:coombes.3@osu.edu">coombes.3@osu.edu</email>
                </corresp>
                <fn fn-type="con">
                    <p>CEC contributed in conceptualization, formal analysis, methodology, software, validation, visualization, and writing (both original draft preparation and review &amp; editing). ZBA contributed in methodology, software, and writing &#x2013; reviewing &amp; editing. SN contributed in methodology, software, and validation. GB contributed in supervision and writing &#x2013; reviewing &amp; editing. KRC contributed in conceptualization, formal analysis, methodology, software, supervision, validation, and writing &#x2013; reviewing and editing. All authors have approved the final version of the manuscript and agree to be accountable for all aspects of the work.</p>
                </fn>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>1</day>
                <month>10</month>
                <year>2020</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2020</year>
            </pub-date>
            <volume>9</volume>
            <elocation-id>1186</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>21</day>
                    <month>9</month>
                    <year>2020</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2020 Coombes CE et al.</copyright-statement>
                <copyright-year>2020</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/9-1186/pdf"/>
            <abstract>
                <p>The Umpire 2.0 R-package offers a streamlined, user-friendly workflow to simulate complex, heterogeneous, mixed-type data with known subgroup identities, dichotomous outcomes, and time-to-event data, while providing ample opportunities for fine-tuning and flexibility. Here, we describe how we have expanded the core Umpire 1.0 R-package, developed to simulate gene expression data, to generate clinically realistic, mixed-type data for use in evaluating unsupervised and supervised machine learning (ML) methods. As the availability of large-scale clinical data for ML has increased, clinical data has posed unique challenges, including widely variable size, individual biological heterogeneity, data collection and measurement noise, and mixed data types. Developing and validating ML methods for clinical data requires data sets with known ground truth, generated from simulation. Umpire 2.0 addresses challenges to simulating realistic clinical data by providing the user a series of modules to generate survival parameters and subgroups, apply meaningful additive noise, and discretize to single or mixed data types. Umpire 2.0 provides broad functionality across sample sizes, feature spaces, and data types, allowing the user to simulate correlated, heterogeneous, binary, continuous, categorical, or mixed type data from the scale of a small clinical trial to data on thousands of patients drawn from electronic health records. The user may generate elaborate simulations by varying parameters in order to compare algorithms or interrogate operating characteristics of an algorithm in both supervised and unsupervised ML.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>machine learning</kwd>
                <kwd>unsupervised machine learning</kwd>
                <kwd>supervised machine learning</kwd>
                <kwd>clustering</kwd>
                <kwd>clinical informatics</kwd>
                <kwd>mixed data</kwd>
                <kwd>mixed-type data</kwd>
                <kwd>clinical data</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1" xlink:href="http://dx.doi.org/10.13039/100006108">
                    <funding-source>National Center for Advancing Translational Sciences</funding-source>
                    <award-id>UL1TR002733</award-id>
                </award-group>
                <award-group id="fund-2" xlink:href="http://dx.doi.org/10.13039/100000054">
                    <funding-source>National Cancer Institute</funding-source>
                    <award-id>R03CA235101</award-id>
                </award-group>
                <funding-statement>This work was supported by the National Cancer Institute [P30CA016058] and the National Center for Advancing Translational Sciences [UL1TR002733].</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="intro">
            <title>Introduction</title>
            <p>As large clinical databases expand and data mining of the electronic medical record (EMR) improves, the scale and potential of data available for clinical knowledge discovery is increasing dramatically. Expanding size and complexity of data demands new analytics approaches and paves the way for applications of machine learning (ML) in novel clinical contexts
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>,
                    <xref ref-type="bibr" rid="ref-2">2</xref>
                </sup>. However, clinical data are characterized by heterogeneity, including measurement and data collection noise, individual biological variation, variable data set size, and mixed data types, which raises new challenges for ML analyses
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>
                </sup>. Clinical data sets vary widely in scale, from early-stage clinical trials with fewer than 100 patients to prospective cohorts following 10,000 patients to large-scale mining of electronic health records. They consist of data collected in the clinical setting, including demographic information, laboratory values, results of physical exams, disease and symptom histories, dates of visits or hospital length-of-stay, pharmacologic medications and dosing, and procedures performed, possibly with associated ICD-9 or ICD-10 codes. The most salient, identifying feature of clinical data is that they are of mixed type, containing continuous, categorical, and binary data. The result of this heterogeneity is an ML milieu characterized by methodological experimentation, without consensus on the best methods to apply to challenging clinical data
                <sup>
                    <xref ref-type="bibr" rid="ref-3">3</xref>
                </sup>.</p>
            <p>Developing and evaluating best practice methodologies for ML on clinical data demands a known validation standard for comparison. Previously, we described an approach using &#x201c;biological validation&#x201d;: testing an ML methodology in a disease with well-understood relationships between patient features and outcomes. Thus, we allow known biological truths uncovered (or absent) in a solution to validate a method
                <sup>
                    <xref ref-type="bibr" rid="ref-3">3</xref>
                </sup>. However, biological validation fails to capture interaction effects or allow the validation of emergent discoveries. A far superior solution would be to validate novel methods on data with known &#x201c;ground truth.&#x201d; Artificial clinical data, simulated with known assignments, can serve to rigorously test and validate ML algorithms.</p>
            <p>Simulating realistic clinical data poses challenges. The wide range in feature spaces and sample sizes demands simulation solutions that vary by orders of magnitude. Rather than simulating data of a single type, simulated clinical data must be of mixed-type and must reflect the variable mixtures of types found in clinical scenarios, where one type may predominate over others
                <sup>
                    <xref ref-type="bibr" rid="ref-4">4</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref-6">6</xref>
                </sup>. In addition, in order to conclusively test algorithms for use in clinical contexts, simulations of clinical data must replicate the noisiness of these data that results from variation of human and technological features in measurement and the biological variation between individuals.</p>
            <p>A real need exists for noisy, realistic, clinically meaningful simulated data to advance ML in clinical contexts. The user finds few tools currently available, and those pose problematic restrictions. For example, the KAMILA (
                <italic toggle="yes">k-</italic>means for mixed large data) R package can be used to generate complex mixed-type clusters with a high degree of user specificity, but can only be used to generate two clusters
                <sup>
                    <xref ref-type="bibr" rid="ref-7">7</xref>
                </sup>. Because many important problems face the analyst beyond distinguishing two groups in data, the need presents itself in the literature for more comprehensive, mixed-type simulation tools.</p>
            <p>Here, we present Umpire 2.0, a tool that facilitates generation of complex, noisy, simulated clinical and mixed-type data sets. Umpire 2.0 provides broad functionality across sample sizes, feature spaces, and data types to allow the user to simulate correlated, heterogeneous binary, continuous, categorical, or mixed type data from the scale of a small clinical trial to data on thousands of patients drawn from the EMR. These realistic clinical simulations are vital for testing and developing superior ML techniques for new clinical data challenges.</p>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <p>The original Umpire R package (1.0) could be used to simulate complex, correlated, continuous gene expression data with known subgroup identities and both dichotomous and survival outcomes, as previously described
                <sup>
                    <xref ref-type="bibr" rid="ref-8">8</xref>
                </sup>. Two core ideas underlie Umpire. First, biological data are correlated in blocks of variable size, simulating the functioning of genes, tissues, or symptoms in biological networks and pathways. Second, motivated by the multi-hit theory of cancer, subgroups (or clusters) of patients are defined by a number of informative, latent variables called &#x201c;hits&#x201d;. Each patient receives a combination of multiple &#x201c;hits,&#x201d; simulating population heterogeneity. These latent hits are used to link simulated alterations in patient data to outcome data in the form of dichotomous outcomes and time-to-event data.</p>
            <p>Umpire 2.0 expands the Umpire simulation engine for clinical and mixed-type data through a flexible pipeline. Users can vary the characteristics and the number of subgroups, features, hits, and correlated blocks. Using Umpire, they can control the level of patient-to-patient heterogeneity in various configurations of mixed-type data. Users can also generate multiple data sets (for example, training and test) of unlimited sizes from the same underlying distributional models.</p>
            <sec>
                <title>Data heterogeneity</title>
                <p>Umpire 2.0 enables users to incorporate individual and population heterogeneity in multiple ways. First, as above, latent hits are used to simulate features in correlated blocks, using multivariate normal distributions, with variation between individual members of a subgroup. Second, users can simulate clusters of equal or unequal size. Third, users can apply additive noise modeling measurement error and individual biological variation to simulations.</p>
                <p>Because we know that clusters of equal size are unrealistic (outside of pre-defined case-control studies), we enable users to simulate clusters of equal or unequal sizes. In the equal case, we set the population proportions equal and sample data using a multinomial distribution. In the unequal case, we first sample a vector, 
                    <italic toggle="yes">r</italic> &#x223c; 
                    <italic toggle="yes">Dirichlet</italic>(
                    <italic toggle="yes">&#x03b1;</italic>
                    <sub>1</sub>, &#x2026;, 
                    <italic toggle="yes">&#x03b1;
                        <sub>k</sub>
                    </italic>), setting the expected proportions of 
                    <italic toggle="yes">k</italic> clusters from the Dirichlet distribution. For small numbers of clusters (
                    <italic toggle="yes">k</italic> &#x2264; 8), we set all 
                    <italic toggle="yes">&#x03b1;</italic> = 10. For more clusters (
                    <italic toggle="yes">k</italic> &gt; 8), we set one quarter each of the 
                    <italic toggle="yes">&#x03b1;</italic> parameters to 1, 2, 4, and 8, respectively, accepting only a vector of cluster sizes 
                    <italic toggle="yes">r</italic> in which every cluster contains at least 1% of patients.</p>
                <p>The initial data simulated by Umpire represents the true, unadulterated biological signal. To these data, Umpire can add noise, mimicking biological variation and experimental random error. Marlin and colleagues
                    <sup>
                        <xref ref-type="bibr" rid="ref-9">9</xref>
                    </sup> argue that all clinical data &#x201c;must be treated as fundamentally uncertain&#x201d; due to human error in measurement and manual recording, variability in sampling frequencies, and variation within automatic monitoring equipment. Clinical experience teaches us that variability in clinical data arises from many sources, including human error, measurement error, and individual biological variation. However, because clinical measurements are integral to the provision of patient care, demanding high accuracy and reliability, we also assume that many clinical variables have low measurement error, such as tightly calibrated laboratory tests. For a given feature 
                    <italic toggle="yes">f</italic> measured on patient 
                    <italic toggle="yes">i</italic>, we model the clinically observed value 
                    <italic toggle="yes">Y</italic> from additive measurement noise 
                    <italic toggle="yes">E</italic> applied to the true biological signal 
                    <italic toggle="yes">S</italic> as</p>
                <p>
                    <disp-formula id="e1">
                        <mml:math display="block" id="math1">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>Y</mml:mi>
                                    <mml:mrow>
                                        <mml:mi>f</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>i</mml:mi>
                                    </mml:mrow>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:msub>
                                    <mml:mi>S</mml:mi>
                                    <mml:mrow>
                                        <mml:mi>f</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>i</mml:mi>
                                    </mml:mrow>
                                </mml:msub>
                                <mml:mo>+</mml:mo>
                                <mml:msub>
                                    <mml:mi>E</mml:mi>
                                    <mml:mrow>
                                        <mml:mi>f</mml:mi>
                                        <mml:mo>,</mml:mo>
                                        <mml:mi>i</mml:mi>
                                    </mml:mrow>
                                </mml:msub>
                                <mml:mo>.</mml:mo>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>We model the additive noise following the normal distribution 
                    <italic toggle="yes">E</italic> &#x223c; 
                    <italic toggle="yes">N</italic>(0, 
                    <italic toggle="yes">&#x03c4;</italic>) with mean 0 and standard deviation 
                    <italic toggle="yes">&#x03c4;</italic>, where 
                    <italic toggle="yes">&#x03c4;</italic> follows the gamma distribution 
                    <italic toggle="yes">&#x03c4;</italic> &#x223c; 
                    <italic toggle="yes">&#x0393;</italic> (
                    <italic toggle="yes">c</italic>, 
                    <italic toggle="yes">b</italic>) such that 
                    <italic toggle="yes">bc</italic> = 0.05. Thus, we create a distribution in which most features have very low noise while some are subject to very high noisiness.</p>
            </sec>
            <sec>
                <title>Mixed-type data</title>
                <p>Umpire 2.0 generates binary and categorical data by discretizing raw, continuous features along meaningful cutoffs. To convert a continuous feature into a binary vector, we select a cutoff and assign values on one side of this demarcation to &#x201c;zero&#x201d; and the others to &#x201c;one.&#x201d; We begin by calculating a &#x201c;bimodality index&#x201d; (BI) for the continuous vector
                    <sup>
                        <xref ref-type="bibr" rid="ref-10">10</xref>
                    </sup>. To compute the bimodality index, we model the data as a mixture of two normal distributions, and take:</p>
                <p>
                    <disp-formula id="e2">
                        <mml:math display="block" id="math2">
                            <mml:mrow>
                                <mml:mi mathvariant="italic">BI</mml:mi>
                                <mml:mo>=</mml:mo>
                                <mml:msup>
                                    <mml:mrow>
                                        <mml:mo stretchy="false">[</mml:mo>
                                        <mml:mi>&#x03c0;</mml:mi>
                                        <mml:mo stretchy="false">(</mml:mo>
                                        <mml:mn>1</mml:mn>
                                        <mml:mo>&#x2212;</mml:mo>
                                        <mml:mi>&#x03c0;</mml:mi>
                                        <mml:mo stretchy="false">)</mml:mo>
                                        <mml:mo stretchy="false">]</mml:mo>
                                    </mml:mrow>
                                    <mml:mrow>
                                        <mml:mn>1</mml:mn>
                                        <mml:mo>/</mml:mo>
                                        <mml:mn>2</mml:mn>
                                    </mml:mrow>
                                </mml:msup>
                                <mml:mi>&#x03b4;</mml:mi>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Here 
                    <italic toggle="yes">&#x03c0;</italic> is the fraction of members in one population and 
                    <italic toggle="yes">&#x03b4;</italic> = (<italic toggle="yes">&#x03bc;</italic><sub>1</sub> &#x2212; <italic toggle="yes">&#x03bc;</italic><sub>2</sub>)/<italic toggle="yes">&#x03c3;</italic> is the standardized distance between the two means. The recommended cutoff of 1.1 to define bimodality was determined by simulation
                    <sup>
                        <xref ref-type="bibr" rid="ref-10">10</xref>
                    </sup>. If the continuous data are bimodal, we split them midway between the means. For continuous features without a bimodal distribution, we partition them into binary features by randomly selecting an arbitrary cutoff between 5% and 35%. Although arbitrariness feels uncomfortable in an informatics sphere, we believe that this approach reflects a fundamental arbitrariness in many clinical definitions. For example, an adult female with a hemoglobin below 12.0 is said to be anemic, even though the clinical presentation and symptoms of a woman with a hemoglobin of 11.9 probably do not differ from those of a woman with a hemoglobin of 12.1. The choice of an arbitrary cutoff reflects these clinical decision-making processes: along a spectrum of phenotype, a value is chosen based on experience to define the edge of the syndrome. By choosing an arbitrary cutoff, we replicate this process. To reduce bias that could result if all low values were assigned &#x201c;0&#x201d; and all larger values were assigned &#x201c;1,&#x201d; we randomly choose whether values above or below the cutoff are assigned 0. We mark binary features in which 10% or fewer values fall into one category as asymmetric and mark the remainder as symmetric binary features.</p>
                <p>To simulate a categorical feature, we rank a continuous feature from low to high and bin its components into categories, which we label numerically (e.g., 1, 2, 3, 4, 5). Distributing an equal number of observations into each bin does not reflect the realities we see in clinical data, and dividing a continuous feature by values (e.g., dividing a feature of 500 observations between 1 and 100 into units of 1&#x2013;10, 11&#x2013;20, etc.) could lead to overly disparate distributions of observations into categories, especially at the tails. Here, for 
                    <italic toggle="yes">c</italic> categories, we draw a vector 
                    <italic toggle="yes">R</italic> of relative category sizes from the Dirichlet distribution,</p>
                <p>
                    <disp-formula id="e3">
                        <mml:math display="block" id="math3">
                            <mml:mrow>
                                <mml:msub>
                                    <mml:mi>R</mml:mi>
                                    <mml:mi>c</mml:mi>
                                </mml:msub>
                                <mml:mspace width="0.2em"/>
                                <mml:mo>&#x223c;</mml:mo>
                                <mml:mspace width="0.2em"/>
                                <mml:mtext>&#x2009;</mml:mtext>
                                <mml:mi mathvariant="italic">Dirichlet</mml:mi>
                                <mml:mtext>&#x2009;</mml:mtext>
                                <mml:mo stretchy="false">(</mml:mo>
                                <mml:msub>
                                    <mml:mi>&#x03b1;</mml:mi>
                                    <mml:mn>1</mml:mn>
                                </mml:msub>
                                <mml:mo>,</mml:mo>
                                <mml:mo>&#x2026;</mml:mo>
                                <mml:mo>,</mml:mo>
                                <mml:msub>
                                    <mml:mi>&#x03b1;</mml:mi>
                                    <mml:mi>c</mml:mi>
                                </mml:msub>
                                <mml:mo stretchy="false">)</mml:mo>
                                <mml:mo>,</mml:mo>
                                <mml:mspace width="2em"/>
                                <mml:msub>
                                    <mml:mi>&#x03b1;</mml:mi>
                                    <mml:mn>1</mml:mn>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mo>&#x2026;</mml:mo>
                                <mml:mo>=</mml:mo>
                                <mml:msub>
                                    <mml:mi>&#x03b1;</mml:mi>
                                    <mml:mi>c</mml:mi>
                                </mml:msub>
                                <mml:mo>=</mml:mo>
                                <mml:mn>20</mml:mn>
                            </mml:mrow>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>such that we create categories of unequal membership without overly sparse tails. To generate an ordinal categorical feature, we bin a continuous feature and number its bins sequentially by value of observations (e.g., 1, 2, 3, 4, 5). To generate a nominal categorical feature, we number these bins in random order (e.g., 4, 2, 5, 1, 3).</p>
                <p>The user may choose to simulate continuous, binary, nominal, or ordinal data, or any mixture thereof.</p>
            </sec>
            <sec>
                <title>Operation</title>
                <p>Umpire 2.0 has been implemented as a package for R 3.6.3 and R 4.0. It is freely available on R-Forge and 
                    <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/Umpire/index.html">CRAN</ext-link>. Any system (Linux, Windows, macOS) capable of running R 3.6.3 or R 4.0 is sufficient for implementing Umpire.</p>
            </sec>
            <sec>
                <title>Implementation</title>
                <p>Umpire 2.0 provides a 4-part workflow to generate simulations and save parameters for downstream reuse (
                    <xref ref-type="fig" rid="f1">Figure 1</xref>). The original Umpire 1.0 functionality and the Umpire 2.0 extension are arranged as a series of interchangeable modules (e.g., Engines, NoiseModels) within a parallel workflow. For a more thorough, guided introduction to the Umpire functions, please see the package vignettes. For clinical simulations, the user begins by generating a ClinicalEngine, consisting of a correlated block structure to generate population heterogeneity, a model of subgroup membership, and a survival model, which is used to generate a raw (continuous, not-noisy) data set. Next, clinically representative noise is applied. The user discretizes these data to mixed-type. Finally, Engine parameters, the ClinicalNoiseModel, and mixed data definitions are stored in a MixedTypeEngine to easily generate downstream simulations from the same parameter set.</p>
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>Figure 1. </label>
                    <caption>
                        <title>Workflow to simulate mixed-type, clinically realistic data with the Umpire R package.</title>
                        <p>The user begins by generating a ClinicalEngine to define correlated block structure, latent hits, subgroup prevalences, and a survival model. This is used to generate a raw, continuous data set. The user generates a clinically meaningful ClinicalNoiseModel, and applies it to the raw data. Next, the data are discretized to mixed type. Finally, the parameters of the ClinicalEngine, the ClinicalNoiseModel, and the discretized cutpoints are stored in a MixedTypeEngine to generate future simulations with the same parameters.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure1.gif"/>
                </fig>
            </sec>
        </sec>
        <sec>
            <title>Use cases</title>
            <p>In this section, we present several examples explaining how Umpire can be used to simulate data relevant to important clinical questions.</p>
            <sec>
                <title>Use case 1: Subtypes</title>
                <p>Unsupervised machine learning algorithms, designed to discover the subtypes inherent in a given data set, form one of the major branches in the field. In the clinical literature, these algorithms are being applied to data with variable feature sizes, including some studies with fewer than 10 features
                    <sup>
                        <xref ref-type="bibr" rid="ref-4">4</xref>,
                        <xref ref-type="bibr" rid="ref-11">11</xref>
                    </sup>. The number of subtypes (or clusters) identified in the literature also spans a fairly wide range
                    <sup>
                        <xref ref-type="bibr" rid="ref-4">4</xref>,
                        <xref ref-type="bibr" rid="ref-5">5</xref>,
                        <xref ref-type="bibr" rid="ref-12">12</xref>,
                        <xref ref-type="bibr" rid="ref-13">13</xref>
                    </sup>. At present, however, there is no consensus on which unsupervised ML algorithms are most effective, nor is it clear if different algorithms work better for different numbers of patients, clusters, features, or mixtures of data types.</p>
            </sec>
            <sec>
                <title>Clinical engine</title>
                <p>Since one idea at the core of Umpire is that cohorts of patients tend to be heterogeneous, it is perfectly positioned to perform simulations to evaluate unsupervised ML algorithms in the clinical context. As an illustration, we start by constructing a 
                    <monospace>ClinicalEngine</monospace> with four subtypes of patients.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(Umpire)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">set.seed</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">36475</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">numFeat &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">100</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">ce0 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalEngine</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">numFeat,</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># clinical variables.</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                          
                        <styled-content style="font-size:15px;color:#214A87;">nClusters =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">      # subtypes,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                          
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">FALSE</styled-content>) 
                        <styled-content style="font-size:15px;color:#8F5903;"> # about the same size.</styled-content>
                    </preformat>
                </p>
                <p>Internally, the 
                    <monospace>ClinicalEngine</monospace> simulates latent variables that affect both the expression of the clinical covariates and the outcomes in each of the four patient clusters. You can visualize which latent variables affect which clusters by extracting the &#x201c;hit pattern&#x201d; nested inside the 
                    <monospace>ClinicalEngine</monospace> (
                    <xref ref-type="fig" rid="f2">Figure 2</xref>).</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(Polychrome)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">alphabet.colors</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">26</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)[</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">c</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"red"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"navy"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"forest"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"amethyst"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                
                        <styled-content style="font-size:15px;color:#4F9905;">"turquoise"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"sea"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"wine"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)]</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">res &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">300</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"heatpattern.png"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>      
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">heatmap</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ce0</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cm</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">hitPattern,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">scale=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"none"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ColSideColors =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk[</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">:</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">],</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>          
                        <styled-content style="font-size:15px;color:#214A87;">col = c</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"gray"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"black"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>
                    </preformat>
                </p>
                <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                    <label>Figure 2. </label>
                    <caption>
                        <title>Association between latent variables and clusters.</title>
                        <p>Black pixels mark the presence of latent variables within a cluster.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure2.gif"/>
                </fig>
                <p>Note that this heatmap shows the true underlying structure relating the clusters to the latent variables, and not any simulated data sets. By design, however, the 
                    <monospace>ClinicalEngine</monospace> can only simulate &#x201c;perfect&#x201d; continuous data reflecting the true signal. In order to simulate realistic mixed-type data, we must first add noise to these data, and then discretize some of the features to create binary or nominal features.</p>
            </sec>
            <sec>
                <title>Mixed data types</title>
                <p>Data collected in the clinic include many different kinds of values. Demographic data include a small number of nominal values (e.g., ethnicity, marital status) with four to five categories. Physical exam values include binary indicators (such as presence or absence of enlarged liver). Most of the values assessed on patients are continuous, such as heart rate, blood pressure, and laboratory values. We assume that most of these values have low error. For example, clinical laboratory values are tightly calibrated. However, some measurements, such as physical exam values, were assessed by chart review from the medical record. Others, such as blood pressure, were measured by hand and typed into the record at the time of the visit. Thus, a few values may be very prone to measurement and human error. Here we apply a 
                    <monospace>ClinicalNoiseModel</monospace> to our features that reflects our beliefs about the noisiness of our data, with the standard deviation following a gamma distribution defined by 
                    <monospace>shape</monospace> and 
                    <monospace>scale</monospace> parameters. Then, we construct a 
                    <monospace>MixedTypeEngine</monospace>.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cnm &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalNoiseModel</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(numFeat,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">shape =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1.02</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">scale =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.05</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">mte &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">MixedTypeEngine</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ce0,</styled-content>               
                        <styled-content style="font-size:15px;color:#8F5903;"># a clinical engine</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                           
                        <styled-content style="font-size:15px;color:#000000;">cnm,</styled-content>              
                        <styled-content style="font-size:15px;color:#8F5903;"> # a noise model</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                           
                        <styled-content style="font-size:15px;color:#214A87;">cutpoints = list</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">N =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pCont =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.6</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pBin =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.2</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pCat =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.2</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pNominal =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">range = c</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rm</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(cnm)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mte)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">A &#x2019;MixedTypeEngine&#x2019; (MTE) based on:
A &#x2019;CancerEngine&#x2019; using the cancer model:
--------------
Clinical Simulation Model (Raw), a CancerModel object constructed via:
   CancerModel(name = "Clinical Simulation Model (Raw)", nPossible = NP,
    nPattern = nClusters, HIT = hitfn, SURV = SURV, OUT = OUT,
    survivalModel = survivalModel, prevalence = Prevalence(isWeighted,
    nClusters))
    
Pattern prevalences:
[1] 0.2206505 0.2998339 0.2343084 0.2452072

Survival effects:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
-0.43931 -0.12208  0.08784  0.08932  0.32549  0.51441

Outcome effects:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
-0.46565 -0.16604 -0.03749 -0.03927  0.13257  0.40766
--------------

Base expression given by:
An Engine with 33 components.

Altered expression given by:
An Engine with 33 components.

---------------
The MTE uses the following noise model:
A &#x2019;NoiseModel&#x2019; with:
     additive offset = 0
     additive scale distributed as:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
0.001204 0.015000 0.041432 0.051332 0.069693 0.249882
     multiplicative scale = 0
---------------
The MTE simulates clinical data of these types:

asymmetric binary        continuous           nominal  symmetric binary
                1                60                23                15</styled-content>
                    </preformat>
                </p>
                <p>Note that the 
                    <monospace>cm</monospace> slot of the clinical engine is retained as a slot in the mixed-type engine, so the heatmap shown above can be recreated with the command:</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># Not run</styled-content>

                        <styled-content style="font-size:15px;color:#A30000;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">heatmap</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mte</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cm</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">hitPattern,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">scale=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"none"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ColSideColors =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk[</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">:</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">],</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+ </styled-content>         
                        <styled-content style="font-size:15px;color:#214A87;">col = c</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"gray"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"black"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">))</styled-content>
                    </preformat>
                </p>
                <p>At this point, we still haven&#x2019;t simulated any actual data. For that purpose, we use the 
                    <monospace>rand</monospace> method.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">mtData &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mte,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">500</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">keepall =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">TRUE</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)</styled-content>
                    </preformat>
                </p>
                <p>We now take a look inside the simulated data:</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">names</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mtData)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">[1] "raw"      "clinical" "noisy"     "binned"</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">sapply</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mtData, dim)

     raw clinical noisy binned
[1,]  99      500    99    500
[2,] 500        4   500     99</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mtData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical)

CancerSubType  Outcome         LFU          Event
Min.   :1.00   Bad :234   Min.   : 0.00   Mode :logical
1st Qu.:2.00   Good:266   1st Qu.: 7.00   FALSE:136
Median :3.00              Median :16.00   TRUE :364
Mean   :2.56              Mean   :19.79
3rd Qu.:4.00              3rd Qu.:29.00
Max.   :4.00              Max.   :71.00</styled-content>
                    </preformat>
                </p>
                <p>There are four components:</p>
                <list list-type="bullet">
                    <list-item>
                        <label>1. </label>
                        <p>&#x201c;<monospace>clinical</monospace>&#x201d; contains the subtype, a binary outcome, and a time-to-event outcome represented by the last follow up time (<monospace>LFU</monospace>) and a logical indicator of whether the event occurred.</p>
                    </list-item>
                    <list-item>
                        <label>2. </label>
                        <p>&#x201c;<monospace>raw</monospace>&#x201d; contains the continuous data simulated by the clinical engine.</p>
                    </list-item>
                    <list-item>
                        <label>3. </label>
                        <p>&#x201c;<monospace>noisy</monospace>&#x201d; contains the same data, with noise added.</p>
                    </list-item>
                    <list-item>
                        <label>4. </label>
                        <p>&#x201c;<monospace>binned</monospace>&#x201d; contains the mixed type data, after discretization of some features.</p>
                    </list-item>
                </list>
                <p>Note that using 
                    <monospace>keepall = FALSE</monospace> will not preserve the raw or noisy components. Also, the raw and noisy components are arranged in the &#x201c;omics&#x201d; style, where rows are features and columns are patients. By contrast, the binned component is transposed into the usual clinical style, where rows are patients and columns are features.</p>
            </sec>
            <sec>
                <title>Visualization</title>
                <p>As an illustration, we visualize clusters for the noisy, continuous data compared to the discretized, mixed-type data. We use the 
                    <monospace>daisy</monospace> function from the 
                    <monospace>cluster</monospace> R package to compute distances between mixed-type data, and we use the 
                    <monospace>Rtsne</monospace> package for visualization (
                    <xref ref-type="fig" rid="f3">Figure 3</xref>).</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">myClustColor &lt;- dk[mtData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">CancerSubType]</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(cluster)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dtypes &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">getDaisyTypes</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mte)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dai &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">daisy</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mtData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">binned,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">type =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dtypes)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dai &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">as.dist</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(dai)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(Rtsne)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">tsN &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">Rtsne</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">t</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mtData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">noisy))</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># transpose from omics-style to clinical-style</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">tsM &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">Rtsne</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(dai,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">is_distance=</styled-content>
                        <styled-content style="font-size:15px;color:#8F5903;">TRUE</styled-content>)</preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"daisy.png"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">10</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">opar &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">mfrow=c</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>,
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(tsN</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">Y,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">pch=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">19</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">myClustColor,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Noisy, Continuous Data"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(tsM</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">Y,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">pch=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">19</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">myClustColor,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Mixed-Type Data"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(opar)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>
                    </preformat>
                </p>
                <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                    <label>Figure 3. </label>
                    <caption>
                        <title>t-SNE plots.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure3.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Large scale simulation</title>
                <p>Based on the literature results referenced above, we constructed simulation parameters to represent common problems in clinical data (
                    <xref ref-type="table" rid="T1">Table 1</xref>). These included a range of feature sizes (9&#x2013;243), patient sizes (200&#x2013;3200), and number of clusters (2&#x2013;16). In essence, this simulation was equivalent to evaluating a set of nested for-loops:</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">&gt; 
                        <styled-content style="font-size:15px;color:#8F5903;"># Not Run</styled-content>

                        <styled-content style="font-size:15px;color:#A30000;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">for</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">(F</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">in</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">featureSize) {</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>   
                        <styled-content style="font-size:15px;color:#000000;">cnm &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalNoiseModel</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(F,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">shape =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1.02</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">scale =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.05</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>   
                        <styled-content style="font-size:15px;color:#214A87;">for</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">(K</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">in</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">clusterSize) {</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#214A87;">for</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">(P</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">in</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">patientSize) {</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#000000;">mte &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">MixedTypeEngine</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">list</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">F</styled-content>,               
                        <styled-content style="font-size:15px;color:#8F5903;"> # num features</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                      
                        <styled-content style="font-size:15px;color:#214A87;">nClusters =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">K,</styled-content>                
                        <styled-content style="font-size:15px;color:#8F5903;"># num clusters</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                      
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">TRUE</styled-content>),

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                
                        <styled-content style="font-size:15px;color:#000000;">cnm,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                
                        <styled-content style="font-size:15px;color:#214A87;">cutpoints = list</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">N =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                                   
                        <styled-content style="font-size:15px;color:#214A87;">pCont =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.6</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                                   
                        <styled-content style="font-size:15px;color:#214A87;">pBin =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.2</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                                   
                        <styled-content style="font-size:15px;color:#214A87;">pCat =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.2</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                                   
                        <styled-content style="font-size:15px;color:#214A87;">pNominal =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.5</styled-content>))

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#000000;">simdata &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mte, P,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">keepall =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">TRUE</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)</styled-content>                   
                        <styled-content style="font-size:15px;color:#8F5903;"># num patients</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">save</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(simdata,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">file =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">SOMEFILENAME)</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># for evaluation later</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">}</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>   
                        <styled-content style="font-size:15px;color:#000000;">}</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">}</styled-content>
</preformat>
                </p>
                <table-wrap id="T1" orientation="portrait" position="anchor">
                    <label>Table 1. </label>
                    <caption>
                        <title>Parameters for Umpire simulations to assess clustering algorithms for clinical, mixed-type data.</title>
                        <p>Parameters were chosen to reflect data set sizes from a Phase II clinical trial to a large EHR data set or cohort.</p>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Patients</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">200, 800, 3200</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Features </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">9, 27, 81, 243 </td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Clusters </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2, 6, 16 </td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Data types and mixtures </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Single data type: continuous, binary, nominal, ordinal, mixed categorical
                                    <sup>
                                        <xref ref-type="other" rid="FN1">1</xref>
                                    </sup>
                                    <break/>Mixtures: balanced, continuous unbalanced, binary unbalanced, categorical unbalanced </td>
                            </tr>
                        </tbody>
                    </table>
                    <table-wrap-foot>
                        <fn id="FN1">
                            <p>
                                <sup>1</sup> A mixture of nominal and ordinal data</p>
                        </fn>
                    </table-wrap-foot>
                </table-wrap>
                <p>One could also vary other parameters, with the most likely candidates being the &#x201c;<monospace>cutpoints</monospace>&#x201d; parameters that control the fraction of continuous, binary, or categorical features in the data.</p>
                <p>The primary benefit of these simulations for assessing clustering algorithms is that Umpire generates data with known, gold-standard cluster assignments. Using simulation parameters in  
                    <xref ref-type="table" rid="T2">Table 2</xref>, we examined hierarchical clustering (HC) with Euclidean distance, a method commonly found in the literature
                    <sup>
                        <xref ref-type="bibr" rid="ref-5">5</xref>,
                        <xref ref-type="bibr" rid="ref-14">14</xref>&#x2013;
                        <xref ref-type="bibr" rid="ref-16">16</xref>
                    </sup>. We compared HC to partitioning around medoids (PAM) and self-organizing maps (SOM)
                    <sup>
                        <xref ref-type="bibr" rid="ref-17">17</xref>,
                        <xref ref-type="bibr" rid="ref-18">18</xref>
                    </sup>. We also compared Euclidean distance to the mixed-type distance measure, DAISY. We were able to assess accuracy and quality of each clustering solution against a known ground truth using the Adjusted Rand Index (ARI)
                    <sup>
                        <xref ref-type="bibr" rid="ref-19">19</xref>
                    </sup>. Summarized results are shown in 
                    <xref ref-type="fig" rid="f4">Figure 4</xref>.</p>
                <table-wrap id="T2" orientation="portrait" position="anchor">
                    <label>Table 2. </label>
                    <caption>
                        <title>Comparison of 6 clustering methods using Umpire simulations.</title>
                        <p>Three algorithms (hierarchical clustering with Ward&#x2019;s criterion, Partitioning Around Medoids, and Self-Organizing Maps) were implemented with a single-distance metric (Euclidean distance) and a mixed-type metric and tested on single- and mixed-type simulations generated with Umpire.</p>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Algorithm</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Single-Distance Metric</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Mixed-Type Metric</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Hierarchical clustering </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Euclidean </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">DAISY </td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Partitioning Around Medoids (PAM) </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Euclidean </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">DAISY </td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Self-organizing maps (SOM) </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Euclidean </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Supersom </td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                    <label>Figure 4. </label>
                    <caption>
                        <title>Adjusted Rand Index for six clustering methods applied to two distinct data mixtures.</title>
                        <p>DAISY outperformed all other methods on both balanced (top) and unbalanced continuous (bottom) data mixtures. Algorithm performance varied between data mixtures, with improved performance of hierarchical clustering (HC) and self-organizing maps (SOM) with Euclidean distance on unbalanced continuous data.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure4.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Use case 2: Simulating survival in phase II clinical trials</title>
                <p>Time to response or adverse event is a core clinical question in clinical trials of pharmaceutical and device interventions. Here, we use Umpire to simulate time-to-event data for clinical trials to inform study design or methods development.</p>
            </sec>
            <sec>
                <title>Survival model</title>
                <p>We begin by customizing a 
                    <monospace>SurvivalModel</monospace>, which in this case will simulate a trial with 5 years of patient accrual and 1 year of follow up. The user may customize length and units of follow up, as well as the base hazard rate. (Internally, Umpire uses this hazard rate in an exponential survival function.)</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(Umpire)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">set.seed</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">83552</styled-content>) 
                        <styled-content style="font-size:15px;color:#8F5903;"># for reproducibility</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">SurvivalModel</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">baseHazard =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">/</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                       
                        <styled-content style="font-size:15px;color:#214A87;">accrual =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                      
                        <styled-content style="font-size:15px;color:#214A87;"> followUp =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                       
                        <styled-content style="font-size:15px;color:#214A87;">units =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">12</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                       
                        <styled-content style="font-size:15px;color:#214A87;">unitName =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"months"</styled-content>)</preformat>
                </p>
                <p>Here, we illustrate the impact of altering the base hazard on the simulated mortality rate. We simulate three different survival models, using the default values for accrual and follow up (
                    <xref ref-type="fig" rid="f5">Figure 5</xref>).</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(survival)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">set.seed</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">12345</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm3 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">SurvivalModel</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">baseHazard =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">/</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">3</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dat3 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(sm3,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm5 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">SurvivalModel</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">baseHazard =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">/</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dat5 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(sm5,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm8 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">SurvivalModel</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">baseHazard =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">/</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">8</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dat8 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(sm8,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>)</preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"hazards.png"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">10</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">opar &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">pty=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"s"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">mfrow=c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>,
                        <styled-content style="font-size:15px;color:#0000B1;">3</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">data=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">dat8),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Base Hazard = 1/8"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">data=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">dat5),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Base Hazard = 1/5"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">data=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">dat3),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Base Hazard = 1/3"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(opar)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>
                    </preformat>
                </p>
                <fig fig-type="figure" id="f5" orientation="portrait" position="float">
                    <label>Figure 5. </label>
                    <caption>
                        <title>Effect of base hazard on survival.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure5.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Clinical engine</title>
                <p>The survival model is an argument to the constructor for a 
                    <monospace>ClinicalEngine</monospace>; if omitted, a default survival model is used. Here we explicitly use our survival model to construct a clinical engine with four balanced subtypes.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">set.seed</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">64321</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">ce &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalEngine</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">40</styled-content>,     
                        <styled-content style="font-size:15px;color:#8F5903;"># clinical variables.</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">nClusters =</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;"> 4</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>      
                        <styled-content style="font-size:15px;color:#8F5903;"># subtypes,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">FALSE</styled-content>, 
                        <styled-content style="font-size:15px;color:#8F5903;"># designed to be the same size.</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">survivalModel =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm)</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># outcomes.</styled-content>
                    </preformat>
                </p>
                <p>Because the latent variables affect the survival outcomes (by changing the hazard ratio) in the four patient clusters, you can generate different clinical engines from the same underlying parameters and obtain cohorts with different survival patterns (
                    <xref ref-type="fig" rid="f6">Figure 6</xref>). Note that we create different clinical engines to select different latent variables and create different populations. Using the 
                    <monospace>rand</monospace> function multiple times with the same engine would generate different samples from the same population, which would be useful for creating separate training and test data sets.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">set.seed</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">11111</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">ce1 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalEngine</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">40</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">nClusters =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">FALSE</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">survivalModel =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cdat1 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ce1,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">100</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content>

                        <styled-content style="font-size:15px;color:#A30000;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">ce2 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalEngine</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">40</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">nClusters =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">FALSE</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">survivalModel =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cdat2 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ce2,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">100</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content>

                        <styled-content style="font-size:15px;color:#A30000;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">ce3 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalEngine</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">40</styled-content>,
                        <styled-content style="font-size:15px;color:#214A87;"> nClusters =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">FALSE</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">survivalModel =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">sm)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cdat3 &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ce3,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">100</styled-content>)</preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"varce.png"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">10</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">opar &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">pty=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"s"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">mfrow=c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>,
                        <styled-content style="font-size:15px;color:#0000B1;">3</styled-content>))

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">CancerSubType,</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;"> data=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cdat1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Simulation 1"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">lwd = </styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event) </styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">CancerSubType,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">data=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cdat2</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Simulation 2"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">lwd=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">CancerSubType,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">data=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cdat3</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>       
                        <styled-content style="font-size:15px;color:#214A87;">main=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"Simulation 3"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">lwd =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(opar)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>
                    </preformat>
                </p>
                <fig fig-type="figure" id="f6" orientation="portrait" position="float">
                    <label>Figure 6. </label>
                    <caption>
                        <title>Cohort survival in different simulations.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure6.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Clinical trials</title>
                <p>It is important to realize that the subtypes generated as part of a clinical engine or a mixed-type engine are unlikely to represent the arms of an actual clinical trial. They are, after all, based on patterns of latent variables that, by definition, would be unobserved by the team running the clinical trial. One might want to view the simulations as a single-arm trial, where different unknown subgroups of patients respond to the therapy differently, and the goal is to use the covariates to identify a subset of patients who respond. In that case, the ability of Umpire to generate another data set from the same mixed-type engine could be used to provide independent validation of the method.</p>
                <p>A sensible approach might be to simulate a two-arm clinical trial where one arm receives a placebo (or the current standard-of-care), while the second arm receives a new (or additional) therapy. Again, one possible goal is to identify the subset of patients in the experimental arm with better response. We can achieve this in Umpire by adding a control group.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cec &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">addControl</styled-content>(ce)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(cec)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">A 'CancerEngine' using the cancer model:
--------------
Clinical Simulation Model (Raw) plus control, a CancerModel object constructed via:
   CancerModel(name = "Clinical Simulation Model (Raw)", nPossible = NP,
    nPattern = nClusters, HIT = hitfn, SURV = SURV, OUT = OUT,
    survivalModel = survivalModel, prevalence = Prevalence(isWeighted,
    nClusters))

Pattern prevalences:
[1] 0.5000000 0.1197993 0.1096268 0.1340680 0.1365059

Survival effects:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
-0.13793 -0.03038  0.03760  0.07660  0.18176  0.50587

Outcome effects:
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
-0.284222 -0.228845 -0.040243  0.001858  0.129494  0.616201
--------------

Base expression given by:
An Engine with 40 components.

Altered expression given by:
An Engine with 40 components.</styled-content>
                    </preformat>
                </p>
                <p>The control arm is now subtype 1, and the experimental arm is given by the collection of all other (heterogeneous) subtypes.</p>
                <p>We note here that one of the default parameters to the 
                    <monospace>CancerModel</monospace> constructor that is used inside a clinical engine defines the distribution of the beta parameters in a Cox proportional hazards model. By default, these are chosen from a normal distribution with mean 0 and standard deviation 0.3. As a consequence, each latent variable is just as likely to make the hazard ratio worse as to make it better. For purposes of illustration, we are going to cheat and adjust the beta parameters to bias them toward an improved outcome in the experimental group:</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">beta &lt;- cec</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cm</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">survivalBeta</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">if</styled-content> (
                        <styled-content style="font-size:15px;color:#214A87;">sum</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(beta)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">) cec</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cm</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">survivalBeta &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">-</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">beta</styled-content>
                    </preformat>
                </p>
                <p>Of course, the better way to accomplish this goal would have been to set that parameter when we constructed the ClinicalEngine originally, to something like 
                    <monospace>SURV = function(n) rnorm(n, 0.2, 0.3)</monospace>.</p>
                <p>Here is an example of a simulated trial.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cnm &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalNoiseModel</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">40</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">shape =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1.02</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">scale =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.05</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">mte &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">MixedTypeEngine</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(cec,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                           
                        <styled-content style="font-size:15px;color:#000000;">cnm,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                           
                        <styled-content style="font-size:15px;color:#214A87;">cutpoints = list</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">N =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pCont =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.6</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pBin =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.2</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pCat =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.2</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">pNominal =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                              
                        <styled-content style="font-size:15px;color:#214A87;">range =</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>,
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>)))

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">trialData &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mte,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># make a factor for trial arm</styled-content>

                        <styled-content style="font-size:15px;color:#A30000;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">isExp &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">CancerSubType</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">Arm &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">factor</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">c</styled-content>(
                        <styled-content style="font-size:15px;color:#4F9905;">"Control"</styled-content>, 
                        <styled-content style="font-size:15px;color:#4F9905;">"Experimental"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)[isExp])</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">CancerSubType  Outcome         LFU          Event                   Arm
Min.   :1.00   Bad :101   Min.   : 0.00   Mode :logical   Control     :104
1st Qu.:1.00   Good: 99   1st Qu.:14.00   FALSE:101       Experimental: 96
Median :1.00              Median :23.00   TRUE :99
Mean   :2.25              Mean   :27.53
3rd Qu.:4.00              3rd Qu.:40.25
Max.   :5.00              Max.   :70.00</styled-content>
                    </preformat>
                </p>
                <p>Now we compute the Cox proportional hazards model that we would see from the two-arm trial.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">fit &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">coxph</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">Arm,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">data =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(fit)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">Call:
coxph(formula = Surv(LFU, Event) ~ Arm, data = trialData$clinical)

  n= 200, number of events= 99
                   coef exp(coef) se(coef)      z Pr(&gt;|z|)
ArmExperimental -0.5974    0.5502   0.2075 -2.879  0.00399 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                exp(coef) exp(-coef) lower .95 upper .95
ArmExperimental    0.5502      1.817    0.3664    0.8264

Concordance= 0.57  (se = 0.027 )
Likelihood ratio test= 8.56  on 1 df,   p=0.003
Wald test            = 8.29  on 1 df,   p=0.004
Score (logrank) test = 8.53  on 1 df,   p=0.003</styled-content>
                    </preformat>
                </p>
                <p>Thanks in part to the bias we built into the simulation, the experimental group does significantly better than the control group. We can also fit the model that would be obtained if the trial designers were omniscient and could see the subtypes defined by the latent variables.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">latentfit &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">coxph</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">factor</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(CancerSubType),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                      
                        <styled-content style="font-size:15px;color:#214A87;">data =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">latentfit</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">Call:
coxph(formula = Surv(LFU, Event) ~ factor(CancerSubType), data = trialData$clinical)

                           coef exp(coef) se(coef)      z      p
factor(CancerSubType)2  0.08847   1.09250  0.30651  0.289 0.7729
factor(CancerSubType)3 -0.62841   0.53344  0.37710 -1.666 0.0956
factor(CancerSubType)4 -0.91522   0.40043  0.35797 -2.557 0.0106
factor(CancerSubType)5 -0.90929   0.40281  0.37737 -2.410 0.0160

Likelihood ratio test=15.31  on 4 df, p=0.004102
n= 200, number of events= 99</styled-content>
</preformat>
                </p>
                <p>Here we see that one of the four subtypes is equivalent to the control group, while all three other subtypes appear to do better. Finally, we plot the resulting Kaplan-Meier curves for both models (
                    <xref ref-type="fig" rid="f7">Figure 7</xref>).</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"fitplots.png"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">10</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">opar &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">mfrow =c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>,
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>))

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">Arm,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">data =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>      
                        <styled-content style="font-size:15px;color:#214A87;">col=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">dk[</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>, 
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)],</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">lwd=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">legend</styled-content>(
                        <styled-content style="font-size:15px;color:#4F9905;">"bottomleft"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">levels</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">Arm),</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;"> lwd=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk[</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>, 
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)])</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content>

                        <styled-content style="font-size:15px;color:#A30000;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">plot</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">survfit</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Surv</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(LFU, Event)</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">~</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">factor</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(CancerSubType),</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">data =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">trialData</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical),</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>      
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">dk,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">lwd=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">legend</styled-content>(
                        <styled-content style="font-size:15px;color:#4F9905;">"bottomleft"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">legend =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">:</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">lwd=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">2</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">col =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">par</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(opar)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>

</preformat>
                </p>
                <fig fig-type="figure" id="f7" orientation="portrait" position="float">
                    <label>Figure 7. </label>
                    <caption>
                        <title>Kaplan-Meier plots for a simulated two-arm trial (left) and for the hidden latent subtypes (right).</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure7.gif"/>
                </fig>
                <p>As in the first use case, you can run a set of nested loops to vary the parameters of interest. As noted previously, one possible application would be to test algorithms for finding clinical variables that define patient subgroups with better (or worse) responses than the control group.</p>
            </sec>
            <sec>
                <title>Use case 3: Epidemiological cohort studies, mixed data sources, and binary outcomes</title>
                <p>Large epidemiological cohorts are a foundational data type in public health research. Here, we simulate an extensive patient cohort and assess for a binary outcome.</p>
                <p>Epidemiological cohorts may aggregate data from multiple data collection instruments, possibly including chart review, laboratory data, and surveys. Here, we generate mixed-type data consisting of continuous laboratory data gathered at time of study entry and an extensive survey, which contains both nominal and ordinal (Likert scale) categorical responses. We simulate a 
                    <monospace>ClinicalEngine</monospace> with a large feature space and 6 latent clusters of unequal size, taking the default noise and survival models. We generate data for 4,000 patients.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">set.seed</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1000</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">ce &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalEngine</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">nFeatures =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">300</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">nClusters =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">isWeighted =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">TRUE</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">OUT = function</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(n)</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rnorm</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(n,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">-0.1</styled-content>, 
                        <styled-content style="font-size:15px;color:#0000B1;">0.3</styled-content>)) 
                        <styled-content style="font-size:15px;color:#8F5903;"># bias favorable</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cnm &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">ClinicalNoiseModel</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">300</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">mixed &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">MixedTypeEngine</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ce, cnm,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                             
                        <styled-content style="font-size:15px;color:#214A87;">list</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">N=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">200</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">pCont =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">.3</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">pCat =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">.7</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">pBin=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">0</styled-content>,

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                  
                        <styled-content style="font-size:15px;color:#214A87;">pNominal =</styled-content> <styled-content style="font-size:15px;color:#0000B1;">.5</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">range = c</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">3</styled-content>,
                        <styled-content style="font-size:15px;color:#0000B1;">7</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">)))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">table</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">getDataTypes</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mixed))</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">continuous    nominal    ordinal
        77         97        123</styled-content>


                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">simdata &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rand</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mixed,</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">4000</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">sapply</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(simdata, dim)</styled-content>


                        <styled-content style="font-size:15px;color:#000000;">     binned clinical
[1,]   4000     4000
[2,]    297        4</styled-content>


                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(simdata</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical)</styled-content>


                        <styled-content style="font-size:15px;color:#000000;">CancerSubType   Outcome          LFU          Event
Min.   :1.000   Bad :1233   Min.   : 0.00   Mode :logical
1st Qu.:2.000   Good:2767   1st Qu.:14.00   FALSE:2095
Median :4.000               Median :25.00   TRUE :1905
Mean   :3.762               Mean   :28.31
3rd Qu.:5.000               3rd Qu.:42.00
Max.   :6.000               Max.   :71.00</styled-content>
                    </preformat>
                </p>
                <p>Now we are going to fit univariate models to determine which features are associated with the binary outcome. For continuous variables, we perform a t-test comparing the values in the two outcome groups. For discrete variables (binary or categorical), we perform a chi-squared test.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">DT &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">getDataTypes</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(mixed)</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">results &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">data.frame</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Test=</styled-content>
                        <styled-content style="font-size:15px;color:#8F5903;">NA</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">Statistic =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">NA</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">PValue =</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;">NA</styled-content>)[
                        <styled-content style="font-size:15px;color:#CF5C00;">-</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,]</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">for</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">(J</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">in</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">:</styled-content>
                        <styled-content style="font-size:15px;color:#214A87;">ncol</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(simdata</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">binned)) {</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>   
                        <styled-content style="font-size:15px;color:#214A87;">if</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">(DT[J]</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">==</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"continuous"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">) {</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">test &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">t.test</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(simdata</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">binned[simdata</styled-content>
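                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content><styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content><styled-content style="font-size:15px;color:#000000;">Outcome</styled-content> 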
                        <styled-content style="font-size:15px;color:#CF5C00;">==</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"Bad"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">, J],</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                      
                        <styled-content style="font-size:15px;color:#000000;">simdata</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">binned[simdata</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">Outcome</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">==</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"Good"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">, J])</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">results &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rbind</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(results,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">data.frame</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Test =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"T"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                    
                        <styled-content style="font-size:15px;color:#214A87;">Statistic =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">test</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">statistic,</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                    
                        <styled-content style="font-size:15px;color:#214A87;">PValue =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">test</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">p.value))</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>   
                        <styled-content style="font-size:15px;color:#000000;">}</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">else</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">{</styled-content> 
                        <styled-content style="font-size:15px;color:#8F5903;"># discrete = binary or categorical</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">tab &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">table</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">feature =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">simdata</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">binned[, J],</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                   
                        <styled-content style="font-size:15px;color:#214A87;">outcome =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">simdata</styled-content>
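                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content><styled-content style="font-size:15px;color:#000000;">clinical</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content><styled-content style="font-size:15px;color:#000000;">Outcome)</styled-content>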

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">test &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">chisq.test</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(tab)</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">results &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rbind</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(results,</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                        
                        <styled-content style="font-size:15px;color:#214A87;">data.frame</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">Test =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"ChiSq"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">,</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                    
                        <styled-content style="font-size:15px;color:#214A87;">Statistic =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">test</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">statistic,</styled-content> 

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>                                    
                        <styled-content style="font-size:15px;color:#214A87;">PValue = </styled-content>
                        <styled-content style="font-size:15px;color:#000000;">test</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">p.value))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>     
                        <styled-content style="font-size:15px;color:#000000;">}</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>  
                        <styled-content style="font-size:15px;color:#000000;">}</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">summary</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(results)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">    Test             Statistic          PValue
Length:297         Min.   :-9.300   Min.   :0.00000
Class :character   1st Qu.: 1.266   1st Qu.:0.00371
Mode  :character   Median : 4.960   Median :0.23660
                   Mean   : 9.104   Mean   :0.32206
                   3rd Qu.: 9.924   3rd Qu.:0.58628
                   Max.   :87.610   Max.   :0.99882</styled-content>
                    </preformat>
                </p>
                <p>To account for multiple testing, we fit a beta-uniform-mixture (BUM) model to estimate the false discovery rate (FDR)
                    <sup>
                        <xref ref-type="bibr" rid="ref-20">20</xref>
                    </sup>. We show the results by overlaying the fitted model on a histogram of p-values (
                    <xref ref-type="fig" rid="f8">Figure 8</xref>).</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">suppressMessages</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">library</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(ClassComparison))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">bum &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">Bum</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(results</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">PValue)</styled-content>


                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"fig8-bum.png"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">5</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">4</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">hist</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(bum)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>
                    </preformat>
                </p>
                <fig fig-type="figure" id="f8" orientation="portrait" position="float">
                    <label>Figure 8. </label>
                    <caption>
                        <title>Histogram of p-values with overlaid BUM model.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure8.gif"/>
                </fig>
                <p>There is clear evidence of an enrichment of small p-values, indicating features that are associated with the clinical outcome in univariate models. We can determine the number of significant features and the nominal p-value cutoff associated with any given FDR.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">countSignificant</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(bum,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">alpha =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.01</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">by =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"FDR"</styled-content>)


                        <styled-content style="font-size:15px;color:#000000;">[1] 76</styled-content>


                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cutoff &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">cutoffSignificant</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(bum,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">alpha =</styled-content> 
                        <styled-content style="font-size:15px;color:#0000B1;">0.01</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">by =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"FDR"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cutoff</styled-content>


                        <styled-content style="font-size:15px;color:#000000;">[1] 0.004186874</styled-content>
                    </preformat>
                </p>
                <p>We can also count the number of significant &#x201c;discoveries&#x201d; associated with each block of correlated features, but this requires some spelunking into the depths of the mixed-type engine.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">A &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">get</styled-content>(
                        <styled-content style="font-size:15px;color:#4F9905;">"altered"</styled-content>, 
                        <styled-content style="font-size:15px;color:#000000;">mixed</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">localenv)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">N &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">nComponents</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(A)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">am &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">sapply</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(A</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">components,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">function</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(x)</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">length</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(x</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">mu))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">block &lt;-</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">rep</styled-content>(
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">:</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">N,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">times =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">am)</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">table</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">(results</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">$</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">PValue</styled-content> 
                        <styled-content style="font-size:15px;color:#CF5C00;">&lt;</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">cutoff, block)</styled-content>
                    </preformat>
                </p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#000000;">       block
         1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
  FALSE 10 11 10 11  8  6  6  6  3  8 11  7  4  9 11  6  2  0  8  7 11 11 11 11
  TRUE   1  0  1  0  3  5  5  5  8  3  0  4  7  2  0  5  9 11  3  4  0  0  0  0
       block
        25 26 27
  FALSE 11 11 11
  TRUE   0  0  0</styled-content>
                    </preformat>
                </p>
                <p>We can compare this table with a heatmap of the hit pattern (
                    <xref ref-type="fig" rid="f9">Figure 9</xref>). Only 20 of the 27 blocks were included as possible hits, and blocks 2, 4, 11, and 15 were unused. The table shows that none of the identified features were included in any of those blocks, suggesting that we made no false discoveries.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">png</styled-content>(
                        <styled-content style="font-size:15px;color:#214A87;">filename =</styled-content> 
                        <styled-content style="font-size:15px;color:#4F9905;">"heatpattern2.png"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">width=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">height=</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">*</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">res=</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">res,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">bg=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"white"</styled-content>)

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">heatmap</styled-content>(
                        <styled-content style="font-size:15px;color:#000000;">mixed</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">cm</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">@</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">hitPattern,</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">scale=</styled-content>
                        <styled-content style="font-size:15px;color:#4F9905;">"none"</styled-content>, 
                        <styled-content style="font-size:15px;color:#214A87;">ColSideColors =</styled-content> 
                        <styled-content style="font-size:15px;color:#000000;">dk[</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">1</styled-content>
                        <styled-content style="font-size:15px;color:#CF5C00;">:</styled-content>
                        <styled-content style="font-size:15px;color:#0000B1;">6</styled-content>],

                        <styled-content style="font-size:15px;color:#CF5C00;">+</styled-content>          
                        <styled-content style="font-size:15px;color:#214A87;">col = c</styled-content>(
                        <styled-content style="font-size:15px;color:#4F9905;">"gray"</styled-content>, 
                        <styled-content style="font-size:15px;color:#4F9905;">"black"</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">))</styled-content>

                        <styled-content style="font-size:15px;color:#CF5C00;">&gt;</styled-content> 
                        <styled-content style="font-size:15px;color:#214A87;">dev.off</styled-content>
                        <styled-content style="font-size:15px;color:#000000;">()</styled-content>
                    </preformat>
                </p>
                <fig fig-type="figure" id="f9" orientation="portrait" position="float">
                    <label>Figure 9. </label>
                    <caption>
                        <title>Association between latent variables and clusters.</title>
                        <p>Black pixels mark the presence of latent variables within a cluster. Columns are clusters (or subtypes); rows are latent variables (or hits).</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/28557/3e0c55f6-02c5-4c30-a19a-b0bbf6d68535_figure9.gif"/>
                </fig>
            </sec>
        </sec>
        <sec sec-type="discussion">
            <title>Discussion</title>
            <p>The Umpire R-package provides a series of tools to simulate complex, correlated, heterogeneous data for methods development and testing for omics and clinical data. The Umpire 2.0 package version described here provides an easy, user-friendly pipeline to generate clinically realistic, mixed-type data to interrogate analytic problems in clinical data. Alongside data sets with meaningful noise and complex feature interrelationships, Umpire simulates subgroup or cluster identities with known ground truth and single- and multi-group dichotomous and survival outcomes. Thus, Umpire facilitates the creation of simulations to explore a variety of methodological problems.</p>
            <p>Umpire offers the user a streamlined workflow with ample opportunities for fine-tuning and flexibility. Although this paper describes applications for clinical data, we have previously described Umpire&#x2019;s tools for simulating omics data
                <sup>
                    <xref ref-type="bibr" rid="ref-8">8</xref>
                </sup>. Furthermore, the modules of the package (e.g., Engines, NoiseModels, and make-DataTypes) may be used interchangeably. Thus, the user may choose to generate omics-scale data of noncontinuous type. The user may generate elaborate simulations by varying and increasing parameters (including, but not limited to, subgroup size or number, feature space, sample size, noise, survival model) to target an inquiry.</p>
            <p>In our use cases, we demonstrated the flexibility of Umpire for generating simulations to help evaluate a variety of applications of machine learning to clinical data. These include applications of unsupervised ML to discover subtypes (in Use case 1) and applications of supervised machine learning to find predictive or prognostic factors (in Use cases 2 and 3). The ability of Umpire to evaluate analysis methods is not confined to these use cases. Our use cases illustrating supervised ML did not exploit the fact that, using the parameters saved in a mixed-type engine, Umpire can simulate multiple data sets from the same underlying population, thus providing unlimited test and validation sets. In addition to testing algorithms head-to-head, Umpire can also be used to generate complex simulations to interrogate the &#x201c;operating characteristics&#x201d; of an algorithm. For instance, one of the still-unsolved problems in clustering is determining the true number of clusters. A researcher who has developed a new method that claims to solve this problem could simulate mixed-type data with a variety of different cluster numbers, prevalences, feature numbers, and patient sizes to determine which factors influence the accuracy of the method.</p>
            <p>We expect Umpire to have wide applicability as a tool for comparing and understanding the behavior of any ML method that has the potential to be applied to clinical data.</p>
        </sec>
        <sec>
            <title>Data availability</title>
            <p>All data underlying the results are available as part of the article and no additional source data are required.</p>
        </sec>
        <sec>
            <title>Software availability</title>
            <p>
                <bold>Umpire is freely available at the Comprehensive R Archive Network</bold>: 
                <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/Umpire/index.html">https://cran.r-project.org/web/packages/Umpire/index.html</ext-link>. </p>
            <p>
                <bold>Source code is available from R-Forge:</bold> 
                <ext-link ext-link-type="uri" xlink:href="https://r-forge.r-project.org/R/?group_id=1831">https://r-forge.r-project.org/R/?group_id=1831</ext-link>. </p>
            <p>
                <bold>Archived source code at time of publication:</bold> 
                <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/src/contrib/Archive/Umpire/">https://cran.r-project.org/src/contrib/Archive/Umpire/</ext-link> and 
                <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/record/4023106">https://doi.org/10.5281/zenodo.4023106</ext-link>
                <sup>
                    <xref ref-type="bibr" rid="ref-21">21</xref>
                </sup>.</p>
            <p>
                <bold>License:</bold> Apache License, version 2.0.</p>
        </sec>
    </body>
    <back>
        <ref-list>
            <ref id="ref-1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Raghupathi</surname>
                            <given-names>W</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Raghupathi</surname>
                            <given-names>V</given-names>
                        </name>
</person-group>:
                    <article-title>Big data analytics in healthcare: promise and potential.</article-title>
                    <source>

                        <italic toggle="yes">Health Inf Sci Syst.</italic>
</source>
                    <year>2014</year>;<volume>2</volume>:<fpage>3</fpage>.
                    <pub-id pub-id-type="pmid">25825667</pub-id>
                    <pub-id pub-id-type="doi">10.1186/2047-2501-2-3</pub-id>
                    <pub-id pub-id-type="pmcid">4341817</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Cook</surname>
                            <given-names>JA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Collins</surname>
                            <given-names>GS</given-names>
                        </name>
</person-group>:
                    <article-title>The rise of big clinical databases.</article-title>
                    <source>

                        <italic toggle="yes">Br J Surg.</italic>
</source>
                    <year>2015</year>;<volume>102</volume>(<issue>2</issue>):<fpage>e93</fpage>&#x2013;<lpage>e101</lpage>.
                    <pub-id pub-id-type="pmid">25627139</pub-id>
                    <pub-id pub-id-type="doi">10.1002/bjs.9723</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Coombes</surname>
                            <given-names>CE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Abrams</surname>
                            <given-names>ZB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Unsupervised machine learning and prognostic factors of survival in chronic lymphocytic leukemia.</article-title>
                    <source>

                        <italic toggle="yes">J Am Med Inform Assoc.</italic>
</source>
                    <year>2020</year>;<volume>27</volume>(<issue>7</issue>):<fpage>1019</fpage>&#x2013;<lpage>1027</lpage>.
                    <pub-id pub-id-type="pmid">32483590</pub-id>
                    <pub-id pub-id-type="doi">10.1093/jamia/ocaa060</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Castaldi</surname>
                            <given-names>PJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Benet</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Petersen</surname>
                            <given-names>H</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Do COPD subtypes really exist? COPD heterogeneity and clustering in 10 independent cohorts.</article-title>
                    <source>

                        <italic toggle="yes">Thorax.</italic>
</source>
                    <year>2017</year>;<volume>72</volume>(<issue>11</issue>):<fpage>998</fpage>&#x2013;<lpage>1006</lpage>.
                    <pub-id pub-id-type="pmid">28637835</pub-id>
                    <pub-id pub-id-type="doi">10.1136/thoraxjnl-2016-209846</pub-id>
                    <pub-id pub-id-type="pmcid">6013053</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Pikoula</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Quint</surname>
                            <given-names>JK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nissen</surname>
                            <given-names>F</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Identifying clinically important COPD sub-types using data-driven approaches in primary care population based electronic health records.</article-title>
                    <source>

                        <italic toggle="yes">BMC Med Inform Decis Mak.</italic>
</source>
                    <year>2019</year>;<volume>19</volume>(<issue>1</issue>):<fpage>86</fpage>.
                    <pub-id pub-id-type="pmid">30999919</pub-id>
                    <pub-id pub-id-type="doi">10.1186/s12911-019-0805-0</pub-id>
                    <pub-id pub-id-type="pmcid">6472089</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Powers</surname>
                            <given-names>BW</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Yan</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhu</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Subgroups of High-Cost Medicare Advantage Patients: an Observational Study.</article-title>
                    <source>

                        <italic toggle="yes">J Gen Intern Med.</italic>
</source>
                    <year>2019</year>;<volume>34</volume>(<issue>2</issue>):<fpage>218</fpage>&#x2013;<lpage>225</lpage>.
                    <pub-id pub-id-type="pmid">30511290</pub-id>
                    <pub-id pub-id-type="doi">10.1007/s11606-018-4759-1</pub-id>
                    <pub-id pub-id-type="pmcid">6374249</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Foss</surname>
                            <given-names>AH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Markatou</surname>
                            <given-names>M</given-names>
                        </name>
</person-group>:
                    <article-title>kamila: clustering mixed-type data in R and Hadoop.</article-title>
                    <source>

                        <italic toggle="yes">J Stat Softw.</italic>
</source>
                    <year>2018</year>;<volume>83</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>44</lpage>.
                    <pub-id pub-id-type="doi">10.18637/jss.v083.i13</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Coombes</surname>
                            <given-names>KR</given-names>
                        </name>
</person-group>:
                    <article-title>Sources of variation in false discovery rate estimation include sample size, correlation, and inherent differences between groups.</article-title>
                    <source>

                        <italic toggle="yes">BMC Bioinformatics.</italic>
</source>
                    <year>2012</year>;<volume>13</volume>(<issue>Suppl 13</issue>):<fpage>S1</fpage>.
                    <pub-id pub-id-type="pmid">23320794</pub-id>
                    <pub-id pub-id-type="doi">10.1186/1471-2105-13-S13-S1</pub-id>
                    <pub-id pub-id-type="pmcid">3426804</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Marlin</surname>
                            <given-names>BM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kale</surname>
                            <given-names>DC</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Khemani</surname>
                            <given-names>RG</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Unsupervised pattern discovery in electronic health care data using probabilistic clustering models.</article-title>
                    <source>
In: 
                        <italic toggle="yes">Proceedings of the 2nd ACM SIGHIT international health informatics symposium.</italic>
</source>
                    <year>2012</year>;<fpage>389</fpage>&#x2013;<lpage>398</lpage>.
                    <pub-id pub-id-type="doi">10.1145/2110363.2110408</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wen</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Symmans</surname>
                            <given-names>WF</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The bimodality index: a criterion for discovering and ranking bimodal signatures from cancer gene expression profiling data.</article-title>
                    <source>

                        <italic toggle="yes">Cancer Inform.</italic>
</source>
                    <year>2009</year>;<volume>7</volume>:<fpage>199</fpage>&#x2013;<lpage>216</lpage>.
                    <pub-id pub-id-type="pmid">19718451</pub-id>
                    <pub-id pub-id-type="doi">10.4137/cin.s2846</pub-id>
                    <pub-id pub-id-type="pmcid">2730180</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>JH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rhee</surname>
                            <given-names>CK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kim</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Identification of subtypes in subjects with mild-to-moderate airflow limitation and its clinical and socioeconomic implications.</article-title>
                    <source>

                        <italic toggle="yes">Int J Chron Obstruct Pulmon Dis.</italic>
</source>
                    <year>2017</year>;<volume>12</volume>:<fpage>1135</fpage>&#x2013;<lpage>1144</lpage>.
                    <pub-id pub-id-type="pmid">28442900</pub-id>
                    <pub-id pub-id-type="doi">10.2147/COPD.S130140</pub-id>
                    <pub-id pub-id-type="pmcid">5396836</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Bose</surname>
                            <given-names>E</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Radhakrishnan</surname>
                            <given-names>K</given-names>
                        </name>
</person-group>:
                    <article-title>Using Unsupervised Machine Learning to Identify Subgroups Among Home Health Patients With Heart Failure Using Telehealth.</article-title>
                    <source>

                        <italic toggle="yes">Comput Inform Nurs.</italic>
</source>
                    <year>2018</year>;<volume>36</volume>(<issue>5</issue>):<fpage>242</fpage>&#x2013;<lpage>248</lpage>.
                    <pub-id pub-id-type="pmid">29494361</pub-id>
                    <pub-id pub-id-type="doi">10.1097/CIN.0000000000000423</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Yan</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Linn</surname>
                            <given-names>KA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Powers</surname>
                            <given-names>BW</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Applying Machine Learning Algorithms to Segment High-Cost Patient Populations.</article-title>
                    <source>

                        <italic toggle="yes">J Gen Intern Med.</italic>
</source>
                    <year>2019</year>;<volume>34</volume>(<issue>2</issue>):<fpage>211</fpage>&#x2013;<lpage>217</lpage>.
                    <pub-id pub-id-type="pmid">30543022</pub-id>
                    <pub-id pub-id-type="doi">10.1007/s11606-018-4760-8</pub-id>
                    <pub-id pub-id-type="pmcid">6374273</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Burgel</surname>
                            <given-names>PR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Paillasseur</surname>
                            <given-names>JL</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Caillaud</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Clinical COPD phenotypes: a novel approach using principal component and cluster analyses.</article-title>
                    <source>

                        <italic toggle="yes">Eur Respir J.</italic>
</source>
                    <year>2010</year>;<volume>36</volume>(<issue>3</issue>):<fpage>531</fpage>&#x2013;<lpage>9</lpage>.
                    <pub-id pub-id-type="pmid">20075045</pub-id>
                    <pub-id pub-id-type="doi">10.1183/09031936.00175109</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Inohara</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Shrader</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pieper</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Association of Atrial Fibrillation Clinical Phenotypes With Treatment Patterns and Outcomes: A Multicenter Registry Study.</article-title>
                    <source>

                        <italic toggle="yes">JAMA Cardiol.</italic>
</source>
                    <year>2018</year>;<volume>3</volume>(<issue>1</issue>):<fpage>54</fpage>&#x2013;<lpage>63</lpage>.
                    <pub-id pub-id-type="pmid">29128866</pub-id>
                    <pub-id pub-id-type="doi">10.1001/jamacardio.2017.4665</pub-id>
                    <pub-id pub-id-type="pmcid">5833527</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Egan</surname>
                            <given-names>BM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sutherland</surname>
                            <given-names>SE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tilkemeier</surname>
                            <given-names>PL</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A cluster-based approach for integrating clinical management of Medicare beneficiaries with multiple chronic conditions.</article-title>
                    <source>

                        <italic toggle="yes">PLoS One.</italic>
</source>
                    <year>2019</year>;<volume>14</volume>(<issue>6</issue>):<fpage>e0217696</fpage>.
                    <pub-id pub-id-type="pmid">31216301</pub-id>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0217696</pub-id>
                    <pub-id pub-id-type="pmcid">6584004</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kaufman</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rousseeuw</surname>
                            <given-names>PJ</given-names>
                        </name>
</person-group>:
                    <article-title>Finding Groups in Data: An Introduction to Cluster Analysis</article-title>. John Wiley &amp; Sons, Hoboken, NJ. <year>1990</year>.
                    <pub-id pub-id-type="doi">10.1002/9780470316801</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kohonen</surname>
                            <given-names>T</given-names>
                        </name>
</person-group>:
                    <article-title>Self-Organizing Maps</article-title>. Springer-Verlag, Berlin; Heidelberg; New York, third edition, <year>2001</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://www.springer.com/gp/book/9783540679219">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hubert</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Arabie</surname>
                            <given-names>P</given-names>
                        </name>
</person-group>:
                    <article-title>Comparing partitions.</article-title>
                    <source>

                        <italic toggle="yes">J Classif.</italic>
</source>
                    <year>1985</year>;<volume>2</volume>(<issue>1</issue>):<fpage>193</fpage>&#x2013;<lpage>218</lpage>.
                    <pub-id pub-id-type="doi">10.1007/BF01908075</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Pounds</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Morris</surname>
                            <given-names>SW</given-names>
                        </name>
</person-group>:
                    <article-title>Estimating the occurrence of false positives and false negatives in microarray studies by approximating and partitioning the empirical distribution of 
                        <italic toggle="yes">p</italic>-values.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2003</year>;<volume>19</volume>(<issue>10</issue>):<fpage>1236</fpage>&#x2013;<lpage>42</lpage>.
                    <pub-id pub-id-type="pmid">12835267</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/btg148</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Coombes</surname>
                            <given-names>KR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Coombes</surname>
                            <given-names>CE</given-names>
                        </name>
</person-group>:
                    <article-title>Umpire 2.0: An R Package to simulate realistic gene expression and clinical data</article-title>. <year>2020</year>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.doi.org/10.5281/zenodo.4023106">http://www.doi.org/10.5281/zenodo.4023106</ext-link>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report75047">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.28557.r75047</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Wrobel</surname>
                        <given-names>Julia</given-names>
                    </name>
                    <xref ref-type="aff" rid="r75047a1">1</xref>
                    <role>Referee</role>
                </contrib>
                <aff id="r75047a1">
                    <label>1</label>Department of Biostatistics and Informatics, Colorado School of Public Health, University of Colorado Anschutz Medical Campus, Aurora, CO, USA</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>2</day>
                <month>12</month>
                <year>2020</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2020 Wrobel J</copyright-statement>
                <copyright-year>2020</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport75047" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.25877.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The article introduces Umpire 2.0, an R package that builds on its previous version (Umpire 1.0) and provides a way to simulate complicated clinical data that has survival and genomics components. The package has a wide range of useful capabilities, and will be helpful for researchers who need simulated data with ground truth to assess the performance of their supervised and unsupervised machine learning methods. However, some work can be done to clarify the interpretation and underlying mechanism of the package. A few specific comments are given as bullet points below. 
                <list list-type="bullet">
                    <list-item>
                        <p>One of the key improvements for Umpire 2.0 is the ability to simulate "mixed-type" data. However, "mixed-type" is not clearly defined when it is introduced. Its meaning becomes clear in later sections, but it would be helpful to the reader to define it more explicitly at the outset.</p>
                    </list-item>
                    <list-item>
                        <p>The interpretation of Figures 2 and 3 is not well explained. In the text or a caption, an explanation of what is learned from these figures should be added. For example with Figure 2, it seems that the dendrogram on top shows the four clusters, but what does the dendrogram on the left side represent? What features are being fed in here? Similarly, how are the two plots in Figure 3 related? Are the clusters assigned using the true underlying cluster values?</p>
                    </list-item>
                    <list-item>
                        <p>It seems that the package was built using an S4 object-oriented paradigm. It would be helpful to include a paragraph or two about how the package was structured and implemented using this approach.</p>
                    </list-item>
                </list>
            </p>
            <p>Are the conclusions about the tool and its performance adequately supported by the findings presented in the article?</p>
            <p>Yes</p>
            <p>Is the rationale for developing the new software tool clearly explained?</p>
            <p>Yes</p>
            <p>Is the description of the software tool technically sound?</p>
            <p>Yes</p>
            <p>Are sufficient details of the code, methods and analysis (if applicable) provided to allow replication of the software development and its use by others?</p>
            <p>Partly</p>
            <p>Is sufficient information provided to allow interpretation of the expected output datasets and any results generated using the tool?</p>
            <p>No</p>
            <p>Reviewer Expertise:</p>
            <p>Biomedical imaging, statistical software development, wearable technology.</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
        <sub-article article-type="response" id="comment6343-75047">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Coombes</surname>
                            <given-names>Kevin</given-names>
                        </name>
                        <aff>Population Health Sciences, Georgia Cancer Center at Augusta University, Augusta, Georgia, USA</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>None</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>9</day>
                    <month>2</month>
                    <year>2021</year>
                </pub-date>
            </front-stub>
            <body>
                <p>In response to the comments from Reviewer 1 (Julia Wrobel), we:</p>
                <p> </p>
                <p> 1. Added a sentence to the abstract to clarify the meaning of "mixed type data".</p>
                <p> 2. Expanded the captions on Figures 2, 3, and 4 in order to make them easier to understand.</p>
                <p> 3. Added several paragraphs to the "Implementation" section to highlight the use of S4 classes in our R package.</p>
            </body>
        </sub-article>
    </sub-article>
</article>
