<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="methods-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.21539.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Method Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Ensemble machine learning modeling for the prediction of artemisinin resistance in malaria</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: awaiting peer review]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Ford</surname>
                        <given-names>Colby T.</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Janies</surname>
                        <given-names>Daniel</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Resources</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Department of Bioinformatics and Genomics, University of North Carolina at Charlotte, Charlotte, North Carolina, 28223, USA</aff>
                <aff id="a2">
                    <label>2</label>School of Data Science, University of North Carolina at Charlotte, Charlotte, North Carolina, 28223, USA</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:colby.ford@uncc.edu">colby.ford@uncc.edu</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>29</day>
                <month>1</month>
                <year>2020</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2020</year>
            </pub-date>
            <volume>9</volume>
            <elocation-id>62</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>16</day>
                    <month>1</month>
                    <year>2020</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2020 Ford CT and Janies D</copyright-statement>
                <copyright-year>2020</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/9-62/pdf"/>
            <abstract>
                <p>Resistance in malaria is a growing concern affecting many areas of Sub-Saharan Africa and Southeast Asia. Since the emergence of artemisinin resistance in the late 2000s in Cambodia, research into the underlying mechanisms has been underway. The 2019 Malaria Challenge posed the task of developing computational models that address important problems in advancing the fight against malaria. The first goal was to accurately predict artemisinin drug resistance levels of 
                    <italic toggle="yes">Plasmodium falciparum</italic> isolates, as quantified by the IC
                    <sub>50</sub>. The second goal was to predict the parasite clearance rate of malaria parasite isolates based on 
                    <italic toggle="yes">in vitro</italic> transcriptional profiles.</p>
                <p>In this work, we develop machine learning models using novel methods for transforming isolate data and handling the tens of thousands of variables that result from these data transformation exercises. This is demonstrated by using massively parallel processing of the data vectorization for use in scalable machine learning. In addition, we show the utility of ensemble machine learning modeling for highly effective predictions of both goals of this challenge. This is demonstrated by the use of multiple machine learning algorithms combined with various scaling and normalization preprocessing steps. Then, using a voting ensemble, multiple models are combined to generate a final model prediction.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>malaria</kwd>
                <kwd>Plasmodium falciparum</kwd>
                <kwd>machine learning</kwd>
                <kwd>parallel computing</kwd>
                <kwd>Apache Spark</kwd>
                <kwd>big data</kwd>
                <kwd>artemisinin</kwd>
                <kwd>bioinformatics</kwd>
                <kwd>DREAM Competition</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1" xlink:href="http://dx.doi.org/10.13039/100010942">
                    <funding-source>University of North Carolina at Charlotte</funding-source>
                </award-group>
                <funding-statement>This work was supported by the University of North Carolina at Charlotte Department of Bioinformatics and Genomics and the School of Data Science.</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="intro">
            <title>Introduction</title>
            <p>Malaria is a serious disease caused by parasites belonging to the genus 
                <italic toggle="yes">Plasmodium</italic>, which are transmitted by mosquitoes in the genus 
                <italic toggle="yes">Anopheles</italic>. The World Health Organization (WHO) reports that there were 219 million cases of malaria in 2017 across 87 countries
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>
                </sup>. 
                <italic toggle="yes">Plasmodium falciparum</italic> poses one of the greatest health threats in Southeast Asia, being responsible for 62.8% of malaria cases in the region in 2017
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>
                </sup>.</p>
            <p>Artemisinin-based therapies are among the best treatment options for malaria caused by 
                <italic toggle="yes">P. falciparum</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref-2">2</xref>
                </sup>. However, emergence of artemisinin resistance in Thailand and Cambodia in 2007 has been cause for research
                <sup>
                    <xref ref-type="bibr" rid="ref-3">3</xref>
                </sup>. While there are polymorphisms in the kelch domain&#x2013;carrying protein K13 in 
                <italic toggle="yes">P. falciparum</italic> that are known to be associated with artemisinin resistance, the underlying molecular mechanism that confers resistance remains unknown
                <sup>
                    <xref ref-type="bibr" rid="ref-4">4</xref>
                </sup>. The established pharmacodynamics benchmark for 
                <italic toggle="yes">P. falciparum</italic> sensitivity to artemisinin-based therapy is the parasite clearance rate
                <sup>
                    <xref ref-type="bibr" rid="ref-5">5</xref>,
                    <xref ref-type="bibr" rid="ref-6">6</xref>
                </sup>. Resistance to artemisinin-based therapy is considered to be present with a parasite clearance rate greater than five hours
                <sup>
                    <xref ref-type="bibr" rid="ref-7">7</xref>
                </sup>. By understanding the genetic factors that affect resistance in malaria, targeted development can occur in an effort to abate further resistance or infections of resistant strains.</p>
        </sec>
        <sec>
            <title>Prediction of artemisinin IC
                <sub>50</sub>
            </title>
            <p>First, we created a machine learning model to predict the IC
                <sub>50</sub> of malaria parasites based on transcription profiles of experimentally-tested isolates. IC
                <sub>50</sub>, also known as the half maximal inhibitory concentration, is the drug concentration at which 50% of parasites die. This value indicates the ability of a population of parasites to withstand various doses of anti-malarial drugs, such as artemisinin.</p>
            <sec sec-type="methods">
                <title>Methods</title>
                <p>Training data was obtained from the 2019 DREAM Malaria Challenge
                    <sup>
                        <xref ref-type="bibr" rid="ref-8">8</xref>,
                        <xref ref-type="bibr" rid="ref-9">9</xref>
                    </sup>. The training data consists of gene expression data of 5,540 genes of 30 isolates from the malaria parasite, 
                    <italic toggle="yes">Plasmodium falciparum</italic>. For each malaria parasite isolate, transcription data was collected at two time points [6 hours post invasion (hpi) and 24 hpi], with and without treatment of dihydroartemisinin (the metabolically active form of artemisinin), each with a biological replicate. This yields a total of at least eight data points for each isolate. The initial form of the training dataset contains 272 rows and 5,546 columns, as shown in 
                    <xref ref-type="table" rid="T1">Table 1</xref>.</p>
                <table-wrap id="T1" orientation="portrait" position="anchor">
                    <label>Table 1. </label>
                    <caption>
                        <title>Initial IC
                            <sub>50</sub> model training data format.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Sample_Name</th>
                                <th colspan="1" rowspan="1">Isolate</th>
                                <th colspan="1" rowspan="1">Timepoint</th>
                                <th colspan="1" rowspan="1">Treatment</th>
                                <th colspan="1" rowspan="1">BioRep</th>
                                <th colspan="1" rowspan="1">Gene
                                    <sub>1</sub>
                                </th>
                                <th colspan="1" rowspan="1">&#x2026;</th>
                                <th colspan="1" rowspan="1">Gene
                                    <sub>5540</sub>
                                </th>
                                <th colspan="1" rowspan="1">DHA_IC50</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.24HR.DHA.BRep1</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">24HR</td>
                                <td colspan="1" rowspan="1">DHA</td>
                                <td colspan="1" rowspan="1">BRep1</td>
                                <td colspan="1" rowspan="1">0.008286</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-2.48653</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.24HR.DHA.BRep2</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">24HR</td>
                                <td colspan="1" rowspan="1">DHA</td>
                                <td colspan="1" rowspan="1">BRep2</td>
                                <td colspan="1" rowspan="1">-0.87203</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-1.79457</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.24HR.UT.BRep1</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">24HR</td>
                                <td colspan="1" rowspan="1">UT</td>
                                <td colspan="1" rowspan="1">BRep1</td>
                                <td colspan="1" rowspan="1">0.03948</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-2.49517</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.24HR.UT.BRep2</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">24HR</td>
                                <td colspan="1" rowspan="1">UT</td>
                                <td colspan="1" rowspan="1">BRep2</td>
                                <td colspan="1" rowspan="1">0.125177</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-1.73531</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.6HR.DHA.BRep1</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">6HR</td>
                                <td colspan="1" rowspan="1">DHA</td>
                                <td colspan="1" rowspan="1">BRep1</td>
                                <td colspan="1" rowspan="1">1.354956</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-0.82169</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.6HR.DHA.BRep2</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">6HR</td>
                                <td colspan="1" rowspan="1">DHA</td>
                                <td colspan="1" rowspan="1">BRep2</td>
                                <td colspan="1" rowspan="1">-0.21807</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-1.61839</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.6HR.UT.BRep1</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">6HR</td>
                                <td colspan="1" rowspan="1">UT</td>
                                <td colspan="1" rowspan="1">BRep1</td>
                                <td colspan="1" rowspan="1">1.31135</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-2.62262</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01.6HR.UT.BRep2</td>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">6HR</td>
                                <td colspan="1" rowspan="1">UT</td>
                                <td colspan="1" rowspan="1">BRep2</td>
                                <td colspan="1" rowspan="1">0.997722</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-2.24719</td>
                                <td colspan="1" rowspan="1">2.177</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_30.6HR.UT.BRep2</td>
                                <td colspan="1" rowspan="1">isolate_30</td>
                                <td colspan="1" rowspan="1">6HR</td>
                                <td colspan="1" rowspan="1">UT</td>
                                <td colspan="1" rowspan="1">BRep2</td>
                                <td colspan="1" rowspan="1">-0.26639</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-1.72273</td>
                                <td colspan="1" rowspan="1">1.363</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <p>The transcription data was collected as described in 
                    <xref ref-type="table" rid="T2">Table 2</xref>. The transcription data set includes 92 non-coding RNAs (denoted by gene IDs that begin with &#x2018;MAL&#x2019;), while the rest are protein-coding genes (denoted by gene IDs that start with &#x2018;PF3D7&#x2019;). The feature to predict is 
                    <italic toggle="yes">DHA_IC50</italic>.</p>
                <table-wrap id="T2" orientation="portrait" position="anchor">
                    <label>Table 2. </label>
                    <caption>
                        <title>IC
                            <sub>50</sub> training data information.</title>
                        <p>(Adapted from Turnbull 
                            <italic toggle="yes">et al</italic>., (2017) PLoS One
                            <sup>
                                <xref ref-type="bibr" rid="ref-11">11</xref>
                            </sup>).</p>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1"/>
                                <th colspan="1" rowspan="1">Training Set</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">Array</td>
                                <td colspan="1" rowspan="1">Bozdech</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Platform</td>
                                <td colspan="1" rowspan="1">Printed</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Plexes</td>
                                <td colspan="1" rowspan="1">1</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Unique Probes</td>
                                <td colspan="1" rowspan="1">10159</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Range of Probes per Exon</td>
                                <td colspan="1" rowspan="1">N/A</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Average Probes per Gene</td>
                                <td colspan="1" rowspan="1">2</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Genes Represented</td>
                                <td colspan="1" rowspan="1">5363</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Transcript Isoform Profiling</td>
                                <td colspan="1" rowspan="1">No</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">ncRNAs</td>
                                <td colspan="1" rowspan="1">No</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Channel Detection Method</td>
                                <td colspan="1" rowspan="1">Two Color</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Scanner</td>
                                <td colspan="1" rowspan="1">PowerScanner</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Data Extraction</td>
                                <td colspan="1" rowspan="1">GenePix Pro</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
            <sec>
                <title>Data preparation</title>
                <p>We used Apache Spark
                    <sup>
                        <xref ref-type="bibr" rid="ref-10">10</xref>
                    </sup> to pivot the dataset such that each isolate was its own row and each of the transcription values for each gene and attribute (i.e., timepoint, treatment, biological replicate) combination was its own column. This exercise transformed the training dataset from 272 rows and 5,546 columns to 30 rows and 44,343 columns, as shown in 
                    <xref ref-type="table" rid="T3">Table 3</xref>. We completed this pivot by slicing the data by each of the eight combinations of timepoint, treatment, and biological replicate, dynamically renaming the variables (genes) for each slice, and then joining all eight slices back together.</p>
                <table-wrap id="T3" orientation="portrait" position="anchor">
                    <label>Table 3. </label>
                    <caption>
                        <title>Post-transformation format of the IC
                            <sub>50</sub> model training data.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Isolate</th>
                                <th colspan="1" rowspan="1">DHA_IC50</th>
                                <th colspan="1" rowspan="1">hr24_trDHA_br1_Gene
                                    <sub>1</sub>
                                </th>
                                <th colspan="1" rowspan="1">hr24_trDHA_br2_Gene
                                    <sub>1</sub>
                                </th>
                                <th align="center" colspan="1" rowspan="1" valign="top">&#x2026;</th>
                                <th colspan="1" rowspan="1">hr6_trUT_br2_Gene
                                    <sub>5540</sub>
                                </th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_01</td>
                                <td colspan="1" rowspan="1">2.177</td>
                                <td colspan="1" rowspan="1">0.008286</td>
                                <td colspan="1" rowspan="1">-0.87203</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-2.24719</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">isolate_30</td>
                                <td colspan="1" rowspan="1">1.363</td>
                                <td colspan="1" rowspan="1">0.195032</td>
                                <td colspan="1" rowspan="1">0.031504</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-1.72273</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <p>Example code is shown below in the section labeled 
                    <xref ref-type="other" rid="C1">code 1</xref>. By using the massively parallel architecture of Spark, this transformation can be completed in a minimal amount of time on a relatively small cluster environment (e.g., &lt;10 minutes using an 8-worker/36-core cluster with PySpark on Apache Spark 2.4.3).</p>
                <p id="C1">
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="color:#000000;font-size:15px"> 1 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Separate Dependent Variable                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 2 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">y </styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#000000;font-size:15px">train</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">select(col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">),                                             </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 3                  </styled-content>
                        <styled-content style="color:#000000;font-size:15px">col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"DHA_IC50"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)) \                                          </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 4          </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">distinct()                                                         </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 5                                                                              </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 6 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">##  Create Slice [Timepoint: 24HR, Treatment: DHA, BioRep: BRep1]            </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 7 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">hr24_trDHA_br1 </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">train</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">drop(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Sample_Name"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"DHA_IC50"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                      </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 8                       </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">filter((col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Timepoint"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) </styled-content>
                        <styled-content style="color:#666666;font-size:15px">== </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"24HR"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) </styled-content>
                        <styled-content style="color:#666666;font-size:15px">&amp;                 </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 9                               </styled-content>
                        <styled-content style="color:#000000;font-size:15px">(col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Treatment"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) </styled-content>
                        <styled-content style="color:#666666;font-size:15px">== </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"DHA"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)</styled-content> 
                        <styled-content style="color:#666666;font-size:15px">&amp;                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">10                               </styled-content>
                        <styled-content style="color:#000000;font-size:15px">(col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"BioRep"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) </styled-content>
                        <styled-content style="color:#666666;font-size:15px">== </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"BRep1"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">))                    </styled-content>                                            

                        <styled-content style="color:#000000;font-size:15px">11 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Rename Columns                                                            </styled-content>

                        <styled-content style="color:#000000;font-size:15px">12 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">column_list </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">hr24_trDHA_br1</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">columns                                         </styled-content>

                        <styled-content style="color:#000000;font-size:15px">13 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">prefix </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"hr24_trDHA_br1_"                                                   </styled-content>

                        <styled-content style="color:#000000;font-size:15px">14 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">new_column_list </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[prefix </styled-content>
                        <styled-content style="color:#666666;font-size:15px">+ </styled-content>
                        <styled-content style="color:#000000;font-size:15px">s </styled-content>
                        <styled-content style="color:#008000;font-size:15px">if </styled-content>
                        <styled-content style="color:#000000;font-size:15px">s </styled-content>
                        <styled-content style="color:#666666;font-size:15px">!= </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate" </styled-content>
                        <styled-content style="color:#008000;font-size:15px">else </styled-content>
                        <styled-content style="color:#000000;font-size:15px">s </styled-content>
                        <styled-content style="color:#008000;font-size:15px">for </styled-content>
                        <styled-content style="color:#000000;font-size:15px">s </styled-content>
                        <styled-content style="color:#AB21FF;font-size:15px">in </styled-content>
                        <styled-content style="color:#000000;font-size:15px">column_list] </styled-content>

                        <styled-content style="color:#000000;font-size:15px">15                                                                              </styled-content>

                        <styled-content style="color:#000000;font-size:15px">16 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">column_mapping </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[[o, n] </styled-content>
                        <styled-content style="color:#008000;font-size:15px">for </styled-content>
                        <styled-content style="color:#000000;font-size:15px">o, n </styled-content>
                        <styled-content style="color:#AB21FF;font-size:15px">in </styled-content>
                        <styled-content style="color:#008000;font-size:15px">zip</styled-content>
                        <styled-content style="color:#000000;font-size:15px">(column_list, new_column_list)]      </styled-content>

                        <styled-content style="color:#000000;font-size:15px">17                                                                              </styled-content>

                        <styled-content style="color:#000000;font-size:15px">18 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">hr24_trDHA_br1 </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">hr24_trDHA_br1</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">select(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">list</styled-content>
                        <styled-content style="color:#000000;font-size:15px">(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">map</styled-content>
                        <styled-content style="color:#000000;font-size:15px">(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">lambda </styled-content>
                        <styled-content style="color:#000000;font-size:15px">old, new: col(old) \  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">19                                </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">alias(new),</styled-content>
                        <styled-content style="color:#666666;font-size:15px">*</styled-content>
                        <styled-content style="color:#008000;font-size:15px">zip</styled-content>
                        <styled-content style="color:#000000;font-size:15px">(</styled-content>
                        <styled-content style="color:#666666;font-size:15px">*</styled-content>
                        <styled-content style="color:#000000;font-size:15px">column_mapping))))          </styled-content>

                        <styled-content style="color:#000000;font-size:15px">20                                                                              </styled-content>

                        <styled-content style="color:#000000;font-size:15px">21 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Join Slices Together                                                      </styled-content>

                        <styled-content style="color:#000000;font-size:15px">22 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">data </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">y</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr24_trDHA_br1, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                       </styled-content>

                        <styled-content style="color:#000000;font-size:15px">23         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr24_trDHA_br2, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                       </styled-content>

                        <styled-content style="color:#000000;font-size:15px">24         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr24_trUT_br1, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">25         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr24_trUT_br2, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">26         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr6_trDHA_br1, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">27         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr6_trDHA_br2, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">28         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr6_trUT_br1, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \                         </styled-content>

                        <styled-content style="color:#000000;font-size:15px">29         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">join(hr6_trUT_br2, </styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"Isolate"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, how</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'left'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)                           </styled-content>
                    </preformat>
                </p>
                <p>Lastly, the dataset is vectorized using the Spark 
                    <monospace>VectorAssembler</monospace> and converted into a NumPy
                    <sup>
                        <xref ref-type="bibr" rid="ref-12">12</xref>
                    </sup>-compatible array. Example code is shown below in 
                    <xref ref-type="other" rid="C1">Code 1</xref>. Vectorization allows for highly scalable parallelization of the machine learning modeling in the next step.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="color:#000000;font-size:15px"> 1 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Transform Data using VectorAssembler                                          </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 2 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">assemblerInputs </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">numericalColumns                                               </styled-content>  

                        <styled-content style="color:#000000;font-size:15px"> 3 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">assembler </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">VectorAssembler(inputCols </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">assemblerInputs, outputCol</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"features"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \ </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 4             </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">setHandleInvalid(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"keep"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)                                            </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 5 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">stages </styled-content>
                        <styled-content style="color:#666666;font-size:15px">+= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[assembler]                                                            </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 6                                                                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 7 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">prepPipeline </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">Pipeline()</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">setStages(stages)                                      </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 8 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pipelineModel </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">prepPipeline</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">fit(data)                                           </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 9 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">vectordata </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pipelineModel</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">transform(data) \                                     </styled-content>

                        <styled-content style="color:#000000;font-size:15px">10                           </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">select(col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"DHA_IC50"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">), col(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"features"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)) \            </styled-content>

                        <styled-content style="color:#000000;font-size:15px">11                           </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">withColumnRenamed(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"DHA_IC50"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"label"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)                 </styled-content>

                        <styled-content style="color:#000000;font-size:15px">12                                                                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">13 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Convert to Numpy Array                                                        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">14 </styled-content>
                        <styled-content style="color:#008000;font-size:15px">import </styled-content>
                        <styled-content style="color:#0000FF;font-size:15px">numpy </styled-content>
                        <styled-content style="color:#008000;font-size:15px">as </styled-content>
                        <styled-content style="color:#0000FF;font-size:15px">np                                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">15 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pddata </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">vectordata</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">toPandas()                                                   </styled-content>

                        <styled-content style="color:#000000;font-size:15px">16 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">seriesdata </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pddata[</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'features'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">]</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">apply(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">lambda </styled-content>
                        <styled-content style="color:#000000;font-size:15px">x : np</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">array(x</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">toArray())) \        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">17              </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">as_matrix()</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">reshape(</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,</styled-content>
                        <styled-content style="color:#666666;font-size:15px">1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)                                          </styled-content>

                        <styled-content style="color:#000000;font-size:15px">18 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">X_train </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">np</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">apply_along_axis(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">lambda </styled-content>
                        <styled-content style="color:#000000;font-size:15px">x : x[</styled-content>
                        <styled-content style="color:#666666;font-size:15px">0</styled-content>
                        <styled-content style="color:#000000;font-size:15px">], </styled-content>
                        <styled-content style="color:#666666;font-size:15px">1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, seriesdata)                    </styled-content>

                        <styled-content style="color:#000000;font-size:15px">19 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">y_train </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pddata[</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'label'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">]</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">values</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">reshape(</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,</styled-content>
                        <styled-content style="color:#666666;font-size:15px">1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">ravel()                           </styled-content>

                        <styled-content style="color:#000000;font-size:15px">20                                                                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">21 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Example Output (X_train) After Vectorization                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">22 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">array([[</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.62161893</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.60860881</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.11331369</styled-content>
                        <styled-content style="color:#000000;font-size:15px;background-color:#F8F8F8">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.457377 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">23         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-3.292903  </styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.869169 </styled-content>
                        <styled-content style="color:#000000;font-size:15px"> ],                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">24        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.55719008</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-2.41660489</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.39244109</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.770098 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">25         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-3.698841  </styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.740082 </styled-content>
                        <styled-content style="color:#000000;font-size:15px"> ],                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">26        </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                                                                      </styled-content>

                        <styled-content style="color:#000000;font-size:15px">27        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.17072536</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-2.32828532</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.08406554</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.402658 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">28          </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-5.314896 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.328886 </styled-content>
                        <styled-content style="color:#000000;font-size:15px"> ],                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">29        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.1923732 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.88763881</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.23867258</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.971246 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">30          </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-3.567355 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.904116 </styled-content>
                        <styled-content style="color:#000000;font-size:15px"> ]])                                              </styled-content>
                    </preformat>
                </p>
            </sec>
            <sec>
                <title>Machine learning</title>
                <p>We used the Microsoft Azure Machine Learning Service
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>
                    </sup> as the tracking platform for retaining model performance metrics as the various models were generated. For this use case, 498 machine learning models were trained using various scaling techniques and algorithms. We then created two ensemble models of the individual models using stack ensemble and voting ensemble methods. Scaling and normalization methods are shown in 
                    <xref ref-type="table" rid="T14">Table 14</xref>.</p>
                <p>The Microsoft AutoML package
                    <sup>
                        <xref ref-type="bibr" rid="ref-14">14</xref>
                    </sup> allows for the parallel creation and testing of various models, fitting based on a primary metric. For this use case, models were trained using Decision Tree, Elastic Net, Extreme Random Tree, Gradient Boosting, Lasso Lars, LightGBM, RandomForest, and Stochastic Gradient Descent algorithms along with various scaling methods: Maximum Absolute Scaler, Min/Max Scaler, Principal Component Analysis, Robust Scaler, Sparse Normalizer, Standard Scaler Wrapper, and Truncated Singular Value Decomposition Wrapper (as defined in 
                    <xref ref-type="table" rid="T14">Table 14</xref>). All of the machine learning algorithms are from the 
                    <italic toggle="yes">scikit-learn</italic> package
                    <sup>
                        <xref ref-type="bibr" rid="ref-15">15</xref>
                    </sup> except for LightGBM, which is from the 
                    <italic toggle="yes">LightGBM</italic> package
                    <sup>
                        <xref ref-type="bibr" rid="ref-16">16</xref>
                    </sup>. The settings for the model sweep are defined in 
                    <xref ref-type="table" rid="T4">Table 4</xref>. The &#x2018;Preprocess Data?&#x2019; parameter enables the scaling and imputation of the features in the data.</p>
                <table-wrap id="T4" orientation="portrait" position="anchor">
                    <label>Table 4. </label>
                    <caption>
                        <title>Model search parameter setting for the IC
                            <sub>50</sub> model search.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Parameter</th>
                                <th colspan="1" rowspan="1">Value</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">Task</td>
                                <td colspan="1" rowspan="1">Regression</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Number of Iterations</td>
                                <td colspan="1" rowspan="1">500</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Iteration Timeout (minutes)</td>
                                <td colspan="1" rowspan="1">20</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Max Cores per Iteration</td>
                                <td colspan="1" rowspan="1">7</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Primary Metric</td>
                                <td colspan="1" rowspan="1">Normalized Root Mean
                                    <break/>Squared Error</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Preprocess Data?</td>
                                <td colspan="1" rowspan="1">True</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">k-Fold Cross-Validations</td>
                                <td colspan="1" rowspan="1">20 folds</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <p>Once the 498 individual models were trained, two ensemble models (voting ensemble and stack ensemble) were created and tested. The voting ensemble method makes a prediction based on the weighted average of the previous models&#x2019; predicted regression outputs, whereas the stack ensemble method combines the previous models and trains a meta-model using the elastic net algorithm based on the output from the previous models. The model selection method used was the Caruana ensemble selection algorithm
                    <sup>
                        <xref ref-type="bibr" rid="ref-17">17</xref>
                    </sup>.</p>
            </sec>
            <sec sec-type="results">
                <title>Results</title>
                <p>The voting ensemble model (using soft voting) was selected as the best model, having the lowest normalized Root Mean Squared Error (RMSE), as shown in 
                    <xref ref-type="table" rid="T5">Table 5</xref>. The top 10 models trained are reported in 
                    <xref ref-type="table" rid="T6">Table 6</xref>. Having a normalized RMSE of only 0.1228 and a Mean Absolute Percentage Error (MAPE) of 24.27%, this model is expected to accurately predict IC
                    <sub>50</sub> in malaria isolates. See 
                    <xref ref-type="fig" rid="f1">Figure 1</xref> for a visualization of the experiment runs and 
                    <xref ref-type="fig" rid="f2">Figure 2</xref> for the distribution of residuals on the best model.</p>
                <table-wrap id="T5" orientation="portrait" position="anchor">
                    <label>Table 5. </label>
                    <caption>
                        <title>Model metrics of the final IC
                            <sub>50</sub> ensemble model.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Metric</th>
                                <th colspan="1" rowspan="1">Value</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">Normalized Root Mean Squared Error</td>
                                <td colspan="1" rowspan="1">0.1228</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Root Mean Squared Log Error</td>
                                <td colspan="1" rowspan="1">0.1336</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Normalized Mean Absolute Error</td>
                                <td colspan="1" rowspan="1">0.1097</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Mean Absolute Percentage Error</td>
                                <td colspan="1" rowspan="1">24.27</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Normalized Median Absolute Error</td>
                                <td colspan="1" rowspan="1">0.1097</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Root Mean Squared Error</td>
                                <td colspan="1" rowspan="1">0.3398</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Explained Variance</td>
                                <td colspan="1" rowspan="1">-1.755</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Normalized Root Mean Squared Log Error</td>
                                <td colspan="1" rowspan="1">0.1379</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Median Absolute Error</td>
                                <td colspan="1" rowspan="1">0.3035</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Mean Absolute Error</td>
                                <td colspan="1" rowspan="1">0.3035</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <table-wrap id="T6" orientation="portrait" position="anchor">
                    <label>Table 6. </label>
                    <caption>
                        <title>Top 10 training iterations of the IC
                            <sub>50</sub> model search, evaluated by Root Mean Squared Error.</title>
                        <p>Note that the top-performing model (VotingEnsemble) is the final IC
                            <sub>50</sub> model discussed in this paper.</p>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Iteration</th>
                                <th colspan="1" rowspan="1">Preprocessor</th>
                                <th colspan="1" rowspan="1">Algorithm</th>
                                <th colspan="1" rowspan="1">Normalized RMSE</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">498</td>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1">VotingEnsemble</td>
                                <td colspan="1" rowspan="1">0.12283293</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">370</td>
                                <td colspan="1" rowspan="1">SparseNormalizer</td>
                                <td colspan="1" rowspan="1">RandomForest</td>
                                <td colspan="1" rowspan="1">0.132003138</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">432</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LightGBM</td>
                                <td colspan="1" rowspan="1">0.133180215</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">240</td>
                                <td colspan="1" rowspan="1">SparseNormalizer</td>
                                <td colspan="1" rowspan="1">RandomForest</td>
                                <td colspan="1" rowspan="1">0.133779391</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">430</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">RandomForest</td>
                                <td colspan="1" rowspan="1">0.137084337</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">65</td>
                                <td colspan="1" rowspan="1">SparseNormalizer</td>
                                <td colspan="1" rowspan="1">RandomForest</td>
                                <td colspan="1" rowspan="1">0.13884791</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">56</td>
                                <td colspan="1" rowspan="1">SparseNormalizer</td>
                                <td colspan="1" rowspan="1">RandomForest</td>
                                <td colspan="1" rowspan="1">0.14417843</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">68</td>
                                <td colspan="1" rowspan="1">MaxAbsScaler</td>
                                <td colspan="1" rowspan="1">ExtremeRandomTrees</td>
                                <td colspan="1" rowspan="1">0.151925822</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">470</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">RandomForest</td>
                                <td colspan="1" rowspan="1">0.152262231</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">181</td>
                                <td colspan="1" rowspan="1">MinMaxScaler</td>
                                <td colspan="1" rowspan="1">LightGBM</td>
                                <td colspan="1" rowspan="1">0.15279075</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>Figure 1. </label>
                    <caption>
                        <title>Root Mean Squared Error (RMSE) by iteration of the IC
                            <sub>50</sub> model search.</title>
                        <p>Each orange dot is an iteration with the blue line representing the minimum RMSE up to that iteration.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/23732/7d58fc1d-4900-4dc3-8821-8f2e32357723_figure1.gif"/>
                </fig>
                <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                    <label>Figure 2. </label>
                    <caption>
                        <title>Model residuals of the final IC
                            <sub>50</sub> ensemble model.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/23732/7d58fc1d-4900-4dc3-8821-8f2e32357723_figure2.gif"/>
                </fig>
            </sec>
        </sec>
        <sec>
            <title>Prediction of resistance status</title>
            <p>The second task of this work was to create a machine learning model that can predict the parasite clearance rate (fast versus slow) of malaria isolates. When resistance rates change in a pathogen, it can be indicative of regulatory changes in the pathogen&#x2019;s genome. These changes can be exploited for the prevention of further resistance spread. Thus, a goal of this work is to understand genes important in the prediction of artemisinin resistance.</p>
            <sec sec-type="methods">
                <title>Methods</title>
                <p>An 
                    <italic toggle="yes">in vivo</italic> transcription data set from Mok 
                    <italic toggle="yes">et al.</italic>, (2015) Science
                    <sup>
                        <xref ref-type="bibr" rid="ref-18">18</xref>
                    </sup> was used to predict the parasite clearance rate of malaria parasite isolates based on 
                    <italic toggle="yes">in vitro</italic> transcriptional profiles (see 
                    <xref ref-type="table" rid="T8">Table 8</xref>).</p>
                <p>The training data consists of 1,043 isolates with 4,952 genes from the malaria parasite 
                    <italic toggle="yes">Plasmodium falciparum</italic>. For each malaria parasite isolate, transcription data was collected for various 
                    <italic toggle="yes">PF3D7</italic> genes. The form of the training dataset contains 1,043 rows and 4,957 columns, as shown in 
                    <xref ref-type="table" rid="T7">Table 7</xref>. The feature to predict is 
                    <italic toggle="yes">ClearanceRate</italic>.</p>
                <table-wrap id="T7" orientation="portrait" position="anchor">
                    <label>Table 7. </label>
                    <caption>
                        <title>Format of the clearance rate model training data.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Sample_Names</th>
                                <th colspan="1" rowspan="1">Country</th>
                                <th colspan="1" rowspan="1">Asexual_
                                    <break/>stage__hpi_</th>
                                <th colspan="1" rowspan="1">Kmeans_Grp</th>
                                <th colspan="1" rowspan="1">PF3D7_
                                    <break/>0100100</th>
                                <th colspan="1" rowspan="1">&#x2026;</th>
                                <th colspan="1" rowspan="1">PF3D7_1480100</th>
                                <th colspan="1" rowspan="1">ClearanceRate</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">GSM1427365</td>
                                <td colspan="1" rowspan="1">Bangladesh</td>
                                <td colspan="1" rowspan="1">20</td>
                                <td colspan="1" rowspan="1">B</td>
                                <td colspan="1" rowspan="1">0.226311</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-0.64171</td>
                                <td colspan="1" rowspan="1">Fast</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">GSM1427537</td>
                                <td colspan="1" rowspan="1">Cambodia</td>
                                <td colspan="1" rowspan="1">12</td>
                                <td colspan="1" rowspan="1">C</td>
                                <td colspan="1" rowspan="1">0.81096</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">-1.72825</td>
                                <td colspan="1" rowspan="1">Slow</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">GSM1428407</td>
                                <td colspan="1" rowspan="1">Vietnam</td>
                                <td colspan="1" rowspan="1">8</td>
                                <td colspan="1" rowspan="1">A</td>
                                <td colspan="1" rowspan="1">0.999095</td>
                                <td colspan="1" rowspan="1">&#x2026;</td>
                                <td colspan="1" rowspan="1">NaN</td>
                                <td colspan="1" rowspan="1">Fast</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <table-wrap id="T8" orientation="portrait" position="anchor">
                    <label>Table 8. </label>
                    <caption>
                        <title>Training dataset information from Mok 
                            <italic toggle="yes">et al.</italic>, 2015
                            <sup>
                                <xref ref-type="bibr" rid="ref-18">18</xref>
                            </sup>.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1"/>
                                <th colspan="1" rowspan="1">Training Set</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">Number
                                    <break/>of isolates</td>
                                <td colspan="1" rowspan="1">1043</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Isolate
                                    <break/>collection site</td>
                                <td colspan="1" rowspan="1">Southeast Asia</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Isolate
                                    <break/>collection years</td>
                                <td colspan="1" rowspan="1">2012&#x2013;2014</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Sample
                                    <break/>type</td>
                                <td colspan="1" rowspan="1">
                                    <italic toggle="yes">in vivo</italic>
</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Synchronized?</td>
                                <td colspan="1" rowspan="1">Not synchronized</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Number
                                    <break/>of samples per isolate</td>
                                <td colspan="1" rowspan="1">1</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Additional attributes</td>
                                <td colspan="1" rowspan="1">~18 hpi,
                                    <break/>Non-perturbed, No replicates</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
            <sec>
                <title>Data preparation</title>
                <p>The training data for this use case did not require the same pivoting transformations as in the last use case as each record describes a single isolate. Thus, only the vectorization of the data was necessary, which was performed using the Spark 
                    <monospace>VectorAssembler</monospace> and then converted into a Numpy-compatible array
                    <sup>
                        <xref ref-type="bibr" rid="ref-12">12</xref>
                    </sup>. Example code is shown below. Note that this vectorization only kept the numerical columns, which excludes the 
                    <monospace>Country</monospace>, 
                    <monospace>Kmeans_Grp</monospace>, and 
                    <monospace>Asexual_stage__hpi_</monospace> attributes as they are either absent or contain non-matching factors (i.e. different set of countries) in the testing data.</p>
                <p>
                    <preformat orientation="portrait" position="float" preformat-type="computer code" xml:space="preserve">
                        <styled-content style="color:#000000;font-size:15px"> 1 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Transform Data using VectorAssembler                                          </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 2 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">assemblerInputs </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">numericalColumns                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 3 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">assembler </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">VectorAssembler(inputCols </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">assemblerInputs, outputCol</styled-content>
                        <styled-content style="color:#666666;font-size:15px">=</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"features"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">) \ </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 4             </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">setHandleInvalid(</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">"keep"</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)                                            </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 5 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">stages </styled-content>
                        <styled-content style="color:#666666;font-size:15px">+= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[assembler]                                                            </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 6                                                                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 7 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">prepPipeline </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">Pipeline()</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">setStages(stages)                                      </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 8 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pipelineModel </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">prepPipeline</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">fit(data)                                           </styled-content>

                        <styled-content style="color:#000000;font-size:15px"> 9 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">vectordata </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pipelineModel</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">transform(data)                                       </styled-content>

                        <styled-content style="color:#000000;font-size:15px">10                                                                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">11 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Convert to Numpy Array                                                        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">12 </styled-content>
                        <styled-content style="color:#008000;font-size:15px">import </styled-content>
                        <styled-content style="color:#0000FF;font-size:15px">numpy </styled-content>
                        <styled-content style="color:#008000;font-size:15px">as </styled-content>
                        <styled-content style="color:#0000FF;font-size:15px">np                                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">13 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pddata </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">vectordata</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">toPandas()                                                   </styled-content>

                        <styled-content style="color:#000000;font-size:15px">14 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">seriesdata </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pddata[</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'features'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">]</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">apply(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">lambda </styled-content>
                        <styled-content style="color:#000000;font-size:15px">x : np</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">array(x</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">toArray())) \        </styled-content>

                        <styled-content style="color:#000000;font-size:15px">15              </styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">as_matrix()</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">reshape(</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,</styled-content>
                        <styled-content style="color:#666666;font-size:15px">1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)                                          </styled-content>

                        <styled-content style="color:#000000;font-size:15px">16 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">X_train </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">np</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">apply_along_axis(</styled-content>
                        <styled-content style="color:#008000;font-size:15px">lambda </styled-content>
                        <styled-content style="color:#000000;font-size:15px">x : x[</styled-content>
                        <styled-content style="color:#666666;font-size:15px">0</styled-content>
                        <styled-content style="color:#000000;font-size:15px">], </styled-content>
                        <styled-content style="color:#666666;font-size:15px">1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, seriesdata)                    </styled-content>

                        <styled-content style="color:#000000;font-size:15px">17 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">y_train </styled-content>
                        <styled-content style="color:#666666;font-size:15px">= </styled-content>
                        <styled-content style="color:#000000;font-size:15px">pddata[</styled-content>
                        <styled-content style="color:#BA2121;font-size:15px">'label'</styled-content>
                        <styled-content style="color:#000000;font-size:15px">]</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">values</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">reshape(</styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,</styled-content>
                        <styled-content style="color:#666666;font-size:15px">1</styled-content>
                        <styled-content style="color:#000000;font-size:15px">)</styled-content>
                        <styled-content style="color:#666666;font-size:15px">.</styled-content>
                        <styled-content style="color:#000000;font-size:15px">ravel()                           </styled-content>

                        <styled-content style="color:#000000;font-size:15px">18                                                                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">19 </styled-content>
                        <styled-content style="color:#408080;font-size:15px">## Example Output (X_train) After Vectorization                                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">20 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">array([[ </styled-content>
                        <styled-content style="color:#666666;font-size:15px">0.2263112 </styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.39682897</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.80458125</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">nan,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">21         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.30952803</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.64170958</styled-content>
                        <styled-content style="color:#000000;font-size:15px">]</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">22        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[ </styled-content>
                        <styled-content style="color:#666666;font-size:15px">0.55442743</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,  </styled-content>
                        <styled-content style="color:#666666;font-size:15px">0.54200115</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.56157279</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">1.83083869</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">23          </styled-content>
                        <styled-content style="color:#666666;font-size:15px">0.21021662</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.06553341</styled-content>
                        <styled-content style="color:#000000;font-size:15px">],                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">24        </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                                                                      </styled-content>

                        <styled-content style="color:#000000;font-size:15px">25        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[ </styled-content>
                        <styled-content style="color:#666666;font-size:15px">1.24446867</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.09076431</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.62156926</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">3.18060844</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">26         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-0.43056353</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,         </styled-content>
                        <styled-content style="color:#000000;font-size:15px">nan],                                               </styled-content>

                        <styled-content style="color:#000000;font-size:15px">27        </styled-content>
                        <styled-content style="color:#000000;font-size:15px">[ </styled-content>
                        <styled-content style="color:#666666;font-size:15px">0.99909549</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.47208829</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.91898139</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">...</styled-content>
                        <styled-content style="color:#000000;font-size:15px">, </styled-content>
                        <styled-content style="color:#666666;font-size:15px">2.59463935</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,                  </styled-content>

                        <styled-content style="color:#000000;font-size:15px">28         </styled-content>
                        <styled-content style="color:#666666;font-size:15px">-1.21233458</styled-content>
                        <styled-content style="color:#000000;font-size:15px">,         </styled-content>
                        <styled-content style="color:#000000;font-size:15px">nan]])                                              </styled-content>
                    </preformat>
                </p>
            </sec>
            <sec>
                <title>Machine learning</title>
                <p>Once the 98 individual models were trained, two ensemble models (voting ensemble and stack ensemble) were then created and tested as before. Model search parameters are shown in 
                    <xref ref-type="table" rid="T9">Table 9</xref>.</p>
                <table-wrap id="T9" orientation="portrait" position="anchor">
                    <label>Table 9. </label>
                    <caption>
                        <title>Model search parameter settings for the clearance rate model search.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Parameter</th>
                                <th colspan="1" rowspan="1">Value</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">Task</td>
                                <td colspan="1" rowspan="1">Classification</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Number of iterations</td>
                                <td colspan="1" rowspan="1">100</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Iteration timeout (minutes)</td>
                                <td colspan="1" rowspan="1">20</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Max cores per iteration</td>
                                <td colspan="1" rowspan="1">14</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Primary metric</td>
                                <td colspan="1" rowspan="1">weighted area under the receiver
                                    <break/>operating characteristic curve (AUC)</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">Preprocess data?</td>
                                <td colspan="1" rowspan="1">True</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">k-Fold cross-validations</td>
                                <td colspan="1" rowspan="1">10 folds</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
            <sec sec-type="results">
                <title>Results</title>
                <p>The voting ensemble model (using soft voting) was selected as the best model, having the highest area under the receiver operating characteristic curve (AUC), as shown in 
                    <xref ref-type="table" rid="T11">Table 11</xref>. The top 10 of the 100 models trained are reported in 
                    <xref ref-type="table" rid="T10">Table 10</xref>. Having a weighted AUC of 0.87 and a weighted F1 score of 0.80, this model is expected to accurately predict isolate clearance rates. A confusion matrix of the predicted results versus actuals is shown in 
                    <xref ref-type="table" rid="T12">Table 12</xref>. See 
                    <xref ref-type="fig" rid="f3">Figure 3</xref> for a visualization of the experiment runs and see 
                    <xref ref-type="fig" rid="f4">Figure 4</xref> and 
                    <xref ref-type="fig" rid="f5">Figure 5</xref> for the ROC and Precision-Recall curves on the best model.</p>
                <table-wrap id="T10" orientation="portrait" position="anchor">
                    <label>Table 10. </label>
                    <caption>
                        <title>Top 10 training iterations of the clearance rate model search.</title>
                        <p>Note that the top performing model (VotingEnsemble) is the clearance rate model discussed in this paper.</p>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Iteration</th>
                                <th colspan="1" rowspan="1">Preprocessor</th>
                                <th colspan="1" rowspan="1">Algorithm</th>
                                <th colspan="1" rowspan="1">Weighted AUC</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">98</td>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1">VotingEnsemble</td>
                                <td colspan="1" rowspan="1">0.870471056</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">99</td>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1">StackEnsemble</td>
                                <td colspan="1" rowspan="1">0.865215516</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">65</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.86062304</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">33</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.859881677</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">97</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.858791006</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">44</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.856105491</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">73</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.855502817</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">17</td>
                                <td colspan="1" rowspan="1">RobustScaler</td>
                                <td colspan="1" rowspan="1">SVM</td>
                                <td colspan="1" rowspan="1">0.855452622</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">43</td>
                                <td colspan="1" rowspan="1">StandardScalerWrapper</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.855368394</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">61</td>
                                <td colspan="1" rowspan="1">RobustScaler</td>
                                <td colspan="1" rowspan="1">LogisticRegression</td>
                                <td colspan="1" rowspan="1">0.854357599</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <table-wrap id="T11" orientation="portrait" position="anchor">
                    <label>Table 11. </label>
                    <caption>
                        <title>Model metrics of the final clearance rate ensemble model.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Metric</th>
                                <th colspan="1" rowspan="1">Value</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">f1_score_macro</td>
                                <td colspan="1" rowspan="1">0.6084</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">AUC_micro</td>
                                <td colspan="1" rowspan="1">0.9445</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">AUC_macro</td>
                                <td colspan="1" rowspan="1">0.8475</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">recall_score_micro</td>
                                <td colspan="1" rowspan="1">0.8101</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">recall_score_weighted</td>
                                <td colspan="1" rowspan="1">0.8101</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">average_precision_score_weighted</td>
                                <td colspan="1" rowspan="1">0.8707</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">weighted_accuracy</td>
                                <td colspan="1" rowspan="1">0.8585</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">precision_score_macro</td>
                                <td colspan="1" rowspan="1">0.6217</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">precision_score_micro</td>
                                <td colspan="1" rowspan="1">0.8101</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">balanced_accuracy</td>
                                <td colspan="1" rowspan="1">0.6027</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">log_loss</td>
                                <td colspan="1" rowspan="1">0.4455</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">recall_score_macro</td>
                                <td colspan="1" rowspan="1">0.6027</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">precision_score_weighted</td>
                                <td colspan="1" rowspan="1">0.8</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">AUC_weighted</td>
                                <td colspan="1" rowspan="1">0.8705</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">average_precision_score_micro</td>
                                <td colspan="1" rowspan="1">0.8911</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">f1_score_weighted</td>
                                <td colspan="1" rowspan="1">0.8019</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">f1_score_micro</td>
                                <td colspan="1" rowspan="1">0.8101</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">norm_macro_recall</td>
                                <td colspan="1" rowspan="1">0.354</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">average_precision_score_macro</td>
                                <td colspan="1" rowspan="1">0.7344</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">accuracy</td>
                                <td colspan="1" rowspan="1">0.8101</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <table-wrap id="T12" orientation="portrait" position="anchor">
                    <label>Table 12. </label>
                    <caption>
                        <title>Confusion matrix of clearance rate predictions versus actual.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="right" colspan="2" rowspan="2" valign="bottom">Class</th>
                                <th align="center" colspan="3" rowspan="1" valign="top">Prediction</th>
                            </tr>
                            <tr>
                                <th colspan="1" rowspan="1">Fast (ID: 0)</th>
                                <th colspan="1" rowspan="1">Slow (ID: 1)</th>
                                <th colspan="1" rowspan="1">Null (ID: 2)</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="center" colspan="1" rowspan="3" valign="middle">Actual</td>
                                <td colspan="1" rowspan="1">
                                    <bold>Fast (ID: 0)</bold>
                                </td>
                                <td colspan="1" rowspan="1">661</td>
                                <td colspan="1" rowspan="1">74</td>
                                <td colspan="1" rowspan="1">0</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">
                                    <bold>Slow (ID: 1)</bold>
                                </td>
                                <td colspan="1" rowspan="1">115</td>
                                <td colspan="1" rowspan="1">184</td>
                                <td colspan="1" rowspan="1">0</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">
                                    <bold>Null (ID: 2)</bold>
                                </td>
                                <td colspan="1" rowspan="1">6</td>
                                <td colspan="1" rowspan="1">3</td>
                                <td colspan="1" rowspan="1">0</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                    <label>Figure 3. </label>
                    <caption>
                        <title>Area under the receiver operating characteristic curve (AUC) by iteration of the clearance rate model.</title>
                        <p>Each orange dot is an iteration with the blue line representing the maximum AUC up to that iteration.</p>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/23732/7d58fc1d-4900-4dc3-8821-8f2e32357723_figure3.gif"/>
                </fig>
                <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                    <label>Figure 4. </label>
                    <caption>
                        <title>Receiver operating characteristic curve of the clearance rate model.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/23732/7d58fc1d-4900-4dc3-8821-8f2e32357723_figure4.gif"/>
                </fig>
                <fig fig-type="figure" id="f5" orientation="portrait" position="float">
                    <label>Figure 5. </label>
                    <caption>
                        <title>Precision-Recall curve of the clearance rate model.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/23732/7d58fc1d-4900-4dc3-8821-8f2e32357723_figure5.gif"/>
                </fig>
                <p><bold>Feature importance.</bold> Feature importances were calculated using mimic-based model explanation of the ensemble model
                    <sup>
                        <xref ref-type="bibr" rid="ref-19">19</xref>
                    </sup>. The mimic explainer works by training global surrogate models to mimic black box models. The surrogate model is an interpretable model, trained to approximate the predictions of a black box model as accurately as possible. See 
                    <xref ref-type="fig" rid="f6">Figure 6</xref> and 
                    <xref ref-type="table" rid="T13">Table 13</xref>.</p>
                <fig fig-type="figure" id="f6" orientation="portrait" position="float">
                    <label>Figure 6. </label>
                    <caption>
                        <title>Derived feature importances using the black box mimic model explanation of the clearance rate model.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/23732/7d58fc1d-4900-4dc3-8821-8f2e32357723_figure6.gif"/>
                </fig>
                <table-wrap id="T13" orientation="portrait" position="anchor">
                    <label>Table 13. </label>
                    <caption>
                        <title>Top 10 PF3D7 genes (features) in predicting clearance rate.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Rank</th>
                                <th colspan="1" rowspan="1">PF3D7 Gene</th>
                                <th colspan="1" rowspan="1">"Slow" Importance</th>
                                <th colspan="1" rowspan="1">"Fast" Importance</th>
                                <th colspan="1" rowspan="1">"NULL" Importance</th>
                                <th colspan="1" rowspan="1">Overall Importance</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">1</td>
                                <td colspan="1" rowspan="1">PF3D7_1245300</td>
                                <td colspan="1" rowspan="1">0.292</td>
                                <td colspan="1" rowspan="1">0.118</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.410</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">2</td>
                                <td colspan="1" rowspan="1">PF3D7_1107700</td>
                                <td colspan="1" rowspan="1">0.020</td>
                                <td colspan="1" rowspan="1">0.274</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.294</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">3</td>
                                <td colspan="1" rowspan="1">PF3D7_1328400</td>
                                <td colspan="1" rowspan="1">0.154</td>
                                <td colspan="1" rowspan="1">0.123</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.277</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">4</td>
                                <td colspan="1" rowspan="1">PF3D7_1372000</td>
                                <td colspan="1" rowspan="1">0.172</td>
                                <td colspan="1" rowspan="1">0.095</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.267</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">5</td>
                                <td colspan="1" rowspan="1">PF3D7_1115600</td>
                                <td colspan="1" rowspan="1">0.083</td>
                                <td colspan="1" rowspan="1">0.179</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.262</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">6</td>
                                <td colspan="1" rowspan="1">PF3D7_0608100</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.243</td>
                                <td colspan="1" rowspan="1">0.243</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">7</td>
                                <td colspan="1" rowspan="1">PF3D7_0523000</td>
                                <td colspan="1" rowspan="1">0.154</td>
                                <td colspan="1" rowspan="1">0.087</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.241</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">8</td>
                                <td colspan="1" rowspan="1">PF3D7_1205300</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.002</td>
                                <td colspan="1" rowspan="1">0.197</td>
                                <td colspan="1" rowspan="1">0.199</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">9</td>
                                <td colspan="1" rowspan="1">PF3D7_1129100</td>
                                <td colspan="1" rowspan="1">0.008</td>
                                <td colspan="1" rowspan="1">0.191</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.199</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">10</td>
                                <td colspan="1" rowspan="1">PF3D7_0935400</td>
                                <td colspan="1" rowspan="1">0.117</td>
                                <td colspan="1" rowspan="1">0.058</td>
                                <td colspan="1" rowspan="1">0.000</td>
                                <td colspan="1" rowspan="1">0.175</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <table-wrap id="T14" orientation="portrait" position="anchor">
                    <label>Table 14. </label>
                    <caption>
                        <title>Scaling function information for machine learning model search
                            <sup>
                                <xref ref-type="bibr" rid="ref-20">20</xref>
                            </sup>.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th colspan="1" rowspan="1">Scaling and Normalization</th>
                                <th colspan="1" rowspan="1">Description</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td colspan="1" rowspan="1">StandardScaleWrapper</td>
                                <td colspan="1" rowspan="1">Standardize features by removing the mean and scaling to unit variance</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">MinMaxScalar</td>
                                <td colspan="1" rowspan="1">Transforms features by scaling each feature by that column&#x2019;s minimum and maximum</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">MaxAbsScaler</td>
                                <td colspan="1" rowspan="1">Scale each feature by its maximum absolute value</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">RobustScalar</td>
                                <td colspan="1" rowspan="1">Scales features by their quantile range</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">PCA</td>
                                <td colspan="1" rowspan="1">Linear dimensionality reduction using singular value decomposition of the data to
                                    <break/>project it to a lower dimensional space</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">TruncatedSVDWrapper</td>
                                <td colspan="1" rowspan="1">This transformer performs linear dimensionality reduction by means of truncated
                                    <break/>singular value decomposition.
                                    <break/>Contrary to PCA, this estimator does not center the data before computing the
                                    <break/>singular value decomposition. This means it can efficiently work with sparse matrices.</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1">SparseNormalizer</td>
                                <td colspan="1" rowspan="1">Each sample (each record of the data) with at least one non-zero component is re-
                                    <break/>scaled independently of other samples so that its norm (L1 or L2) equals one</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
        </sec>
        <sec sec-type="discussion">
            <title>Discussion</title>
            <p>By using distributed processing of the data preparation, we can successfully shape and manage large malaria datasets. We efficiently transformed a matrix of over 40,000 genetic attributes for the IC
                <sub>50</sub> use case and over 4,000 genetic attributes for the resistance rate use case. This was completed with scalable vectorization of the training data, which allowed for many machine learning models to be generated. By tracking the individual performance results of each machine learning model, we can determine which model is most useful. In addition, ensemble modeling of the various singular models proved effective for both tasks in this work.</p>
            <p>The resulting model performance of both the IC
                <sub>50</sub> model and the clearance rate model shows relatively adequate fitting of the data for their respective predictions. While additional model tuning may provide a lift in model performance, we have demonstrated the utility of ensemble modeling in these predictive use cases in malaria.</p>
            <p>In addition, this exercise helps to quantify the importance of genetic features, spotlighting potential genes that are significant in artemisinin resistance. The utility of these models will help in directing development of alternative treatment or coordination of combination therapies in resistant infections.</p>
        </sec>
        <sec>
            <title>Preprint</title>
            <p>An earlier version of this article can be found on bioRxiv (doi:
                <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1101/856922">10.1101/856922</ext-link>).</p>
        </sec>
        <sec>
            <title>Data availability</title>
            <sec>
                <title>Underlying data</title>
                <p>The challenge datasets are available from Synapse (
                    <ext-link ext-link-type="uri" xlink:href="https://www.synapse.org/">https://www.synapse.org/</ext-link>; Synapse ID: 
                    <ext-link ext-link-type="uri" xlink:href="https://www.synapse.org/#!Synapse:syn18089524">syn18089524</ext-link>). Access to the data requires registration and agreement to the conditions for use at: 
                    <ext-link ext-link-type="uri" xlink:href="https://www.synapse.org/#!Synapse:syn18089524">https://www.synapse.org/#!Synapse:syn18089524</ext-link>.</p>
                <p>Challenge documentation, including the detailed description of the Challenge design, data description, and overall results can be found at: 
                    <ext-link ext-link-type="uri" xlink:href="https://www.synapse.org/#!Synapse:syn16924919/wiki/583955">https://www.synapse.org/#!Synapse:syn16924919/wiki/583955</ext-link>.</p>
                <p>Whole genome expression profiling of artemisinin-resistant Plasmodium falciparum field isolates, Accession number GSE59099: 
                    <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59099">https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE59099</ext-link>.</p>
                <p>Zenodo: colbyford/malaria_DREAM2019: Ensemble Machine Learning Modeling for the Prediction of Artemisinin Resistance in Malaria - Initial Code Release for Research Publication (F1000). 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3590459">https://doi.org/10.5281/zenodo.3590459</ext-link>
                    <sup>
                        <xref ref-type="bibr" rid="ref-21">21</xref>
                    </sup>.</p>
                <p>This project contains the following underlying data:</p>
                <list list-type="bullet">
                    <list-item>
                        <p>/SubChallenge1/data/sc1_X_train.pkl (Pickle file of the SubChallenge 1 independent variables, pivoted by 
                            <monospace>Timepoint</monospace>, 
                            <monospace>Treatment</monospace>, and 
                            <monospace>BioRep</monospace>.)</p>
                    </list-item>
                    <list-item>
                        <p>/SubChallenge1/data/sc1_y_train.pkl (Pickle file of the SubChallenge 1 dependent variable, 
                            <monospace>DHA_IC50</monospace>.)</p>
                    </list-item>
                    <list-item>
                        <p>/SubChallenge2/data/sc2_X_train.pkl (Pickle file of the SubChallenge 2 independent variables.)</p>
                    </list-item>
                    <list-item>
                        <p>/SubChallenge2/data/sc2_y_train.pkl (Pickle file of the SubChallenge 2 dependent variable, 
                            <monospace>ClearanceRate</monospace>.)</p>
                    </list-item>
                </list>
                <p>Data are available under the terms of the 
                    <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/publicdomain/zero/1.0/">Creative Commons Zero "No rights reserved" data waiver</ext-link> (CC0 1.0 Public domain dedication).</p>
            </sec>
        </sec>
        <sec>
            <title>Software availability</title>
            <list list-type="bullet">
                <list-item>
                    <p>Source code available from: 
                        <ext-link ext-link-type="uri" xlink:href="https://github.com/colbyford/malaria_DREAM2019">https://github.com/colbyford/malaria_DREAM2019</ext-link>
                    </p>
                </list-item>
                <list-item>
                    <p>Archived source code at time of publication: 
                        <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3590459">https://doi.org/10.5281/zenodo.3590459</ext-link>
                        <sup>
                            <xref ref-type="bibr" rid="ref-21">21</xref>
                        </sup>
                    </p>
                </list-item>
                <list-item>
                    <p>License: GPL-3.0</p>
                </list-item>
            </list>
        </sec>
    </body>
    <back>
        <ref-list>
            <ref id="ref-1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <article-title>Fact sheet about malaria</article-title>. World Health Organization. <year>2019</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://www.who.int/news-room/fact-sheets/detail/malaria">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <article-title>Guidelines for the treatment of malaria</article-title>. World Health Organization. <year>2015</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://apps.who.int/iris/bitstream/handle/10665/162441/9789241549127_eng.pdf;jsessionid=964EB4AA02A888EFC7A193503F9B795F?sequence=1">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Dondorp</surname>
                            <given-names>AM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nosten</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Yi</surname>
                            <given-names>P</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Artemisinin resistance in 
                        <italic toggle="yes">Plasmodium falciparum</italic> malaria.</article-title>
                    <source>

                        <italic toggle="yes">N Engl J Med.</italic>
</source>
                    <year>2009</year>;<volume>361</volume>(<issue>5</issue>):<fpage>455</fpage>&#x2013;<lpage>467</lpage>.
                    <pub-id pub-id-type="pmid">19641202</pub-id>
                    <pub-id pub-id-type="doi">10.1056/NEJMoa0808859</pub-id>
                    <pub-id pub-id-type="pmcid">3495232</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ouattara</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kone</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Adams</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Polymorphisms in the K13-propeller gene in artemisinin-susceptible 
                        <italic toggle="yes">Plasmodium falciparum</italic> parasites from Bougoula-Hameau and Bandiagara, Mali.</article-title>
                    <source>

                        <italic toggle="yes">Am J Trop Med Hyg.</italic>
</source>
                    <year>2015</year>;<volume>92</volume>(<issue>6</issue>):<fpage>1202</fpage>&#x2013;<lpage>1206</lpage>.
                    <pub-id pub-id-type="pmid">25918205</pub-id>
                    <pub-id pub-id-type="doi">10.4269/ajtmh.14-0605</pub-id>
                    <pub-id pub-id-type="pmcid">4458826</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Saralamba</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pan-Ngum</surname>
                            <given-names>W</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Maude</surname>
                            <given-names>RJ</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Intrahost modeling of artemisinin resistance in 
                        <italic toggle="yes">Plasmodium falciparum</italic>.</article-title>
                    <source>

                        <italic toggle="yes">Proc Natl Acad Sci U S A.</italic>
</source>
                    <year>2011</year>;<volume>108</volume>(<issue>1</issue>):<fpage>397</fpage>&#x2013;<lpage>402</lpage>.
                    <pub-id pub-id-type="pmid">21173254</pub-id>
                    <pub-id pub-id-type="doi">10.1073/pnas.1006113108</pub-id>
                    <pub-id pub-id-type="pmcid">3017155</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>White</surname>
                            <given-names>NJ</given-names>
                        </name>
</person-group>:
                    <article-title>The parasite clearance curve</article-title>.
                    <source>

                        <italic toggle="yes">Malar J.</italic>
</source>
                    <year>2011</year>;<volume>10</volume>:<fpage>278</fpage>.
                    <pub-id pub-id-type="pmid">21939506</pub-id>
                    <pub-id pub-id-type="doi">10.1186/1475-2875-10-278</pub-id>
                    <pub-id pub-id-type="pmcid">3195204</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ashley</surname>
                            <given-names>EA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Dhorda</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Fairhurst</surname>
                            <given-names>RM</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Spread of artemisinin resistance in 
                        <italic toggle="yes">Plasmodium falciparum</italic> malaria.</article-title>
                    <source>

                        <italic toggle="yes">N Engl J Med.</italic>
</source>
                    <year>2014</year>;<volume>371</volume>(<issue>5</issue>):<fpage>411</fpage>&#x2013;<lpage>423</lpage>.
                    <pub-id pub-id-type="pmid">25075834</pub-id>
                    <pub-id pub-id-type="doi">10.1056/NEJMoa1314981</pub-id>
                    <pub-id pub-id-type="pmcid">4143591</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Davis</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Button-Simons</surname>
                            <given-names>K</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bensellak</surname>
                            <given-names>T</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Leveraging crowdsourcing to accelerate global health solutions.</article-title>
                    <source>

                        <italic toggle="yes">Nat Biotechnol.</italic>
</source>
                    <year>2019</year>;<volume>37</volume>(<issue>8</issue>):<fpage>848</fpage>&#x2013;<lpage>850</lpage>.
                    <pub-id pub-id-type="pmid">31324891</pub-id>
                    <pub-id pub-id-type="doi">10.1038/s41587-019-0180-5</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ghouila</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Siwo</surname>
                            <given-names>GH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Entfellner</surname>
                            <given-names>JD</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Hackathons as a means of accelerating scientific discoveries and knowledge transfer.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2018</year>;<volume>28</volume>(<issue>5</issue>):<fpage>759</fpage>&#x2013;<lpage>765</lpage>.
                    <pub-id pub-id-type="pmid">29650552</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.228460.117</pub-id>
                    <pub-id pub-id-type="pmcid">5932615</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Zaharia</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Xin</surname>
                            <given-names>RS</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wendell</surname>
                            <given-names>P</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Apache Spark: A unified engine for big data processing.</article-title>
                    <source>

                        <italic toggle="yes">Commun ACM.</italic>
</source>
                    <year>2016</year>;<volume>59</volume>(<issue>11</issue>):<fpage>56</fpage>&#x2013;<lpage>65</lpage>.
                    <pub-id pub-id-type="doi">10.1145/2934664</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Turnbull</surname>
                            <given-names>LB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Siwo</surname>
                            <given-names>GH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Button-Simons</surname>
                            <given-names>KA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Simultaneous genome-wide gene expression and transcript isoform profiling in the human malaria parasite.</article-title>
                    <source>

                        <italic toggle="yes">PLoS One.</italic>
</source>
                    <year>2017</year>;<volume>12</volume>(<issue>11</issue>):<fpage>e0187595</fpage>.
                    <pub-id pub-id-type="pmid">29112986</pub-id>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0187595</pub-id>
                    <pub-id pub-id-type="pmcid">5675406</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>van der Walt</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Colbert</surname>
                            <given-names>SC</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Varoquaux</surname>
                            <given-names>G</given-names>
                        </name>
</person-group>:
                    <article-title>The NumPy array: A structure for efficient numerical computation.</article-title>
                    <source>

                        <italic toggle="yes">Comput Sci Eng.</italic>
</source>
                    <year>2011</year>;<volume>13</volume>(<issue>2</issue>):<fpage>22</fpage>&#x2013;<lpage>30</lpage>.
                    <pub-id pub-id-type="doi">10.1109/MCSE.2011.37</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <article-title>Microsoft Azure Machine Learning Service</article-title>. <year>2019</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://azure.microsoft.com/en-us/services/machine-learning/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <collab>Microsoft</collab>:
                    <article-title>Azure Machine Learning AutoML Core version 1.0.79</article-title>. <year>2019</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/azureml-automl-core/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Pedregosa</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Varoquaux</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gramfort</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Scikit-learn: Machine learning in Python.</article-title>
                    <source>

                        <italic toggle="yes">J Mach Learn Res.</italic>
</source>
                    <year>2011</year>;<volume>12</volume>:<fpage>2825</fpage>&#x2013;<lpage>2830</lpage>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.jmlr.org/papers/volume12/pedregosa11a/pedregosa11a.pdf">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ke</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Meng</surname>
                            <given-names>Q</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Finley</surname>
                            <given-names>T</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>LightGBM: A highly efficient gradient boosting decision tree</article-title>. In: I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors,
                    <italic toggle="yes">Advances in Neural Information Processing Systems</italic>. Curran Associates, Inc., <year>2017</year>;<volume>30</volume>:<fpage>3146</fpage>&#x2013;<lpage>3154</lpage>.
                    <ext-link ext-link-type="uri" xlink:href="http://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Caruana</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Niculescu-Mizil</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Crew</surname>
                            <given-names>G</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Ensemble selection from libraries of models</article-title>. In:
                    <italic toggle="yes">Proceedings of the Twenty-first International Conference on Machine Learning</italic>, ICML &#x2019;04, New York, NY, USA, <year>2004</year>;<fpage>18</fpage>.
                    <pub-id pub-id-type="doi">10.1145/1015330.1015432</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Mok</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ashley</surname>
                            <given-names>EA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ferreira</surname>
                            <given-names>PE</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Drug resistance. Population transcriptomics of human malaria parasites reveals the mechanism of artemisinin resistance.</article-title>
                    <source>

                        <italic toggle="yes">Science.</italic>
</source>
                    <year>2015</year>;<volume>347</volume>(<issue>6220</issue>):<fpage>431</fpage>&#x2013;<lpage>435</lpage>.
                    <pub-id pub-id-type="pmid">25502316</pub-id>
                    <pub-id pub-id-type="doi">10.1126/science.1260403</pub-id>
                    <pub-id pub-id-type="pmcid">5642863</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lundberg</surname>
                            <given-names>SM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>SI</given-names>
                        </name>
</person-group>:
                    <article-title>A unified approach to interpreting model predictions</article-title>. In: I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors,
                    <italic toggle="yes">Advances in Neural Information Processing Systems</italic>. Curran Associates, Inc., <year>2017</year>;<volume>30</volume>:<fpage>4765</fpage>&#x2013;<lpage>4774</lpage>.
                    <ext-link ext-link-type="uri" xlink:href="http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <collab>Microsoft</collab>:
                    <article-title>Microsoft Azure Machine Learning - AutoML Preprocessing</article-title>. <year>2019</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://docs.microsoft.com/en-us/azure/machine-learning/concept-automated-ml#automatic-preprocessing-standard">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ford</surname>
                            <given-names>C</given-names>
                        </name>
</person-group>:
                    <article-title>colbyford/malaria_DREAM2019: Ensemble Machine Learning Modeling for the Prediction of Artemisinin Resistance in Malaria - Initial Code Release for Research Publication (F1000)</article-title>. <year>2019</year>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.doi.org/10.5281/zenodo.3590459">http://www.doi.org/10.5281/zenodo.3590459</ext-link>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
</article>
