<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.51873.2</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Research Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>A pan-genome method to determine core regions of the 
                    <italic>Bacillus subtilis</italic> and 
                    <italic>Escherichia coli</italic> genomes</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 2; peer review: 2 approved]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Sutton</surname>
                        <given-names>Granger</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-7498-8048</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Fogel</surname>
                        <given-names>Gary B.</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Abramson</surname>
                        <given-names>Bradley</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Brinkac</surname>
                        <given-names>Lauren</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Michael</surname>
                        <given-names>Todd</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-6272-2875</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Liu</surname>
                        <given-names>Enoch S.</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Thomas</surname>
                        <given-names>Sterling</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>J. Craig Venter Institute, Rockville, Maryland, 20850, USA</aff>
                <aff id="a2">
                    <label>2</label>Natural Selection, Inc., San Diego, CA, 92121, USA</aff>
                <aff id="a3">
                    <label>3</label>Noblis, Inc., Reston, VA, 20191, USA</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:GSutton@jcvi.org">GSutton@jcvi.org</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>2</day>
                <month>9</month>
                <year>2021</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2021</year>
            </pub-date>
            <volume>10</volume>
            <elocation-id>286</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>17</day>
                    <month>8</month>
                    <year>2021</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2021 Sutton G et al.</copyright-statement>
                <copyright-year>2021</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/10-286/pdf"/>
            <abstract>
                <p>

                    <bold>Background:</bold> Synthetic engineering of bacteria to produce industrial products is a burgeoning field of research and application. In order to optimize genome design, designers need to understand which genes are essential, which are optimal for growth, and locations in the genome that will be tolerated by the organism when inserting engineered cassettes.</p>
                <p> 
                    <bold>Methods:</bold> We present a pan-genome based method for the identification of core regions in a genome that are strongly conserved at the species level.</p>
                <p> 
                    <bold>Results:</bold> We show that the core regions determined by our method contain all or almost all essential genes. This demonstrates the accuracy of our method as essential genes should be core genes. We show that we outperform previous methods by this measure. We also explain why there are exceptions to this rule for our method.</p>
                <p> 
                    <bold>Conclusions:</bold> We assert that synthetic engineers should avoid deleting or inserting into these core regions unless they understand and are manipulating the function of the genes in that region. Similarly, if the designer wishes to streamline the genome, non-core regions and in particular low penetrance genes would be good targets for deletion. Care should be taken to remove entire cassettes with similar penetrance of the genes within cassettes as they may harbor toxin/antitoxin genes which need to be removed in tandem. The bioinformatic approach introduced here saves considerable time and effort relative to knockout studies on single isolates of a given species and captures a broad understanding of the conservation of genes that are core to a species.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>pan-genome</kwd>
                <kwd>pan-genome graph</kwd>
                <kwd>core genes</kwd>
                <kwd>essential genes</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1">
                    <funding-source>IARPA</funding-source>
                    <award-id>N6600118C-4506</award-id>
                </award-group>
                <funding-statement>This research is based upon work supported [in part] by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA) under Finding Engineering Linked Indicators (FELIX) program contract #N6600118C-4506. The principal investigator for the award is Sterling Thomas. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of ODNI, IARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein.</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
        <notes>
            <sec sec-type="version-changes">
                <label>Revised</label>
                <title>Amendments from Version 1</title>
                <p>Version 2 attempts to address all comments from the reviewers for version 1. The changes include: a new introductory paragraph explaining the overall thrust of the article before diving into a detailed background; a new paragraph better explaining the advantages of a pan-genome graph (PGG) over a simple pan-genome; a new table showing how complete genomes from RefSeq were filtered to arrive at the final set; a new supplementary table showing the 34 genes in 
                    <italic>MiniBacillus</italic> but not present in all 108 
                    <italic>B. subtilis</italic> genomes; and new concluding paragraphs for the results and discussion section.</p>
            </sec>
        </notes>
    </front>
    <body>
        <sec id="sec1" sec-type="intro">
            <title>Introduction</title>
            <p>The primary focus of this paper is a new pan-genome method to determine core regions of a genome shared by all or almost all strains of the same species or subspecies. We evaluate the performance of this approach relative to other methods using experimentally determined essential genes under the hypothesis that all or at least most essential genes should be core across a species. This hypothesis implies that methods for determining core regions/genes are likely to be more accurate if they identify more essential genes as core genes. The paper reveals the potential usefulness of pan-genome analysis for synthetic engineering and genome analysis more broadly through the analysis of core regions in 
                <italic toggle="yes">Bacillus subtilis</italic> and 
                <italic toggle="yes">Escherichia coli</italic>.</p>
            <p>Over the last decade, considerable interest has been directed towards the determination of a minimal bacterial cell, making use of a short genome consisting of only essential genes for viability. The 
                <italic toggle="yes">Mycoplasma mycoides</italic> JCVI-syn3.0 is a case example of synthetic engineering to design and build a genome that contains a streamlined gene set essential for cell viability and cell replication.
                <sup>
                    <xref ref-type="bibr" rid="ref1">1</xref>
                </sup> Multiple genome reduction projects have been undertaken.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref4">4</xref>
                </sup> More targeted deletions of genomic loci have been performed to characterize essential genes, but generally targeted approaches are too laborious to perform on a whole genome.
                <sup>
                    <xref ref-type="bibr" rid="ref5">5</xref>,
                    <xref ref-type="bibr" rid="ref6">6</xref>
                </sup> However, the identification of &#x201c;essential&#x201d; genes - those genes that are critical for cell viability and replication - takes considerable time and effort in a laboratory setting and is usually determined with respect to one reference genome under one set of specific growth conditions. For instance, Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> and Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> experimentally and computationally determined the minimal gene set in the Gram-positive bacterium 
                <italic toggle="yes">Bacillus subtilis.</italic> Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> used a strictly experimental approach but Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> used three forms of evidence for their essential genes as given in their Table 4: RB: previous experimental work in 
                <italic toggle="yes">B. subtilis</italic>; RO: previous experimental work in other bacteria; and TW: their experimental work. The RO evidence used is a mix of experimental and computational as the determination of orthologs is computational and essentiality of those orthologs was not experimentally confirmed: &#x201c;Through predictions we propose that 79 other genes are essential, whereas 106 are not (Table 3)&#x201d;.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> Of the ~4,100 genes of the type strain, a total of 271 genes for Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> and 257 genes for Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> were shown to be essential. These essential genes were further categorized in terms of cell metabolism and enzymatic capability. Additionally, of the ~4,400 genes in the Gram-negative bacterium 
                <italic toggle="yes">Escherichia coli</italic>, Goodall 
                <italic toggle="yes">et al.</italic>,
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> Baba 
                <italic toggle="yes">et al.</italic>,
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> and Yamazaki 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup> determined that 414 genes were essential to strain K-12.</p>
            <p>Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> completed extensive further experimental and computational work to determine a minimal 
                <italic toggle="yes">B. subtilis</italic> genome they call 
                <italic toggle="yes">MiniBacillus.</italic> They present a list of 523 protein coding and 119 RNA genes necessary for a minimal 
                <italic toggle="yes">B. subtilis</italic> cell growing in complex medium at 37&#x00b0;C. While many of these genes are not essential under single deletion experimental conditions, they are required for survival because a cell needs certain essential functions which may be carried out independently by more than one gene. As noted by Reu&#x00df; 
                <italic toggle="yes">et al.</italic>,
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> the choice of which functionally isologous genes to choose for a minimal cell depends upon minimization goals and gene choices for different functions are not independent of one another. One criterion used by Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> is the conservation of the gene: &#x201c;More strongly conserved genes were preferred over less conserved genes. In this respect, gene conservation and essentiality in genome-reduced 
                <italic toggle="yes">Mycoplasma</italic> and other mollicutes species and the inclusion of genes in the genome of 
                <italic toggle="yes">M. mycoides</italic> JCVI-syn3.0 had a high priority&#x201d;. Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> do not explicitly use gene conservation at the species/subspecies pan-genome level but this seems in spirit with their criteria.</p>
            <p>Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> extended their computational prediction of 
                <italic toggle="yes">MiniBacillus</italic> by building on previous work to generate 
                <italic toggle="yes">B. subtilis</italic> strains with large genome reductions.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> They started by constructing the delta 6 strain
                <sup>
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup> (~8% genome reduction). This reduction removed: &#x201c;two prophages (SP&#x03b2;, PBSX), three prophage-like regions, and the largest operon of 
                <italic toggle="yes">B. subtilis</italic> (
                <italic toggle="yes">pks</italic>).&#x201d; The phage/prophage regions were identified in part by GC content and codon usage as a method to identify probable horizontally transferred regions. Pan-genome analysis was not used. Next, strain IIG-Bs20 was constructed from delta 6
                <sup>
                    <xref ref-type="bibr" rid="ref13">13</xref>
                </sup> by removing &#x201c;all nine prophages, seven antibiotic biosynthesis gene clusters and two sigma factors for sporulation&#x201d; in part to have a strain that would &#x201c;not produce spores, antibiotics or bacteriocins&#x201d;. A direct descendant of IIG-Bs20, strain IIG-Bs27-47-24 (~31% genome reduction), was then used to generate more reductions in two independent strains, PG10 (~35% genome reduction) and PS38 (~36% genome reduction)
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> with the goal of removing genes &#x201c;not necessary for the survival of the cell (
                <italic toggle="yes">e.g.</italic>, sporulation, antibiotic production, motility, metabolism of secondary carbon sources, and genes of unknown functions)&#x201d;. For the strains IIG-Bs27-47-24, PG10, and PS38, pan-genome analysis was not explicitly used but one of several criteria for deleting genes was a lack of conservation across broader taxonomic groups.</p>
            <p>For 
                <italic toggle="yes">E. coli</italic>, Kolisnychenko 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref5">5</xref>
                </sup> generated an initial reduced genome in order to &#x201c;serve both as a better model organism and as a more useful technological tool for genome science&#x201d; by &#x201c;deleting the largest K-islands of 
                <italic toggle="yes">E. coli</italic>, identified by comparative genomics as recent horizontal acquisitions&#x201d;. K-islands are regions unique to the K-12 strain MG1655 compared with the O157:H7 strain Sakai, and the uroseptic 
                <italic toggle="yes">E. coli</italic> strain CFT073. This comparative analysis with a limited set of genomes is an obvious precursor to pan-genome analysis with a much larger set of genomes. Umenhoffer 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref14">14</xref>
                </sup> generated the reduced 
                <italic toggle="yes">E. coli</italic> strain MDS42 to be &#x201c;free of mutation-generating IS elements&#x201d;. This approach does not rely on comparison to other strains, just the ability to identify IS elements. Csorgo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref15">15</xref>
                </sup> further reduced the MDS42 strain by &#x201c;constructing low-mutation-rate variants &#x2026; to lack most genes irrelevant for laboratory/industrial applications.&#x201d; They targeted genes likely to be core and necessary for the species to adapt to the environment but detrimental in an industrial setting where strain stability is important.</p>
            <p>Experimental studies to determine such essential genes are time consuming and often restricted to a single environmental condition using a single strain of the species. In addition, these approaches also knock out one gene at a time. As such, genes with multiple copies with redundant functions are often not considered as essential following knockout, as their additional copy is able to maintain cellular function. In other words, a viable organism would not result from deleting all but the experimentally determined essential genes from the genome. Another peculiarity of the single knockout essential genes is that pairs or cassettes of genes which can be removed and still have a viable organism are labeled essential because removal of just one gene is lethal. For example, removing the methylation gene(s) without removing the restriction digestion enzyme genes from the restriction mechanism results in cell death but the cell survives if the entire system is removed. This is likewise true for toxin/antitoxin systems.</p>
            <p>While it is possible to define &#x201c;essential&#x201d; genes relative to viability, a larger question remains: which genes define a species? While specific phenotypes can vary across strains, in general a species seems to require some minimal set of genes to not only survive in the laboratory but to thrive in its natural environment. In contrast, some strains may have retained or acquired some genes which improve survival for specific niches. Comparing the genes from multiple diverse strains of a species can help answer these questions. We define the pan-genome for a species/subspecies to be the set of predicted orthologous gene clusters (OGCs) across that set of strains. Others have allowed paralogs to be included in these gene clusters
                <sup>
                    <xref ref-type="bibr" rid="ref16">16</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref18">18</xref>
                </sup> but here we do not. This constraint forces there to be at most one gene per genome in an OGC.</p>
            <p>We further define a pan-genome graph (PGG) to be a graph with the pan-genome OGCs as nodes where an edge exists between two nodes if the respective genes for any genome from the two OGCs are adjacent in that genome. More precisely, an OGC node is represented as a dipole with 5&#x2032; and 3&#x2032; ends and the edges go from an end of one node (5&#x2032; or 3&#x2032;) to an end of another node depending on the orientation of the genes which are adjacent. The edges primarily represent the order and orientation of OGCs in the pan-genome genomes. Secondarily, the edges also represent the interstitial DNA sequences between the genes. A PGG edge has a weight equal to the number of genomes which contain the indicated adjacent gene ends. Core OGCs are defined to be those OGCs present in some large percentage of the strains in the pan-genome (&#x2265;95% in this work). Core edges are defined similarly. Core regions are defined to be the coordinates in a genome for each set of adjacent core genes in that genome provided the edges between the core genes are also core edges.</p>
            <p>The PGG is important as it captures the structure of the pan-genome in ways that simply treating the pan-genome as a set of OGCs cannot. The inherent gene context in the PGG allows for more accurate annotation of a novel genome than OGCs alone which struggle to differentiate recent paralogs/repeats. The PGG allows core regions to be defined for any genome rather than just core OGCs/genes. The PGG indicates which OGCs occur in cassettes with implications for function, evolution, and synthetic engineering. As discussed later, the PGG allows for determination of probable orthologs not captured in the OGCs.</p>
            <p>Core OGCs should determine the baseline phenotype (capabilities and traits) of a species. Previous pan-genome studies
                <sup>
                    <xref ref-type="bibr" rid="ref19">19</xref>
                </sup> have shown that species tend to only tolerate the placement of noncore genes between core regions and not within those core regions. The reason an organism might constrain a core region rather than just core OGCs is that the region may include regulatory mechanisms such as operons, which allow for co-expression of multiple functionally associated genes, or regulons which would be disrupted with the insertion of other genes. We believe that conservation of core regions in species indicates resistance to insertion or deletion of genes in these regions through evolution or through human-mediated genetic engineering.</p>
            <p>Here we present a pan-genome based calculation of core regions for 
                <italic toggle="yes">B. subtilis</italic> ssp. 
                <italic toggle="yes">subtilis</italic> and for 
                <italic toggle="yes">E. coli.</italic> These core regions are compared with previous experimentally determined essential genes from the literature. These core regions are not a replacement for experimentally determined essential genes, but rather provide complementary information about a much larger portion of the genome. We expect that all truly essential genes for the species/subspecies would be a subset of the core OGCs/regions, since core OGCs would encompass genes responsible for providing a fitness advantage in environmental conditions as well as being essential for viability. This approach automates computational prediction of core OGCs/regions which can be used to help guide the removal of genome regions not needed for species fitness and indicate which genome regions are amenable to engineered insertions. This approach is an incremental improvement over previous computational methods to aid genome engineering. Ortholog prediction
                <sup>
                    <xref ref-type="bibr" rid="ref17">17</xref>,
                    <xref ref-type="bibr" rid="ref18">18</xref>,
                    <xref ref-type="bibr" rid="ref20">20</xref>
                </sup> and determination of genes essential for most bacteria have a long history.
                <sup>
                    <xref ref-type="bibr" rid="ref21">21</xref>,
                    <xref ref-type="bibr" rid="ref22">22</xref>
                </sup> Computational prediction of nonessential genes 
                <italic toggle="yes">via</italic> predicting prophage regions or other horizontal transfer events is also well established.
                <sup>
                    <xref ref-type="bibr" rid="ref23">23</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref25">25</xref>
                </sup> Pan-genome tools, most of which at some level predict core genes, are also not new.
                <sup>
                    <xref ref-type="bibr" rid="ref26">26</xref>,
                    <xref ref-type="bibr" rid="ref27">27</xref>
                </sup> Our method builds directly upon our previous pan-genome work
                <sup>
                    <xref ref-type="bibr" rid="ref28">28</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref30">30</xref>
                </sup> and includes several improvements: 1) being able to use only complete high-quality genomes (this concept is not new, but we find it impacts the quality of the PGG and core region determination and is reasonable as more complete genomes become available); 2) checking for including the correct species/subspecies using average nucleotide identity (ANI); 3) reannotating gene features using homology and gene context to ensure consistency; and 4) generating a PGG for a rigorous definition of a core region. 
                <xref ref-type="fig" rid="f1">Figure 1</xref> shows the high-level view of our method with details provided in the Methods section.</p>
            <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                <label>Figure 1. </label>
                <caption>
                    <title>High-level overview of our method for generating a refined PGG.</title>
                </caption>
                <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/76417/ffdd4ace-2b07-465f-94e8-75e9e90d93a7_figure1.gif"/>
            </fig>
        </sec>
        <sec id="sec2" sec-type="methods">
            <title>Methods</title>
            <sec id="sec12">
                <title>Genome Selection</title>
                <p>Reference 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> and 
                    <italic toggle="yes">E. coli</italic>&#x00a0;genomes were selected for pan-genome construction using a series of filtering steps resulting in high-quality, non-redundant genome datasets (
                    <xref ref-type="table" rid="T1">Table 1</xref>). For 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> and 
                    <italic toggle="yes">E. coli</italic>, we selected strains with complete genomes in RefSeq.
                    <sup>
                        <xref ref-type="bibr" rid="ref31">31</xref>
                    </sup> We restricted our analysis to complete genomes to ensure that missing genes due to incomplete genome sequencing/assembly did not affect the approach or results. We limited our choice to RefSeq for two reasons: RefSeq performs a series of quality checks to remove dubious genome assemblies, and the initial pan-genome construction depends upon reasonably consistent annotation which RefSeq provides. We extracted the genomes based on organism name: 
                    <italic toggle="yes">Bacillus subtilis</italic> (we did not specify subspecies, since for many RefSeq genomes a subspecies is not given) and 
                    <italic toggle="yes">Escherichia coli</italic> (we also specified 
                    <italic toggle="yes">Shigella</italic> since all 
                    <italic toggle="yes">Shigella</italic> species are actually considered to be the same species as 
                    <italic toggle="yes">Escherichia coli</italic>).
                    <sup>
                        <xref ref-type="bibr" rid="ref32">32</xref>,
                        <xref ref-type="bibr" rid="ref33">33</xref>
                    </sup>
                </p>
                <p>For each pan-genome, we then compared the genomes using a fast Average Nucleotide Identity (ANI) estimate generated using the MASH distance subtracted from 1 and multiplied by 100.
                    <sup>
                        <xref ref-type="bibr" rid="ref34">34</xref>
                    </sup> We used type strains and ANI to determine which of these genomes were the desired organism. We also used ANI to remove very closely related strains to reduce oversampling bias (for example, the 
                    <italic toggle="yes">B. subtilis</italic> type strain, 168, has at least eight genomes in RefSeq). We used GGRaSP
                    <sup>
                        <xref ref-type="bibr" rid="ref28">28</xref>
                    </sup> to choose a single medoid sequence from any complete linkage ANI cluster with a threshold of 0.01% or 1/10,000 base pair difference. We removed all other genomes besides the medoid as being redundant. Each removed redundant genome would be &#x2265;99.99% ANI to the retained medoid genome. The strain 168 medoid genome is the Entrez reference genome for the 
                    <italic toggle="yes">B. subtilis</italic> type strain (GenBank sequence AL009126.3, BioSample SAMEA3138188, Assembly ASM904v1
                    <bold>/</bold>GCA_000009045.1) which can be used to map the Kobayashi 
                    <italic toggle="yes">et al.</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref7">7</xref>
                    </sup> and Koo 
                    <italic toggle="yes">et al.</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref8">8</xref>
                    </sup> results.</p>
                <p>Using this approach, for 
                    <italic toggle="yes">B. subtilis</italic>, 143 genomes were downloaded from RefSeq. Of these, 132 genomes were determined to be 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> based on type strains and ANI. The minimum ANI between any pair of the 132 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> genomes was 97.28% whereas the maximum ANI of any of the 11 other genomes to the 132 genomes was 95.73%, providing good separation between the other subspecies. By sorting the pairwise ANI matrix rows based on the ANI values in the type strain column it was clear there was a punctate threshold at ~96.5% ANI which divided 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> genomes from other genomes. This means the 11 removed genomes all have &#x2264; 95.73% ANI to the type strain, well below the 96.5% ANI threshold. The 132 genomes were further reduced to 109 genomes after removing redundant strains (using GGRaSP as discussed above). Finally, we removed strain delta6 (BioSample SAMN05150066) because it is known to have been engineered to remove multiple genes. Thus, we were left with 108 
                    <italic toggle="yes">B. subtilis</italic> genomes&#x00a0;(
                    <xref ref-type="table" rid="T1">Table 1</xref>). For 
                    <italic toggle="yes">E. coli</italic> (and 
                    <italic toggle="yes">Shigella</italic>) we downloaded 1097 complete genomes from RefSeq. Of these, 1096 were determined using ANI to be 
                    <italic toggle="yes">E. coli.</italic> The non-
                    <italic toggle="yes">E. coli</italic> genome was clearly mislabeled as its maximum ANI to any other genome was 82.27%.</p>
                <p>The minimum pairwise ANI of any of the 1096 genomes was 95.53% which is not as tight as for 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> which is to be expected given that 
                    <italic toggle="yes">E. coli</italic> is a species grouping not a subspecies grouping. One could arbitrarily try to choose a tighter grouping around the K-12 reference genome but the pairwise ANI values of the other genomes compared with the K-12 reference genome vary continuously from 96.22% to 100% with no punctate break in the values. After removing redundant genomes (using GGRaSP as discussed above), 969 
                    <italic toggle="yes">E. coli</italic> genomes remained. We added back in two redundant genomes: The K-12 Entrez 
                    <italic toggle="yes">E. coli</italic> reference strain MG1655 (BioSample SAMN02604091) and the K-12 strain BW25113 (GenBank sequence accession CP009273.1, GenBank Assembly accession ASM75055v1/GCA_000750555.1, GenBank BioSample accession SAMN03013572) used by Goodall 
                    <italic toggle="yes">et al</italic>.
                    <sup>
                        <xref ref-type="bibr" rid="ref9">9</xref>
                    </sup> These two redundant genomes were added back in so that we could map the PGG OGCs to these genomes for comparison to the established literature resulting in 971 genomes in the PGG (
                    <xref ref-type="table" rid="T1">Table 1</xref>). By using a 95% threshold for the number of genomes an OGC must be in to be considered core, some small number of the 971 genomes could be engineered to remove what are normally core OGCs and not affect the assignment of core OGCs.</p>
                <table-wrap id="T1" orientation="portrait" position="anchor">
                    <label>Table 1. </label>
                    <caption>
                        <title>Number of 
                            <italic toggle="yes">B. subtilis</italic> and 
                            <italic toggle="yes">E. coli</italic> genomes selected after each genome filtering step.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Organism</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Text-based query RefSeq download</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">ANI classification</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">GGRaSP redundancy filtering</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Final genome dataset</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">
                                    <italic toggle="yes">B. subtilis</italic>
                                </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">143</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">132</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">109</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">108</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">
                                    <italic toggle="yes">E. coli</italic>
                                </td>
                                <td align="left" colspan="1" rowspan="1" valign="top">1097</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">1096</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">969</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">971</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
            </sec>
            <sec id="sec131">
                <title>Pan-genome and PGG construction</title>
                <p>For 
                    <italic toggle="yes">B. subtilis</italic> ssp. 
                    <italic toggle="yes">subtilis</italic> and 
                    <italic toggle="yes">E. coli</italic>, initial pan-genomes were based on the RefSeq annotation of these genomes. The pan-genome was generated using the pan-genome pipeline at the J. Craig Venter Institute (JCVI) at the nucleotide level using default parameters with the exception that a minimum of 90% identity and 90% length for pairwise BLAST matches were used to prevent possible clustering of non-orthologous genes.
                    <sup>
                        <xref ref-type="bibr" rid="ref29">29</xref>
                    </sup> This produced OGCs using gene context
                    <sup>
                        <xref ref-type="bibr" rid="ref30">30</xref>
                    </sup> as well as a PGG.
                    <sup>
                        <xref ref-type="bibr" rid="ref19">19</xref>
                    </sup> The PGG has two main components: nodes representing OGCs, and edges representing the sequence between OGCs and the order and orientation of the OGCs in the genomes. We updated the code repository for the JCVI pan-genome pipeline with a script: iterate_pgg_graph.pl, which calls pgg_annotate.pl for the genomes in the existing PGG in order to ensure consistent annotation of the genomes and iterates until the PGG stabilizes. The script pgg_annotate.pl uses an existing PGG to assign regions of a genome to nodes of the graph. This is done by using BLAST to search the medoid sequence of the OGC that the node represents against the genome and then using Needleman&#x2013;Wunsch
                    <sup>
                        <xref ref-type="bibr" rid="ref35">35</xref>
                    </sup> to extend the alignment if needed. If there are conflicting BLAST matches, then the matches are resolved based on which matches are consistent with the structure of the PGG which encapsulates gene context across the entire pan-genome. Once the nodes of the PGG are mapped to each of the genomes in the pan-genome, a new version of the PGG is implicitly defined and is then explicitly extracted. This process is iterated to stability. This ensures that each genome is consistently annotated so that genes missing from the original annotation of some genomes will be consistently annotated across all genomes. A user manual for this new functionality is available at 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/JCVenterInstitute/PanGenomePipeline">https://github.com/JCVenterInstitute/PanGenomePipeline</ext-link>.</p>
                <p>Core regions were determined based on the PGG. Nodes in the PGG were OGCs. Edges in the PGG represented adjacency of genes (contained in the OGCs) in the underlying genomes. The definition of which OGCs were or were not considered &#x201c;core&#x201d; was determined relative to a threshold criterion. We used a criterion for core such that 95% or more of the underlying genomes had to contain the OGC or edge. Considering that we used only complete genomes it might have been possible to use a 100% threshold. However, we opted for a 95% threshold based on prior experience and an abundance of caution to not under call core OGCs/edges which might result in false negatives. Each core region began with a core OGC followed by a core edge (if possible, otherwise the core region comprises a single OGC) to another core OGC and so on until a core edge cannot be found to continue the core region. A core region is just a path in the PGG which was then mapped onto each genome to determine the core region coordinates. When the core threshold was below 100%, any genome may be missing an OGC (gene) or edge along this path which results in the path being broken into its remaining constituent parts.</p>
            </sec>
            <sec id="sec13">
                <title>Comparison to essential genes</title>
                <p>In order to compare core regions to experimentally determined essential genes we needed a common base of reference. For each of the experimental studies, the genes are specified based on a reference strain that was used for the experiments and has a complete genome in RefSeq. For Kobayashi 
                    <italic toggle="yes">et al</italic>.,
                    <sup>
                        <xref ref-type="bibr" rid="ref7">7</xref>
                    </sup> only gene symbols/names were given, which we mapped to Entrez GeneIDs using Entrez search. Gene symbols with no matches were manually curated to estimate the best matching gene symbol listed in the literature. For Koo 
                    <italic toggle="yes">et al.</italic>,
                    <sup>
                        <xref ref-type="bibr" rid="ref8">8</xref>
                    </sup> locus IDs were provided giving direct access to the gene coordinates for RefSeq accession NC_000964.3 (BioSample SAMEA3138188, Assembly GCF_000009045.1). For Goodall, we used the data from three studies in Table S2 from Goodall 
                    <italic toggle="yes">et al</italic>.
                    <sup>
                        <xref ref-type="bibr" rid="ref9">9</xref>
                    </sup> Gene symbols/names again were all that was available but these were consistent with the GenBank annotation downloadable in gff format for the K-12 BW25113 reference genome (GenBank accession CP009273.1) used by Goodall 
                    <italic toggle="yes">et al.</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref9">9</xref>
                    </sup> (BioSample SAMN03013572). This gave us coordinates for all essential genes on RefSeq genomes which were annotated with a PGG which produces a file with coordinates for OGCs and edges mapped to the genome. These coordinates allow us to affiliate essential genes to OGCs.</p>
            </sec>
        </sec>
        <sec id="sec3" sec-type="results">
            <title>Results</title>
            <p>The original and refined PGG statistics for 
                <italic toggle="yes">B. subtilis</italic> and 
                <italic toggle="yes">E. coli</italic> are provided in 
                <xref ref-type="table" rid="T2">Table 2</xref>. The major goal of refining the PGG using reannotation and iteration until stabilization was to achieve consistent annotation across all genomes in the PGG leading to a more comprehensive and cohesive PGG. While the RefSeq annotations of these genomes tend to be highly consistent, many small genes are often arbitrarily called from genome to genome and even some common longer genes can occasionally be missed. There are three obvious points of improvement in the refined PGG for both the OGC and edge stats: the number of size 1 OGCs/edges significantly decreased due to some dubious RefSeq gene calls being eliminated and some becoming shared with other genomes; the number of core OGCs/edges significantly increased, showing an improvement in the consistency of annotation across all genomes; and the number of genes/edge instances in OGCs/edges greatly increased, again indicating a much more consistent annotation. We have included core OGC statistics for three threshold definitions of core: 95%, 99%, and 100%. In part, this is for comparison to previous studies but it also illustrates the relatively larger impact of consistency as the threshold increases. For example, in 
                <italic toggle="yes">E. coli</italic>, the refined PGG gives an increase of 22% in core OGCs at a 95% threshold but an increase of 111% in core OGCs at a 100% threshold. When even a single misannotated gene drops an OGC below core at the 100% threshold, consistent annotation is crucial.</p>
            <table-wrap id="T2" orientation="portrait" position="anchor">
                <label>Table 2. </label>
                <caption>
                    <title>Pan-genome graph statistics for 
                        <italic toggle="yes">B. subtilis</italic> and 
                        <italic toggle="yes">E. coli.</italic>
                    </title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">PGG Statistic</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">
                                <italic toggle="yes">B. subtilis</italic> original PGG</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">
                                <italic toggle="yes">B. subtilis</italic> refined PGG</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">
                                <italic toggle="yes">E. coli</italic> original PGG</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">
                                <italic toggle="yes">E. coli</italic> refined PGG</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Size 1 OGCs</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">4434</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3231</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">87423</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">27273</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Shared (size&gt;1) OGCs</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">8174</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">8204</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">68970</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">48129</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom"># of genes in shared OGCs</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">463311</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">487562</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">5039502</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">5610683</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Core OGCs (95%)</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3558</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3778</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">2968</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3631</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Core OGCs (99%)</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3356</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3604</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">2168</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">2992</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">
                                <bold>Core OGCs (100%)</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">
                                <bold>3072</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">
                                <bold>3419</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">
                                <bold>713</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">
                                <bold>1501</bold>
                            </td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Size 1 edges</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">7282</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">5479</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">153199</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">67566</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Shared edges</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">9755</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">9433</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">99284</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">67248</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Edge instances in shared edges</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">460452</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">485177</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">4970823</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">5567497</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">Core edges (95%)</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3230</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3520</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">2218</td>
                            <td align="left" colspan="1" rowspan="1" valign="bottom">3124</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <p>The 
                <italic toggle="yes">B. subtilis</italic> ssp. 
                <italic toggle="yes">subtilis</italic> refined PGG annotates 4654 OGCs for the reference genome (GenBank sequence AL009126.3, BioSample SAMEA3138188, Assembly ASM904v1
                <bold>/</bold>GCA_000009045.1) (Supplementary Table 1): 876 (18.8% of OGCs) noncore (&lt;95% of genomes, i.e., 102 or fewer), 359 (7.71%) core but not present in all genomes (&#x2265;95% and &lt;100% of genomes, i.e., 103&#x2013;107), and 3419 (73.5%) core and present in all 108 genomes. For the union of the Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> and Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> essential gene data sets there are 305 genes (Supplementary Table 2): 16 (5.25%) noncore (&#x2264;102 genomes), 2 (0.656%) core but not all (103&#x2013;107 genomes), and 287 (94.1%) core all (108 genomes). This shows that most essential genes in 
                <italic toggle="yes">B. subtilis</italic> ssp. 
                <italic toggle="yes">subtilis</italic> are encompassed by core OGCs/regions. There are 258 core regions for 
                <italic toggle="yes">B. subtilis</italic> (Supplementary Table 3). The 289 essential genes which are core OGCs are contained in only 63 of these regions. These 289 essential genes are not evenly distributed in these 63 regions (
                <italic toggle="yes">e.g.</italic> 46 are in core region 3). Similarly, the 16 essential genes in non-core regions (the regions between core regions) are contained in only seven non-core regions with eight genes in the non-core region between core regions 206 and 207 (
                <xref ref-type="fig" rid="f2">Figure 2</xref>). A table of all 
                <italic toggle="yes">B. subtilis</italic> genes mapped to the reference genome is provided in Supplementary Table 1.</p>
            <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                <label>Figure 2. </label>
                <caption>
                    <title>There are eight tracks mapped to the 
                        <italic toggle="yes">B. subtilis</italic> reference genome in this Circos figure.</title>
                    <p>Going from the outside to the inside: track 1) core regions (dark red), 2) <italic toggle="yes">MiniBacillus</italic> genes (green), 3) Koo 
                        <italic toggle="yes">et al</italic>.
                        <sup>
                            <xref ref-type="bibr" rid="ref8">8</xref>
                        </sup> essential genes (light blue), 4) Kobayashi 
                        <italic toggle="yes">et al</italic>.
                        <sup>
                            <xref ref-type="bibr" rid="ref7">7</xref>
                        </sup> essential genes (medium blue), 5) deleted genes in strain delta6 (dark blue), 6) deleted genes in strain IIG-Bs27-47-24 (orange), 7) deleted genes in strain PG10 (yellow), and 8) deleted genes in strain PS38 (red).</p>
                </caption>
                <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/76417/ffdd4ace-2b07-465f-94e8-75e9e90d93a7_figure2.gif"/>
            </fig>
            <p>The Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> data set for 
                <italic toggle="yes">MiniBacillus</italic> has 523 protein coding and 119 RNA genes predicted to be necessary for a minimal 
                <italic toggle="yes">B. subtilis.</italic> For the 523 protein coding genes: 18 are noncore (&#x2264;102 genomes), 16 are core but not in all genomes (one in 105, one in 106, 14 in 107 genomes), and 489 are in all 108 genomes (Supplementary Table 1). They include all 30 rRNA and 86 tRNA genes from the reference genome as well as three &#x201c;misc&#x201d; RNA genes in 
                <italic toggle="yes">MiniBacillus.</italic> The three misc RNA genes are present in all 108 genomes. In all likelihood, the 10 copies of the 16S-23S-5S RNA operon are not required but it is safer for robust growth not to delete any of them. The same applies to the tRNA genes, many of which are redundant. For the 30 rRNA genes: six are noncore (92&#x2013;102 genomes), 16 are core but not in all genomes (103&#x2013;107), and eight are in all 108 genomes. It is clearly possible that some of these strains are dispensing with some of the RNA operons, but at most this is happening rarely, reinforcing the decision not to remove any from 
                <italic toggle="yes">MiniBacillus.</italic> In addition, some of the missing RNA operon genes may be due to incorrect assembly of the two sets of tandem RNA operons (one a two-unit tandem and one a three-unit tandem) as large tandem repeats can be problematic for assemblers. All the rRNA genes in fewer than 106 genomes are in the tandem rRNA operons (Supplementary Table 4). Of course, the tandem rRNA operons are the most likely to be deleted via recombination as well. For the 86 tRNA genes: 13 are noncore (100&#x2013;102 genomes), 18 are core but not in all genomes (103&#x2013;107), and 55 are in all 108 genomes. Retaining all the tRNA genes in 
                <italic toggle="yes">MiniBacillus</italic> also seems to be the correct decision as strains rarely dispose of the tRNA genes.</p>
            <p>Both the experimentally determined essential genes and the predicted core OGCs/regions are important data for genome engineering. They both indicate regions that should not be deleted without careful consideration. The noncore regions also indicate where the bacterium is more likely to tolerate engineered insertions. As a validation of our method, and to guide interpretation of the results it produces, it is important to understand why 16 essential genes are in noncore regions.</p>
            <p>For 
                <italic toggle="yes">B. subtilis</italic>, both Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> and Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> used similar single knockout methods to determine &#x201c;essential&#x201d; protein-coding genes when grown in LB at 37&#x00b0;C. Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> identified 257 essential genes while Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> identified 271 essential genes. The union of these two sets results in 305 essential genes (Supplementary Table 2). The Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> data set has 257 genes. The Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> data set has 271 genes. There are 223 genes in common between the two data sets. 48 genes are only in the Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> data set. 34 genes are only in the Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> data set. The Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> data set has been refined with time:
                <sup>
                    <xref ref-type="bibr" rid="ref36">36</xref>
                </sup> &#x201c;Of the original 271 genes, 31 were shown to be non-essential in recent studies. Moreover, 21 new genes (19 protein-coding genes and two RNA-coding genes) were added to the list. Thus, 261 genes encoding 259 proteins and two RNAs are regarded as being essential today&#x201d;. This list of 259 protein-coding genes is more consistent with the more recent Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> data set. The 305 genes found in either data set were mapped to the PGG OGCs using the RefSeq genome NC_000964.3 (BioSample SAMEA3138188). Interestingly through this mapping, 16 of the essential genes were not identified as core OGCs (two more essential genes were core OGCs but not present in all 108 genomes). For the 18 essential genes not present in all 108 genomes (Supplementary Table 5), 12 are in both data sets and six are only in the Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> data set. We believe only 11 of the 18 genes are truly essential. Gene 
                <italic toggle="yes">wapI/yxxG</italic> (OGC 4769 present in 39 of 108 genomes) is an antitoxin for the 
                <italic toggle="yes">wapA</italic> toxin gene which is adjacent to it (present in 85 of 108 genomes).
                <sup>
                    <xref ref-type="bibr" rid="ref37">37</xref>
                </sup> Gene 
                <italic toggle="yes">rttF/yqcF</italic> (OGC 4590 present in 46 of 108 genomes) and gene 
                <italic toggle="yes">rtbE/yxxD</italic> (OGC 4772 present in 53 of 108 genomes) are also the antitoxin of a cognate toxin-antitoxin pair.
                <sup>
                    <xref ref-type="bibr" rid="ref38">38</xref>
                </sup> Gene 
                <italic toggle="yes">yezG</italic> (OGC 4411 present in 43 of 108 genomes) is also the antitoxin for a cognate toxin&#x2013;antitoxin pair.
                <sup>
                    <xref ref-type="bibr" rid="ref39">39</xref>
                </sup> Gene 
                <italic toggle="yes">sknR/yqaE</italic> (OGC 4643 present in 34 of 108 genomes) is part of a phage-like region which, if removed, would still allow 
                <italic toggle="yes">B. subtilis</italic> to remain viable
                <sup>
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup> possibly because it is another antitoxin or similar mechanism. Genes 
                <italic toggle="yes">bsuMA/ydiO</italic> (OGC 4838 present in 24 of 108 genomes) and 
                <italic toggle="yes">bsuMB/ydiP</italic> (OGC 4839 present in 24 of 108 genomes) are part of a prophage region of about 15 genes in 48 genomes which includes 
                <italic toggle="yes">ydiR</italic> and 
                <italic toggle="yes">ydiS</italic> which are type-2 restriction enzymes. These are not essential genes, but they are essential if the restriction enzymes are present.
                <sup>
                    <xref ref-type="bibr" rid="ref40">40</xref>
                </sup> We are not the first to notice these issues with experimentally determined essential genes, as indicated by the references above. In their review, Commichau 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref36">36</xref>
                </sup> referred to these as &#x201c;protective essential genes.&#x201d; In fact, Koo 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> also addressed this in their paper: &#x201c;Of the 257 genes essential in LB medium, 30 are not essential in some other growth condition or genomic context...LB may have an insufficient amount of particular compounds; 
                <italic toggle="yes">e.g.</italic>, the 
                <italic toggle="yes">ylaN</italic> mutant requires a higher amount of iron than that present in LB &#x2026; or may lack a compound that could bypass the need for that gene product; 
                <italic toggle="yes">e.g.</italic>, 
                <italic toggle="yes">eno, pgm, gapA</italic>, and 
                <italic toggle="yes">alrA</italic> &#x2026; Some gene products are essential only at high growth rates typical of LB at 37&#x00b0;C (
                <italic toggle="yes">smc</italic> and 
                <italic toggle="yes">scpA</italic> &#x2026;), and these may not be essential in the natural soil environment where 
                <italic toggle="yes">B. subtilis</italic> grows slower. Finally, some genes are non-essential in specific genetic backgrounds, 
                <italic toggle="yes">e.g.</italic>, antitoxins can be deleted in strains lacking their cognate toxin gene&#x201d;.</p>
            <p>Another eight essential non-core genes are involved in wall teichoic acid (WTA) biosynthesis: Genes 
                <italic toggle="yes">tuaB</italic> (OGC 4729 present in 85 of 108 genomes), 
                <italic toggle="yes">mnaA/yvyH</italic> (OGC 4735 present in 84 of 108 genomes), 
                <italic toggle="yes">tagH</italic> (OGC 4744 present in 84 of 108 genomes), 
                <italic toggle="yes">tagG</italic> (OGC 4745 present in 35 of 108 genomes), 
                <italic toggle="yes">tagF</italic> (OGC 4746 present in 35 of 108 genomes), 
                <italic toggle="yes">tagD</italic> (OGC 4748 present in 35 of 108 genomes), 
                <italic toggle="yes">tagA</italic> (OGC 4749 present in 35 of 108 genomes) and 
                <italic toggle="yes">tagB</italic> (OGC 4750 present in 35 of 108 genomes). The WTA genes are involved in production of anionic glycopolymers required for consistent cell shape and division.
                <sup>
                    <xref ref-type="bibr" rid="ref41">41</xref>
                </sup> The WTA genes are part of a 31 gene region which has been shown to be dispensable
                <sup>
                    <xref ref-type="bibr" rid="ref42">42</xref>
                </sup> but results in malformed cells with poor growth properties. Gene 
                <italic toggle="yes">rodA</italic> (OGC 3994 present in 97 of 108 genomes) appears to be the exception as it is asserted to be essential for maintaining a rod shape and preventing spherical cells which lyse.
                <sup>
                    <xref ref-type="bibr" rid="ref43">43</xref>
                </sup> Kobayashi 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> stated: &#x201c;Ten essential genes are involved in cell shape and division. Septum formation requires seven (ftsA, L, W, and Z, divIB and C, and pbpB &#x2026;), whereas cell shape requires three (rodA, and mreB and C).&#x201d; Interestingly, genes 
                <italic toggle="yes">ftsZ</italic> (OGC 1675 present in 105 of 108 genomes) and 
                <italic toggle="yes">pbpB</italic> (OGC 1662 present in 107 of 108 genomes), while considered core using our 95% of genomes definition, are the only core OGCs among the essential genes that are not present in all 108 genomes. We investigated these 11 genes further to understand why essential genes did not appear to be core OGCs. By examining the PGG we discovered that alternate OGCs with homology to the essential genes had replaced the essential genes. Gene 
                <italic toggle="yes">pbpB</italic> (OGC 1662 in 107 genomes) is replaced in the one remaining genome by OGC 7120 which is also annotated as 
                <italic toggle="yes">pbpB.</italic> Gene 
                <italic toggle="yes">ftsZ</italic> (OGC 1675 in 105 genomes) is replaced in three genomes by a four gene insertion of OGCs 8068, 8300, 8069, and 8070 where both 8068 and 8070 are annotated as 
                <italic toggle="yes">ftsZ.</italic> Gene 
                <italic toggle="yes">rodA</italic> (OGC 3994 in 97 genomes) is replaced by either: OGC 8718 (two genomes) or OGCs 10492, 6436, and 6437 (one genome) or OGCs 6436 and 6437 (eight genomes) where 8718 and 6436 are annotated as 
                <italic toggle="yes">rodA.</italic> As an illustrative example for 
                <italic toggle="yes">rodA</italic>, 
                <xref ref-type="fig" rid="f3">Figure 3</xref> shows how this is represented in the PGG. The medoid sequences for OGCs 6436 (A4A60_RS20560) and 8718 (C7M30_RS12210) have full-length homology to the medoid sequence for <italic toggle="yes">rodA</italic> (OGC 3994, ETA10_RS20040) with 66% nucleotide/65% peptide and 83% nucleotide/85% peptide identity, respectively. For 
                <italic toggle="yes">B. subtilis</italic> ssp. 
                <italic toggle="yes">spizizenii</italic> strain W23, poly (ribitol phosphate) is the main teichoic acid
                <sup>
                    <xref ref-type="bibr" rid="ref44">44</xref>
                </sup> and this was thought to distinguish ssp. 
                <italic toggle="yes">spizizenii</italic> from ssp. 
                <italic toggle="yes">subtilis</italic> whose type strain 168 has poly (glycerol phosphate) as the main teichoic acid. Further study found that the ribitol/glycerol distinction does not distinguish between 
                <italic toggle="yes">spizizenii</italic> and 
                <italic toggle="yes">subtilis</italic> subspecies
                <sup>
                    <xref ref-type="bibr" rid="ref45">45</xref>
                </sup> but rather either subspecies can contain one or the other. Our PGG confirms this and in fact finds six distinct variants of the WTA region. For example the 
                <italic toggle="yes">tagD</italic> gene (OGC 4748 in 35 genomes) has been replaced by multiple orthologs with the same annotation: OGC 3746 (23 genomes), OGC 5431 (43 genomes), OGC 6915 (two genomes), OGC 7624 (three genomes), and OGC 8731 (one genome). The variation of the WTA region in 
                <italic toggle="yes">B. subtilis</italic> will be the focus of a future paper.
                <sup>
                    <xref ref-type="bibr" rid="ref46">46</xref>
                </sup>
            </p>
            <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                <label>Figure 3. </label>
                <caption>
                    <title>Region of the 
                        <italic toggle="yes">B. subtilis</italic> refined PGG encompassing the variation in the 
                        <italic toggle="yes">rodA</italic> gene across the pan-genome.</title>
                    <p>OGC 3994 (red) contains the rodA gene from the reference strain. The medoid sequences of OGCs 6436 and 8718 (green) have RefSeq annotations of 
                        <italic toggle="yes">rodA</italic> and full-length homology below our 90% threshold to the medoid sequence for OGC 3994. The arrow boxes represent OGCs with gene directionality indicated by the 5&#x2032; end being flat and the 3&#x2032; end being pointed. Numbers above boxes and edges give the number of genomes in which the OGC or edge is present.</p>
                </caption>
                <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/76417/ffdd4ace-2b07-465f-94e8-75e9e90d93a7_figure3.gif"/>
            </fig>
            <p>For the 34 protein coding genes from 
                <italic toggle="yes">MiniBacillus</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> which were not in all 108 genomes (Supplementary Table 6), 10 were already discussed above as to why they were essential but not core. The seven essential genes previously shown to be protective essential genes are as expected not in the 
                <italic toggle="yes">MiniBacillus</italic> data set. The noncore 
                <italic toggle="yes">tuaB</italic> gene was essential in both data sets but not included in 
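                <italic toggle="yes">MiniBacillus.</italic> This leaves 24 <italic toggle="yes">MiniBacillus</italic> protein coding genes which are not present in all 108 genomes and are as yet unexplained. The <italic toggle="yes">tagU</italic> and <italic toggle="yes">gtaB</italic> genes are part of the WTA cassette discussed above. The four 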
                <italic toggle="yes">fecC-F</italic> (also called 
                <italic toggle="yes">yfmC-F</italic>) genes form a cassette and are in 98 genomes. From Reu&#x00df; 
                <italic toggle="yes">et al.</italic>:
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> &#x201c;For iron uptake, the minimal cell should possess the EfeUO system for elemental iron uptake and the iron-citrate ABC transporter YhfQ-YfmCDEF (136, 137).&#x201d; (
                <italic toggle="yes">yhfQ</italic> is present in all 108 genomes) but no alternate mechanism is specified. The seven 
                <italic toggle="yes">purEKBCSQL</italic> genes form a cassette and are in 107 genomes. These genes are involved in purine biosynthesis (see Figure 5 in Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup>) and it is not clear what alternative could be used. The 
                <italic toggle="yes">guaA</italic> gene is involved in nucleotide biosynthesis downstream of purine biosynthesis (see Figure 5 in Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup>) and is present in 107 genomes. The 
                <italic toggle="yes">mntH</italic> gene is a manganese transporter (see Figure 2 in Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup>) present in 106 genomes. The 
                <italic toggle="yes">rlmCD</italic> gene is an rRNA methyltransferase present in 107 genomes. The 
                <italic toggle="yes">lytE</italic> and 
                <italic toggle="yes">ponA</italic> genes are in 107 genomes. The 
                <italic toggle="yes">pbpB</italic> gene was essential as discussed above and in 107 genomes. The 
                <italic toggle="yes">pbpA</italic> gene is in 50 genomes. From Reu&#x00df; 
                <italic toggle="yes">et al.</italic>:
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> &#x201c;For the minimal cell, we have selected penicillin-binding proteins 1 (PonA), 2B (PbpB), and 2A (PbpA) and the autolysins LytE and LytF. As outlined above, this selection was made according to their expression profiles and the dependence on other proteins. As an example, there is a functional paralog of LytE, CwlO. For the activity of CwlO, 
                <italic toggle="yes">B. subtilis</italic> also needs the ABC transporter FtsEX and the small protein Mbl. Thus, the choice of LytE allowed a smaller number of genes.&#x201d;. Interestingly, genes 
                <italic toggle="yes">cwlO</italic>, 
                <italic toggle="yes">ftsE</italic>, 
                <italic toggle="yes">ftsX</italic>, and 
                <italic toggle="yes">mbl</italic> are in all 108 genomes. The 
                <italic toggle="yes">yitI</italic> gene is in 107 genomes. From Reu&#x00df; 
                <italic toggle="yes">et al.</italic>:
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> &#x201c;Moreover, based on our own experimental data and those of colleagues, YitI, YitW, and YqhY are important for viability (P. Dos Santos, personal communication; our unpublished results).&#x201d;. The 
                <italic toggle="yes">yoaE</italic> gene is a formate dehydrogenase present in 89 genomes. The 
                <italic toggle="yes">thyB</italic> gene is thymidylate synthase B present in 70 genomes. The 
                <italic toggle="yes">rpoE</italic> gene is in 107 genomes. From Reu&#x00df; 
                <italic toggle="yes">et al.</italic>:
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> &#x201c;Moreover, we have included the RNA polymerase-interacting protein HelD and the nonessential delta subunit (RpoE). HelD binding stimulates transcription in an RpoE-dependent manner, suggesting that these two accessory proteins are important to allow rapid growth (59, 60).&#x201d;. The 
                <italic toggle="yes">hutM</italic> gene is a histidine permease present in 90 genomes. 
                <italic toggle="yes">MiniBacillus</italic> does not include the adjacent 
                <italic toggle="yes">hutPHUIG</italic> genes, which are in 88&#x2013;91 genomes, probably indicating a cassette of genes which interact.</p>
            <p>We looked at how our OGCs intersected with the gene deletions from 
                <italic toggle="yes">B. subtilis</italic> strains delta 6, IIG-Bs27-47-24, PG10, and PS38 from Reu&#x00df; 
                <italic toggle="yes">et al.</italic>&#x2019;s
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> Supplemental Table S1. Strains PG10 and PS38 were derived from strain IIG-Bs27-47-24 which in turn was derived from strain delta 6. This means all deletions in delta 6 are present in the other strains, and all deletions in IIG-Bs27-47-24 are present in PG10 and PS38. For delta 6, most of the deleted genes are noncore which would be expected since most of the deleted regions were phage/prophage regions (
                <xref ref-type="table" rid="T3">Table 3</xref>). For additional deletions to IIG-Bs27-47-24, almost a quarter of the deleted genes are noncore which would again be expected as more prophage and horizontally transferred regions were intentionally targeted but now more core genes were deleted based on core functionality deemed not to be essential for laboratory growth such as sporulation (
                <xref ref-type="table" rid="T3">Table 3</xref>). For additional deletions to PG10 and PS38, most deleted genes were core as most of the obviously horizontally transferred regions had already been deleted (
                <xref ref-type="table" rid="T3">Table 3</xref>). While pan-genome analysis was not used to select the deleted regions, we believe it could have provided strong evidence to support the deletion of the noncore genes/regions which were deleted. In addition, it could be used to suggest further deletions. There are nine noncore regions which contain seven or more noncore genes which have not yet been deleted in any of these strains (
                <xref ref-type="table" rid="T4">Table 4</xref>). The largest of these regions contains the WTA genes cassette we discussed above and is not a good candidate for deletion. By examining the refined PGG at these regions it is straightforward to determine if there are alternate OGC choices for the region that in sum designate the region as likely to be core as we showed in 
                <xref ref-type="fig" rid="f3">Figure 3</xref>.</p>
            <table-wrap id="T3" orientation="portrait" position="anchor">
                <label>Table 3. </label>
                <caption>
                    <title>The number of deleted genes from 
                        <italic toggle="yes">B. subtilis</italic> reduced strains which are noncore versus core.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Strains</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Number of noncore deleted genes</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Number of core deleted genes</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">delta 6, IIG-Bs27-47-24, PG10, PS38</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">340</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">46</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">IIG-Bs27-47-24, PG10, PS38</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">232</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">792</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">PG10, PS38</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">7</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">57</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">PG10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">14</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">78</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">PS38</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">15</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">129</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <table-wrap id="T4" orientation="portrait" position="anchor">
                <label>Table 4. </label>
                <caption>
                    <title>Large noncore regions which have not been deleted from any of the strains delta 6, IIG-Bs27-47-24, PG10, or PS38.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">First gene</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Last gene</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Number of noncore genes in region</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Number of core genes in region</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Alternate genes in refined PGG</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU04270,
                                <italic toggle="yes">epsJ</italic>,OGC4339</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU04320,
                                <italic toggle="yes">kimA</italic>,OGC4344</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">7</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">no</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU05040,
                                <italic toggle="yes">yddN</italic>,OGC4348</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU05110,
                                <italic toggle="yes">sufLC</italic>,OGC579</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">yes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU07440,
                                <italic toggle="yes">yfmK</italic>,OGC4418</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU07550,
                                <italic toggle="yes">yflT</italic>,OGC4427</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">no</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU11910,
                                <italic toggle="yes">yjcM</italic>,OGC4457</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU11990,
                                <italic toggle="yes">yjdB</italic>,OGC4465</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">yes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU18940,
                                <italic toggle="yes">yobHm</italic>,OGC4568</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU19000,
                                <italic toggle="yes">rttL</italic>,OGC2078</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">yes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU29280,
                                <italic toggle="yes">ytnM</italic>,OGC4678</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU29400,
                                <italic toggle="yes">ascR</italic>,OGC4689</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">13</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">no</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU35550,
                                <italic toggle="yes">tuaG</italic>,OGC4724</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU35770,
                                <italic toggle="yes">tagC</italic>,OGC4751</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">28</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">yes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU37220,
                                <italic toggle="yes">ywjB</italic>,OGC3904</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU37320,
                                <italic toggle="yes">narK</italic>,OGC3915</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">11</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">yes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU39850,
                                <italic toggle="yes">yxbF</italic>,OGC4783</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">BSU39920,
                                <italic toggle="yes">asnH</italic>,OGC4790</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">9</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">yes</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <p>To show that our method produces significantly different results than previous methods, we compared our pan-genome analysis to the very recent work on a 
                <italic toggle="yes">B. subtilis</italic> pan-genome by Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> While the focus of Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> was on determining which genomes should be excluded from a species/subspecies pan-genome based on &#x201c;incorrectly classified 
                <italic toggle="yes">Bacillus</italic> subspecies strains, phylogenetically distinct strains, engineered genome-reduced strains, chimeric strains, strains with a large number of unique genes or a large proportion of pseudogenes, and multiple clonal strains&#x201d;, their analysis focused on how this affected the determination of core OGCs. We compared our core OGC set to theirs for the reference genome. Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> discussed two pan-genome data sets: &#x201c;old (89 strains) and new (153 strains)&#x201d;. We compared to the new data set which is more recent and more comparable to our pan-genome of 108 strains (Supplementary Table 5). After removing &#x201c;confounding&#x201d; strains the new data set had 128 strains. From their Table 1 compared to our 
                <xref ref-type="table" rid="T2">Table 2</xref>, Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> have many fewer core OGCs whether defined at 95%, 99%, or 100% both for our original and refined PGGs. We compared their methods to ours to attempt to account for the difference. They also apparently restricted genomes to those available from RefSeq since they mention a RefSeq ID. They did not require the genomes to be considered complete by RefSeq as we did but instead used these criteria: &#x201c;Among these 
                <italic toggle="yes">B. subtilis</italic> strains, we removed strains whose N base content was greater than 1% of the genomic size (FB6-3,GS 188, SR1), and we removed the chimeric genome BEST7613 with a genome size of 7.6 Mb.&#x201d;. We used the RefSeq annotation which is generated by a consistent NCBI annotation pipeline. They also tried to ensure consistent annotation: to &#x201c;ensure the consistency and reliability of the annotation and gene prediction of the genome, we used the program Prokaryotic Genome Annotation System (Prokka)&#x201d;. We doubt the different annotations from these two established pipelines account for many differences in core OGCs. Both methods used a whole genome ANI method to discard outlier genomes. There are multiple differences in our pan-genome approach. First, we used PanOCT and they used Roary. Second, we used all annotated gene features: gene (protein coding), pseudogene, miscRNA, rRNA, and tRNA, whereas they used only protein-coding genes. Finally, and we think most importantly, we iterated over annotating the genomes and PGG refinement to ensure consistent annotation and they did not. To see what impact our choice of all gene features versus just protein-coding genes had, we looked at the annotation of core OGCs on the reference genome (Supplementary Table 1). Luckily all 3778 core (95% threshold) OGCs are present in the reference genome. Of these, 3473, 3334, and 3189 are protein coding OGCs at thresholds 95%, 99%, and 100% respectively. All these numbers are still much higher than those reported by Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> We should note that even though we did not count the 25 core OGCs annotated as pseudogenes in the reference genome, some of the core protein-coding OGCs in the reference genome might be annotated as pseudogenes in other genomes which could impact the Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> numbers. Roary tends to require near full length gene matches which is why we required PanOCT to only use 90% or longer length matches. The authors chose to limit Roary to 95% identity or higher matches, which we think is much too high: the species ANI threshold is 95%, and even the subspecies ANI threshold of 98% is too close to this threshold given that some genes are more rapidly evolving than others. We therefore used a threshold of 90% or higher identity for matches. Even with our 90% identity threshold some genes such as 
                <italic toggle="yes">rodA</italic>, discussed above, drop below this threshold generating possibly unnecessary branching in the PGG. Of the 128 strain pan-genome from Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> that we compared to our 108 strain pan-genome, 92 strains were in common with 16 being exclusive to our pan-genome and 36 being exclusive to theirs. Of the 36 strains exclusive to theirs 23 were removed as being redundant at the ANI level by us, 9 were in RefSeq but not complete genomes, and 4 either were never in RefSeq (they do not have RefSeq IDs in their Supplementary Table 3) or no longer are. Interestingly, while 15 of the 16 genomes exclusive to ours are just more recent strains to RefSeq, one strain, D12-5, was used by us but discarded by them. They discarded D12-5 because &#x201c;BS155 and D12-5 possess the largest proportion of pseudogenes (37.96% and 11.32%) among the 
                <italic toggle="yes">B. subtilis</italic> strains&#x201d; and for D12-5 they indicated this was due to a large number of frameshifts. Pseudogenes due to frameshifts are often an indication of lower quality assembly consensus sequence from using only long reads at lower coverage. Our pan-genome method is resilient to this kind of error profile in the genome due to reannotation of the genomes and PGG refinement whereas other pan-genome methods are not. We believe our higher counts for core protein coding OGCs is correct. To validate this, we compared how many of the 305 essential 
                <italic toggle="yes">B. subtilis</italic> genes are core for both methods. For the 18 genes we discussed above that are essential but not in all 108 genomes of our PGG, 2 are core at 95% and 1 is core at 99%; whereas, for Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> 2 are core at 95%, 2 are core at 99%, and 1 is core at 100%. The only significant difference for these 18 genes is that 
                <italic toggle="yes">ftsZ</italic> is in 95% (105) of our pan-genome and 100% of theirs. The Wu 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref47">47</xref>
                </sup> pan-genome misses many additional essential genes which ours does not: 28, 39, and 47 for 95%, 99%, and 100% thresholds respectively.</p>
            <p>For 
                <italic toggle="yes">E. coli</italic>, Goodall 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> determined 
                <italic toggle="yes">E. coli</italic> essential genes using an analysis of transposon insertion events (TraDIS). The results of their study and two other studies, the Keio collection
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> and the Profiling of the 
                <italic toggle="yes">E. coli</italic> Chromosome (PEC)
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup> were captured in Table S2 of Goodall 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> Of the 414 genes with overlap between these studies, the 248 essential genes in common for all three studies are all core OGCs (
                <xref ref-type="fig" rid="f4">Figure 4</xref> and Supplementary Table 7). This set of 248 essential genes should be the highest quality predictions as determined by all three studies and confirms our assertion that essential genes should almost always be core OGCs. The next highest quality set of essential gene predictions is the 45 essential genes where two of the three studies agree, of which 41 are core OGCs: for Keio&#x2013;PEC, 15 of 16 are core OGCs; for TraDIS&#x2013;Keio, eight of 11 are core OGCs; and for TraDIS&#x2013;PEC, 18 of 18 are core OGCs (Supplementary Table 7). The lowest quality set of essential gene predictions is the 121 essential genes reported by only one study, of which 89 are core OGCs: for Keio only, 12 of 22 are core OGCs; for PEC only, 18 of 18 are core OGCs; and for TraDIS only, 59 of 81 are core OGCs (Supplementary Table 7). One of the noncore essential genes present in two studies (TraDIS&#x2013;Keio), 
                <italic toggle="yes">racR,</italic> is probably a toxin suppressor which is not essential in the absence of the toxins. Bindal 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref48">48</xref>
                </sup> noted, &#x201c;We further show that both YdaS and YdaT can act independently as toxins and that RacR serves to counteract the toxicity by tightly downregulating the expression of these toxins&#x201d;. The 
                <italic toggle="yes">racR</italic> gene is found in only 106 of the 971 genomes in the 
                <italic toggle="yes">E. coli</italic> PGG, whereas 
                <italic toggle="yes">ydaS</italic> and 
                <italic toggle="yes">ydaT</italic> are found in 106 and 150 genomes respectively, perhaps arguing that 
                <italic toggle="yes">ydaS</italic> is the key toxin gene. This recapitulates the pattern we observed in 
                <italic toggle="yes">B. subtilis</italic> where toxin suppressor genes are only essential in the presence of toxin genes. Similarly, the 
                <italic toggle="yes">dicA</italic> gene (TraDIS&#x2013;Keio) can be deleted if the 
                <italic toggle="yes">dicB</italic> gene is also deleted. Kato 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref49">49</xref>
                </sup> noted: &#x201c;The 
                <italic toggle="yes">dicA</italic> gene encoding a repressor of a cell division inhibitor was deleted in our study with the 
                <italic toggle="yes">dicB</italic>, the inhibitor gene&#x201d;. There are 521 core regions for 
                <italic toggle="yes">E. coli</italic> (Supplementary Table 8). The 378 essential genes which are core OGCs are contained in only 133 of these regions. These 378 essential genes are not evenly distributed in these 133 regions (
                <italic toggle="yes">e.g.</italic>, 27 are in core region 362). Similarly, the 36 essential genes in non-core regions (the regions between core regions) are contained in only 23 non-core regions with four in the non-core region between core regions 152 and 153. A table of all 
                <italic toggle="yes">E. coli</italic> genes mapped to the reference is provided in Supplementary Table 9.</p>
            <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                <label>Figure 4. </label>
                <caption>
                    <title>There are four tracks mapped to the 
                        <italic toggle="yes">E. coli</italic> reference genome in this Circos figure.</title>
                    <p>Going from the outside to the inside: track 1) core regions (dark red), 2) TraDIS essential genes (green), 3) Keio essential genes (light blue), and 4) PEC essential genes (medium blue).</p>
                </caption>
                <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/76417/ffdd4ace-2b07-465f-94e8-75e9e90d93a7_figure4.gif"/>
            </fig>
            <p>Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> presented a similar pan-genome analysis for 491 
                <italic toggle="yes">E. coli</italic> strains. There were 420 strains in common between the Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> 491 strain pan-genome and our 971 strain pan-genome (Supplementary Table 10). Our pan-genome included 
                <italic toggle="yes">Shigella</italic> species (see Methods) which Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> did not. This added diversity of our pan-genome should reduce the number of core OGCs. Likewise, the much larger number of strains in our pan-genome should reduce the number of core OGCs. Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> report 867 core protein-coding genes presumably at a 100% threshold although this is not explicitly stated. For our refined PGG, we had 1501 core OGCs at the 100% threshold. We include all genes in our OGCs but 1234 of the 1501 core OGCs are protein coding at the 100% threshold. Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> did not provide a table of their core genes for the sake of comparison; however, we expect for the same reasons as for our more detailed analysis of the 
                <italic toggle="yes">B. subtilis</italic> pan-genome that our set of core OGCs is more complete. Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> reported that their core genes included 243 essential genes from the DEG database
                <sup>
                    <xref ref-type="bibr" rid="ref51">51</xref>
                </sup> which contains essential genes from many studies but did not provide a table of these genes. Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> also reference two essential gene studies, one by Gerdes 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref52">52</xref>
                </sup> and one by Baba 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> which was one of the three studies we used (Keio). In the DEG database the Gerdes 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref52">52</xref>
                </sup> study has 609 essential genes, and the Baba 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> study has 296 essential genes. Our version of the Baba 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> study, which we called Keio, had 297 essential genes of which 218 were core OGCs at the 100% threshold. For the union of the three studies we compared against, we had 289 essential genes out of 414 which were core OGCs at the 100% threshold. It is unclear whether the 243 core essential genes Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> reported were from the Baba 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> study, the Gerdes 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref52">52</xref>
                </sup> study, or the union of the two studies. Given the much lower number of core genes for the Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> core genes compared with our core OGCs, we believe that Yang 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref50">50</xref>
                </sup> used the union of essential genes from the Baba 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> and Gerdes 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref52">52</xref>
                </sup> studies.</p>
            <p>There is of course no &#x201c;gold standard&#x201d; that provides a 100% correct set of core regions/genes for a pan-genome/species. When comparing our method to others, this leaves only indirect measures of accuracy. We compared our method versus two other recent core gene determinations for 
                <italic toggle="yes">Bacillus subtilis</italic> and 
                <italic toggle="yes">Escherichia coli</italic> and showed that our method was superior using coverage of essential genes by core genes as an indirect measure. We also showed that the PGG allowed for a detailed analysis of exceptions such as when an OGC is replaced by a more distant ortholog.</p>
        </sec>
        <sec id="sec4" sec-type="discussion">
            <title>Discussion</title>
            <p>For the purpose of biological engineering, determining the set of core regions for a given species is critical as changes to these regions should be expected to reduce fitness or be lethal. Core regions indicate parts of the genome that are conserved across evolution within a species. These regions are not necessarily required for survival but presumably confer a fitness advantage and define the characteristic core genotype which produces the core phenotype (lifestyle). Since most essential gene studies are carried out under specific static laboratory growth conditions, genes which would normally be essential for a species across a diverse set of dynamic environmental conditions might not be discovered (
                <italic toggle="yes">e.g.</italic>, necessary for fluctuating temperatures). Correspondingly, genes required to outcompete rival organisms through increased fitness or to evade immune responses might not be found under laboratory conditions and are considered facultative essential.
                <sup>
                    <xref ref-type="bibr" rid="ref31">31</xref>
                </sup> Core regions, therefore, should be a superset of essential genes in most cases but exceptions might occur for genes which are not needed in a species&#x2019; natural niche but are required in a laboratory setting. Another exception would be for genes which are essential for a particular strain but not for other strains due to the presence of compensating non-core genes.</p>
            <p>Noncore OGCs/regions which are determined by pan-genome analysis are often horizontally transferred elements, such as phage, prophage, or mobile elements. For industrial applications these regions are dispensable and can even be sources of genome instability.
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>,
                    <xref ref-type="bibr" rid="ref12">12</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref14">14</xref>,
                    <xref ref-type="bibr" rid="ref53">53</xref>
                </sup> While there are other methods for identifying these regions, pan-genome analysis is a reliable complementary tool. Pan-genome analysis can also reveal enzymatic and other systems/pathways that are present in some strains but not others
                <sup>
                    <xref ref-type="bibr" rid="ref53">53</xref>
                </sup> which indicates they can likely be removed. When choosing between retaining alternate systems for essential functions, biological engineers have looked at conservation of those systems across broad taxonomic levels
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> as an indication of utility and we believe conservation across the pan-genome should also be considered. When specific genes/systems of known function are being targeted for removal, pan-genome analysis is less useful but still provides worthwhile information. For instance, Reu&#x00df; 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> tried to delete region BSU07710-07820 from 
                <italic toggle="yes">B. subtilis</italic> which was lethal. In this region, six of the 11 OGCs are core but the five noncore genes are adjacent so perhaps region BSU07750-07790 could have been successfully deleted.</p>
            <p>Given that we believe pan-genome analysis is a useful complementary tool for biological engineers, it is important that the pan-genome analysis used be as accurate and helpful as possible. We showed by comparing with other recent pan-genome studies for 
                <italic toggle="yes">B. subtilis</italic> and 
                <italic toggle="yes">E. coli</italic> that our method is more accurate for determining core OGCs/regions as validated by coverage of essential genes. Further, we believe that the PGG is valuable for confirming when noncore OGCs may be compensated for with alternate homologous OGCs at the same relative genomic location performing the same function as we showed in 
                <xref ref-type="fig" rid="f3">Figure 3</xref>. The function of these noncore OGCs may be essential and should be considered appropriately.</p>
            <p>Pan-genome studies often capture the diversity of sequenced species but fail to compare gene lists to experimentally validated essential gene lists, or the results are confusing. Interestingly, in 
                <italic toggle="yes">Mycoplasma</italic>, fewer essential genes were determined with the pan-genome method compared with the laboratory experimental approach.
                <sup>
                    <xref ref-type="bibr" rid="ref54">54</xref>
                </sup> In 
                <italic toggle="yes">Pseudomonas</italic>, only one-third of the pan-genome single copy genes had overlap with the essential genes from experimentally reduced genomic studies.
                <sup>
                    <xref ref-type="bibr" rid="ref55">55</xref>
                </sup> We showed that the core OGCs/regions from our refined PGG encompass 91% and 95% of the 
                <italic toggle="yes">E. coli</italic> and 
                <italic toggle="yes">B. subtilis</italic> experimentally determined essential gene lists, respectively. Both model bacterial species 
                <italic toggle="yes">E. coli</italic> and 
                <italic toggle="yes">B. subtilis</italic> have had many genome reduction studies performed and reviewed elsewhere.
                <sup>
                    <xref ref-type="bibr" rid="ref56">56</xref>
                </sup>
            </p>
            <p>Experimental verification of the essentiality of computationally predicted core OGCs or regions requires that each strain of the pan-genome study be minimized. However, it is cost prohibitive to do knockout studies on all strains of a pan-genome. One must carefully choose a single genome as a representative of the entire pan-genome for the purpose of verifying the essentiality of core regions and/or the non-essentiality of noncore regions by experimental validation. However, given the diversity of most bacterial species it is unlikely that any one strain completely captures the capabilities of the species in all environmental conditions. Further, while there are clearly core OGCs/regions associated with viability for a species, other core regions probably contribute to a lesser degree to cell viability. For example, for the purpose of biological engineering, changes in these locations may reduce fitness by slowing cell growth.</p>
            <p>The use of a PGG for identifying core regions of a bacterium is an automatable, low-cost, rapid, and effective way to evaluate both Gram-negative and Gram-positive bacteria. This method complements and expands upon the experimental knockout approach by including environmental diversity as a measure of what regions and OGCs are conserved across the species. The approach also overcomes the limitations of knockout studies that are specific to the strains and growth conditions used.</p>
            <p>The 
                <italic toggle="yes">B. subtilis</italic> WTA region provides a cautionary note for relying entirely upon core regions to determine what is safe to remove. While most non-core regions involve cassettes of genes which are entirely absent from some strains such as phage regions, sometimes orthologous replacement possibly due to homologous recombination can have functionally equivalent genes appearing to be non-core. A closer examination of the PGG can determine if a region is simply missing from some strains versus being replaced in which case further study may be needed before removal of the region. Of course, in some cases the orthologous replacement does not need to occur at the same location in the genome but that was the case for all instances we examined in 
                <italic toggle="yes">B. subtilis.</italic>
            </p>
            <p>While we showed that almost all essential genes are core OGCs and most are OGCs at the 100% threshold, the exceptions are interesting. We discussed issues such as &#x201c;protective essential genes&#x201d;
                <sup>
                    <xref ref-type="bibr" rid="ref36">36</xref>
                </sup> (such as toxin/anti-toxin gene pairs) and more distant orthologs not captured in OGCs. We did not discuss genes which might be undergoing gene loss.
                <sup>
                    <xref ref-type="bibr" rid="ref57">57</xref>
                </sup> The PGG is well suited to looking at which subset of genomes have suffered a gene loss and possible mechanisms such as gene replacement. The PGG has been used to show which genomic regions tend not to allow insertions of horizontally transferred genes
                <sup>
                    <xref ref-type="bibr" rid="ref19">19</xref>
                </sup> and where metabolic cassettes can be swapped.
                <sup>
                    <xref ref-type="bibr" rid="ref53">53</xref>
                </sup>
            </p>
        </sec>
    </body>
    <back>
        <ack>
            <title>Acknowledgements</title>
            <p>The authors would like to thank IARPA for sponsoring this research and would like to thank Derren Barken for his assistance in table generation.</p>
        </ack>
        <sec id="sec6">
            <title>Data availability</title>
            <sec id="sec7">
                <title>Underlying data</title>
                <p>Figshare: Underlying data for &#x2018;A pan-genome method to determine core regions of the 
                    <italic toggle="yes">Bacillus subtilis</italic> and 
                    <italic toggle="yes">Escherichia coli</italic> genomes&#x2019;, 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/m9.figshare.15129636.v1">https://doi.org/10.6084/m9.figshare.15129636.v1</ext-link>.
                    <sup>
                        <xref ref-type="bibr" rid="ref58">58</xref>
                    </sup>
                </p>
                <p>This project contains the following underlying data:
                    <list list-type="bullet">
                        <list-item>
                            <label>&#x2022;</label>
                            <p>
                                <xref ref-type="table" rid="T1">Table 1</xref>. Selection of complete genomes for 
                                <italic toggle="yes">B. subtilis</italic> and 
                                <italic toggle="yes">E. coli</italic> PGGs.
                            </p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>
                                <xref ref-type="table" rid="T2">Table 2</xref>. Pan-genome graph statistics for 
                                <italic toggle="yes">B. subtilis</italic> and 
                                <italic toggle="yes">E. coli</italic>.
                            </p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>
                                <xref ref-type="table" rid="T3">Table 3</xref>. The number of deleted genes from 
                                <italic toggle="yes">B. subtilis</italic> reduced strains which are noncore versus core.</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>
                                <xref ref-type="table" rid="T4">Table 4</xref>. Large noncore regions which have not been deleted from any of the strains delta 6, IIG-Bs27-47-24, or PG10, PS38.</p>
                        </list-item>
                    </list>
                </p>
                <p>Data are available under the terms of the 
                    <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International license</ext-link> (CC BY 4.0).</p>
            </sec>
            <sec id="sec">
                <title>Extended data</title>
                <p>Figshare: Extended data for &#x2018;A pan-genome method to determine core regions of the 
                    <italic toggle="yes">Bacillus subtilis</italic> and 
                    <italic toggle="yes">Escherichia coli</italic> genomes&#x2019;, 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6084/m9.figshare.15129636.v1">https://doi.org/10.6084/m9.figshare.15129636.v1</ext-link>.
                    <sup>
                        <xref ref-type="bibr" rid="ref58">58</xref>
                    </sup>
                </p>
                <p>This project contains the following extended data:
                    <list list-type="bullet">
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 1</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 2</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 3</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 4</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 5</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 6</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 7</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 8</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 9</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Supplementary Table 10</p>
                        </list-item>
                    </list>
                </p>
                <p>Data are available under the terms of the 
                    <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International license</ext-link> (CC BY 4.0).</p>
            </sec>
        </sec>
        <ref-list>
            <title>References</title>
            <ref id="ref1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hutchison</surname>
                            <given-names>CA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Chuang</surname>
                            <given-names>RY</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Noskov</surname>
                            <given-names>VN</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Design and synthesis of a minimal bacterial genome.</article-title>
                    <source>

                        <italic toggle="yes">Science.</italic>
</source>
                    <year>2016</year>;<volume>351</volume>:<fpage>aad6253</fpage>.
                    <pub-id pub-id-type="pmid">27013737</pub-id>
                    <pub-id pub-id-type="doi">10.1126/science.aad6253</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Reu&#x00df;</surname>
                            <given-names>DR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Altenbuchner</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>M&#x00e4;der</surname>
                            <given-names>U</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Large-scale reduction of the 
                        <italic toggle="yes">Bacillus subtilis</italic> genome: consequences for the transcriptional network, resource allocation, and metabolism.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2017</year>;<volume>27</volume>(<issue>2</issue>):<fpage>289</fpage>&#x2013;<lpage>299</lpage>.
                    <pub-id pub-id-type="pmid">27965289</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.215293.116</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5287234</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Reu&#x00df;</surname>
                            <given-names>DR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Commichau</surname>
                            <given-names>FM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gundlach</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The Blueprint of a Minimal Cell: MiniBacillus.</article-title>
                    <source>

                        <italic toggle="yes">Microbiol Mol Biol Rev.</italic>
</source>
                    <year>2016</year>;<volume>80</volume>(<issue>4</issue>):<fpage>955</fpage>&#x2013;<lpage>987</lpage>.
                    <pub-id pub-id-type="pmid">27681641</pub-id>
                    <pub-id pub-id-type="doi">10.1128/MMBR.00029-16</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5116877</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Juhas</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Reu&#x00df;</surname>
                            <given-names>DR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhu</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>
                        <italic toggle="yes">Bacillus subtilis</italic> and 
                        <italic toggle="yes">Escherichia coli</italic> essential genes and minimal cell factories after one decade of genome engineering.</article-title>
                    <source>

                        <italic toggle="yes">Microbiology.</italic>
</source>
                    <year>2014</year>;<volume>160</volume>(<issue>11</issue>):<fpage>2341</fpage>&#x2013;<lpage>2351</lpage>.
                    <pub-id pub-id-type="pmid">25092907</pub-id>
                    <pub-id pub-id-type="doi">10.1099/mic.0.079376-0</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kolisnychenko</surname>
                            <given-names>V</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Plunkett</surname>
                            <given-names>G</given-names>
                            <suffix>3rd</suffix>
                        </name>

                        <name name-style="western">
                            <surname>Herring</surname>
                            <given-names>CD</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Engineering a reduced 
                        <italic toggle="yes">Escherichia coli</italic> genome.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2002</year>;<volume>12</volume>(<issue>4</issue>):<fpage>640</fpage>&#x2013;<lpage>647</lpage>.
                    <pub-id pub-id-type="pmid">11932248</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.217202</pub-id>
                    <pub-id pub-id-type="pmcid">PMC187512</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Maranas</surname>
                            <given-names>CD</given-names>
                        </name>
</person-group>:
                    <article-title>MinGenome: An 
                        <italic toggle="yes">In Silico</italic> Top-Down Approach for the Synthesis of Minimized Genomes.</article-title>
                    <source>

                        <italic toggle="yes">ACS Synth Biol.</italic>
</source>
                    <year>2018</year>;<volume>7</volume>(<issue>2</issue>):<fpage>462</fpage>&#x2013;<lpage>473</lpage>.
                    <pub-id pub-id-type="pmid">29254336</pub-id>
                    <pub-id pub-id-type="doi">10.1021/acssynbio.7b00296</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kobayashi</surname>
                            <given-names>K</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ehrlich</surname>
                            <given-names>SD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Albertini</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Essential 
                        <italic toggle="yes">Bacillus subtilis</italic> genes.</article-title>
                    <source>

                        <italic toggle="yes">Proc Natl Acad Sci U S A.</italic>
</source>
                    <year>2003</year>;<volume>100</volume>:<fpage>4678</fpage>&#x2013;<lpage>4683</lpage>.
                    <pub-id pub-id-type="pmid">12682299</pub-id>
                    <pub-id pub-id-type="doi">10.1073/pnas.0730515100</pub-id>
                    <pub-id pub-id-type="pmcid">PMC153615</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Koo</surname>
                            <given-names>BM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kritikos</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Farelli</surname>
                            <given-names>JD</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Construction and Analysis of Two Genome-Scale Deletion Libraries for 
                        <italic toggle="yes">Bacillus subtilis.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">Cell Syst.</italic>
</source>
                    <year>2017</year>;<volume>4</volume>:<fpage>291</fpage>&#x2013;<lpage>305</lpage>.
                    <pub-id pub-id-type="pmid">28189581</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.cels.2016.12.013</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5400513</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Goodall</surname>
                            <given-names>ECA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Robinson</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Johnston</surname>
                            <given-names>IG</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The Essential Genome of 
                        <italic toggle="yes">Escherichia coli</italic> K-12.</article-title>
                    <source>

                        <italic toggle="yes">mBio.</italic>
</source>
                    <year>2018</year>;<volume>9</volume>(<issue>1</issue>):<fpage>e02096</fpage>&#x2013;<lpage>17</lpage>.
                    <pub-id pub-id-type="pmid">29463657</pub-id>
                    <pub-id pub-id-type="doi">10.1128/mBio.02096-17</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5821084</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Baba</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ara</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hasegawa</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Construction of 
                        <italic toggle="yes">Escherichia coli</italic> K-12 in-frame, single-gene knockout mutants: the Keio collection.</article-title>
                    <source>

                        <italic toggle="yes">Mol Syst Biol.</italic>
</source>
                    <year>2006</year>;<volume>2</volume>:<fpage>2006.0008</fpage>.
                    <pub-id pub-id-type="pmid">16738554</pub-id>
                    <pub-id pub-id-type="doi">10.1038/msb4100050</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1681482</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Yamazaki</surname>
                            <given-names>Y</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Niki</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kato</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Profiling of 
                        <italic toggle="yes">Escherichia coli</italic> Chromosome database.</article-title>
                    <source>

                        <italic toggle="yes">Methods Mol Biol.</italic>
</source>
                    <year>2008</year>;<volume>416</volume>:<fpage>385</fpage>&#x2013;<lpage>389</lpage>.
                    <pub-id pub-id-type="pmid">18392982</pub-id>
                    <pub-id pub-id-type="doi">10.1007/978-1-59745-321-9_26</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Westers</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Dorenbos</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>van Dijl</surname>
                            <given-names>JM</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Genome engineering reveals large dispensable regions in 
                        <italic toggle="yes">Bacillus subtilis.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">Mol Biol Evol.</italic>
</source>
                    <year>2003</year>;<volume>20</volume>:<fpage>2076</fpage>&#x2013;<lpage>2090</lpage>.
                    <pub-id pub-id-type="pmid">12949151</pub-id>
                    <pub-id pub-id-type="doi">10.1093/molbev/msg219</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wenzel</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Altenbuchner</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Development of a markerless gene deletion system for 
                        <italic toggle="yes">Bacillus subtilis</italic> based on the mannose phosphoenolpyruvate-dependent phosphotransferase system.</article-title>
                    <source>

                        <italic toggle="yes">Microbiology.</italic>
</source>
                    <year>2015</year>;<volume>161</volume>(<issue>10</issue>):<fpage>1942</fpage>&#x2013;<lpage>1949</lpage>.
                    <pub-id pub-id-type="pmid">26238998</pub-id>
                    <pub-id pub-id-type="doi">10.1099/mic.0.000150</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Umenhoffer</surname>
                            <given-names>K</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Feh&#x00e9;r</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Balik&#x00f3;</surname>
                            <given-names>G</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Reduced evolvability of <italic toggle="yes">Escherichia coli</italic> MDS42, an IS-less cellular chassis for molecular and synthetic biology applications.</article-title>
                    <source>

                        <italic toggle="yes">Microb Cell Fact.</italic>
</source>
                    <year>2010</year>;<volume>9</volume>:<fpage>38</fpage>.
                    <pub-id pub-id-type="pmid">20492662</pub-id>
                    <pub-id pub-id-type="doi">10.1186/1475-2859-9-38</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2891674</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Cs&#x00f6;rgo</surname>
                            <given-names>B</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Feh&#x00e9;r</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>T&#x00ed;m&#x00e1;r</surname>
                            <given-names>E</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Low-mutation-rate, reduced-genome 
                        <italic toggle="yes">Escherichia coli</italic>: an improved host for faithful maintenance of engineered genetic constructs.</article-title>
                    <source>

                        <italic toggle="yes">Microb Cell Fact.</italic>
</source>
                    <year>2012</year>;<volume>11</volume>:<fpage>11</fpage>.
                    <pub-id pub-id-type="pmid">22264280</pub-id>
                    <pub-id pub-id-type="doi">10.1186/1475-2859-11-11</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3280934</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Tettelin</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Masignani</surname>
                            <given-names>V</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cieslewicz</surname>
                            <given-names>MJ</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Genome analysis of multiple pathogenic isolates of 
                        <italic toggle="yes">Streptococcus agalactiae</italic>: implications for the microbial &#x201c;pan-genome&#x201d;.</article-title>
                    <source>

                        <italic toggle="yes">Proc Natl Acad Sci U S A.</italic>
</source>
                    <year>2005</year>;<volume>102</volume>(<issue>39</issue>):<fpage>13950</fpage>&#x2013;<lpage>13955</lpage>. [published correction appears in Proc Natl Acad Sci U S A. 2005 Nov 8;102(45):16530].
                    <pub-id pub-id-type="pmid">16172379</pub-id>
                    <pub-id pub-id-type="doi">10.1073/pnas.0506758102</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1216834</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Remm</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Storm</surname>
                            <given-names>CE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sonnhammer</surname>
                            <given-names>EL</given-names>
                        </name>
</person-group>:
                    <article-title>Automatic clustering of orthologs and in-paralogs from pairwise species comparisons.</article-title>
                    <source>

                        <italic toggle="yes">J Mol Biol.</italic>
</source>
                    <year>2001</year>;<volume>314</volume>(<issue>5</issue>):<fpage>1041</fpage>&#x2013;<lpage>1052</lpage>.
                    <pub-id pub-id-type="pmid">11743721</pub-id>
                    <pub-id pub-id-type="doi">10.1006/jmbi.2000.5197</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Stoeckert</surname>
                            <given-names>CJ</given-names>
                            <suffix>Jr</suffix>
                        </name>

                        <name name-style="western">
                            <surname>Roos</surname>
                            <given-names>DS</given-names>
                        </name>
</person-group>:
                    <article-title>OrthoMCL: Identification of ortholog groups for eukaryotic genomes.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2003</year>;<volume>13</volume>(<issue>9</issue>):<fpage>2178</fpage>&#x2013;<lpage>2189</lpage>.
                    <pub-id pub-id-type="pmid">12952885</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.1224503</pub-id>
                    <pub-id pub-id-type="pmcid">PMC403725</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Chan</surname>
                            <given-names>AP</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sutton</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>DePew</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A novel method of consensus pan-chromosome assembly and large-scale comparative analysis reveal the highly flexible pan-genome of 
                        <italic toggle="yes">Acinetobacter baumannii.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">Genome Biol.</italic>
</source>
                    <year>2015</year>;<volume>16</volume>:<fpage>143</fpage>.
                    <pub-id pub-id-type="pmid">26195261</pub-id>
                    <pub-id pub-id-type="doi">10.1186/s13059-015-0701-6</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4507327</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Tatusov</surname>
                            <given-names>RL</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Galperin</surname>
                            <given-names>MY</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Natale</surname>
                            <given-names>DA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The COG database: a tool for genome-scale analysis of protein functions and evolution.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2000</year>;<volume>28</volume>(<issue>1</issue>):<fpage>33</fpage>&#x2013;<lpage>36</lpage>.
                    <pub-id pub-id-type="pmid">10592175</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/28.1.33</pub-id>
                    <pub-id pub-id-type="pmcid">PMC102395</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gil</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Silva</surname>
                            <given-names>FJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Peret&#x00f3;</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Determination of the core of a minimal bacterial gene set.</article-title>
                    <source>

                        <italic toggle="yes">Microbiol Mol Biol Rev.</italic>
</source>
                    <year>2004</year>;<volume>68</volume>(<issue>3</issue>):<fpage>518</fpage>&#x2013;<lpage>537</lpage>.
                    <pub-id pub-id-type="pmid">15353568</pub-id>
                    <pub-id pub-id-type="doi">10.1128/MMBR.68.3.518-537.2004</pub-id>
                    <pub-id pub-id-type="pmcid">PMC515251</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Jordan</surname>
                            <given-names>IK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rogozin</surname>
                            <given-names>IB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wolf</surname>
                            <given-names>YI</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Essential genes are more evolutionarily conserved than are nonessential genes in bacteria.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2002</year>;<volume>12</volume>(<issue>6</issue>):<fpage>962</fpage>&#x2013;<lpage>968</lpage>.
                    <pub-id pub-id-type="pmid">12045149</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.87702</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1383730</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Podell</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gaasterland</surname>
                            <given-names>T</given-names>
                        </name>
</person-group>:
                    <article-title>DarkHorse: a method for genome-wide prediction of horizontal gene transfer.</article-title>
                    <source>

                        <italic toggle="yes">Genome Biol.</italic>
</source>
                    <year>2007</year>;<volume>8</volume>(<issue>2</issue>):<fpage>R16</fpage>.
                    <pub-id pub-id-type="pmid">17274820</pub-id>
                    <pub-id pub-id-type="doi">10.1186/gb-2007-8-2-r16</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1852411</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Koonin</surname>
                            <given-names>EV</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Makarova</surname>
                            <given-names>KS</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Aravind</surname>
                            <given-names>L</given-names>
                        </name>
</person-group>:
                    <article-title>Horizontal gene transfer in prokaryotes: quantification and classification.</article-title>
                    <source>

                        <italic toggle="yes">Annu Rev Microbiol.</italic>
</source>
                    <year>2001</year>;<volume>55</volume>:<fpage>709</fpage>&#x2013;<lpage>742</lpage>.
                    <pub-id pub-id-type="pmid">11544372</pub-id>
                    <pub-id pub-id-type="doi">10.1146/annurev.micro.55.1.709</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4781227</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Fouts</surname>
                            <given-names>DE</given-names>
                        </name>
</person-group>:
                    <article-title>Phage_Finder: automated identification and classification of prophage regions in complete bacterial genome sequences.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2006</year>;<volume>34</volume>(<issue>20</issue>):<fpage>5839</fpage>&#x2013;<lpage>5851</lpage>.
                    <pub-id pub-id-type="pmid">17062630</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkl732</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1635311</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Page</surname>
                            <given-names>AJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cummins</surname>
                            <given-names>CA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hunt</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Roary: rapid large-scale prokaryote pan genome analysis.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2015</year>;<volume>31</volume>(<issue>22</issue>):<fpage>3691</fpage>&#x2013;<lpage>3693</lpage>.
                    <pub-id pub-id-type="pmid">26198102</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/btv421</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4817141</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <mixed-citation publication-type="book">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vernikos</surname>
                            <given-names>GS</given-names>
                        </name>
</person-group>:
                    <chapter-title>A Review of Pangenome Tools and Recent Studies</chapter-title>. In:
                    <person-group person-group-type="editor">

                        <name name-style="western">
                            <surname>Tettelin</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Medini</surname>
                            <given-names>D</given-names>
                        </name>
</person-group>, eds.
                    <source>

                        <italic toggle="yes">The Pangenome: Diversity, Dynamics and Evolution of Genomes.</italic>
</source>
                    <publisher-loc>Cham (CH)</publisher-loc>:
                    <publisher-name>Springer</publisher-name>;<year>2020</year>:<fpage>89</fpage>&#x2013;<lpage>112</lpage>.
                    <pub-id pub-id-type="pmid">32633917</pub-id>
                    <pub-id pub-id-type="doi">10.1007/978-3-030-38281-0_4</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Clarke</surname>
                            <given-names>TH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Brinkac</surname>
                            <given-names>LM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sutton</surname>
                            <given-names>G</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>GGRaSP: a R-package for selecting representative genomes using Gaussian mixture models.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2018</year>;<volume>34</volume>:<fpage>3032</fpage>&#x2013;<lpage>3034</lpage>.
                    <pub-id pub-id-type="pmid">29668840</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/bty300</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6129299</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref29">
                <label>29</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Inman</surname>
                            <given-names>JM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sutton</surname>
                            <given-names>GG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Beck</surname>
                            <given-names>E</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Large-scale comparative analysis of microbial pan-genomes using PanOCT.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2019</year>;<volume>35</volume>:<fpage>1049</fpage>&#x2013;<lpage>1050</lpage>.
                    <pub-id pub-id-type="pmid">30165579</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/bty744</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6419995</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref30">
                <label>30</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Fouts</surname>
                            <given-names>DE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Brinkac</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Beck</surname>
                            <given-names>E</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>PanOCT: automated clustering of orthologs using conserved gene neighborhood for pan-genomic analysis of bacterial strains and closely related species.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2012</year>;<volume>40</volume>:<fpage>e172</fpage>.
                    <pub-id pub-id-type="pmid">22904089</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gks757</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3526259</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref31">
                <label>31</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>O'Leary</surname>
                            <given-names>NA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wright</surname>
                            <given-names>MW</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Brister</surname>
                            <given-names>JR</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2016</year>;<volume>44</volume>:<fpage>D733</fpage>&#x2013;<lpage>D745</lpage>.
                    <pub-id pub-id-type="pmid">26553804</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkv1189</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4702849</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref32">
                <label>32</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lan</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Reeves</surname>
                            <given-names>PR</given-names>
                        </name>
</person-group>:
                    <article-title>
                        <italic toggle="yes">Escherichia coli</italic> in disguise: molecular origins of 
                        <italic toggle="yes">Shigella.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">Microbes Infect.</italic>
</source>
                    <year>2002</year>;<volume>4</volume>:<fpage>1125</fpage>&#x2013;<lpage>1132</lpage>.
                    <pub-id pub-id-type="pmid">12361912</pub-id>
                    <pub-id pub-id-type="doi">10.1016/s1286-4579(02)01637-4</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref33">
                <label>33</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Meier-Kolthoff</surname>
                            <given-names>JP</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hahnke</surname>
                            <given-names>RL</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Petersen</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Complete genome sequence of DSM 30083(T), the type strain (U5/41(T)) of 
                        <italic toggle="yes">Escherichia coli</italic>, and a proposal for delineating subspecies in microbial taxonomy.</article-title>
                    <source>

                        <italic toggle="yes">Stand Genomic Sci.</italic>
</source>
                    <year>2014</year>;<volume>9</volume>:<fpage>2</fpage>.
                    <pub-id pub-id-type="pmid">25780495</pub-id>
                    <pub-id pub-id-type="doi">10.1186/1944-3277-9-2</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4334874</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref34">
                <label>34</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ondov</surname>
                            <given-names>BD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Treangen</surname>
                            <given-names>TJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Melsted</surname>
                            <given-names>P</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Mash: fast genome and metagenome distance estimation using MinHash.</article-title>
                    <source>

                        <italic toggle="yes">Genome Biol.</italic>
</source>
                    <year>2016</year>;<volume>17</volume>:<fpage>132</fpage>.
                    <pub-id pub-id-type="pmid">27323842</pub-id>
                    <pub-id pub-id-type="doi">10.1186/s13059-016-0997-x</pub-id>
                    <pub-id pub-id-type="pmcid">PMC4915045</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref35">
                <label>35</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Needleman</surname>
                            <given-names>SB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wunsch</surname>
                            <given-names>CD</given-names>
                        </name>
</person-group>:
                    <article-title>A general method applicable to the search for similarities in the amino acid sequence of two proteins.</article-title>
                    <source>

                        <italic toggle="yes">J Mol Biol.</italic>
</source>
                    <year>1970</year>;<volume>48</volume>:<fpage>443</fpage>&#x2013;<lpage>453</lpage>.
                    <pub-id pub-id-type="pmid">5420325</pub-id>
                    <pub-id pub-id-type="doi">10.1016/0022-2836(70)90057-4</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref36">
                <label>36</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Commichau</surname>
                            <given-names>FM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pietack</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>St&#x00fc;lke</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Essential genes in 
                        <italic toggle="yes">Bacillus subtilis</italic>: a re-evaluation after ten years.</article-title>
                    <source>

                        <italic toggle="yes">Mol Biosyst.</italic>
</source>
                    <year>2013</year>;<volume>9</volume>(<issue>6</issue>):<fpage>1068</fpage>&#x2013;<lpage>1075</lpage>.
                    <pub-id pub-id-type="pmid">23420519</pub-id>
                    <pub-id pub-id-type="doi">10.1039/c3mb25595f</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref37">
                <label>37</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Koskiniemi</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lamoureux</surname>
                            <given-names>JG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nikolakakis</surname>
                            <given-names>KC</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Rhs proteins from diverse bacteria mediate intercellular competition.</article-title>
                    <source>

                        <italic toggle="yes">Proc Natl Acad Sci U S A.</italic>
</source>
                    <year>2013</year>;<volume>110</volume>:<fpage>7032</fpage>&#x2013;<lpage>7037</lpage>.
                    <pub-id pub-id-type="pmid">23572593</pub-id>
                    <pub-id pub-id-type="doi">10.1073/pnas.1300627110</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3637788</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref38">
                <label>38</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Holberger</surname>
                            <given-names>LE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Garza-S&#x00e1;nchez</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lamoureux</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A novel family of toxin/antitoxin proteins in 
                        <italic toggle="yes">Bacillus</italic> species.</article-title>
                    <source>

                        <italic toggle="yes">FEBS Lett.</italic>
</source>
                    <year>2012</year>;<volume>586</volume>(<issue>2</issue>):<fpage>132</fpage>&#x2013;<lpage>136</lpage>.
                    <pub-id pub-id-type="pmid">22200572</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.febslet.2011.12.020</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3259279</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref39">
                <label>39</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Brantl</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>M&#x00fc;ller</surname>
                            <given-names>P</given-names>
                        </name>
</person-group>:
                    <article-title>Toxin-Antitoxin Systems in 
                        <italic toggle="yes">Bacillus subtilis.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">Toxins.</italic>
</source>
                    <year>2019</year>;<volume>11</volume>:<fpage>E262</fpage>.
                    <pub-id pub-id-type="pmid">31075979</pub-id>
                    <pub-id pub-id-type="doi">10.3390/toxins11050262</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6562991</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref40">
                <label>40</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ohshima</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Matsuoka</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Asai</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Molecular organization of intrinsic restriction and modification genes 
                        <italic toggle="yes">BsuM</italic> of 
                        <italic toggle="yes">Bacillus subtilis</italic> Marburg.</article-title>
                    <source>

                        <italic toggle="yes">J Bacteriol.</italic>
</source>
                    <year>2002</year>;<volume>184</volume>:<fpage>381</fpage>&#x2013;<lpage>389</lpage>.
                    <pub-id pub-id-type="pmid">11751814</pub-id>
                    <pub-id pub-id-type="doi">10.1128/jb.184.2.381-389.2002</pub-id>
                    <pub-id pub-id-type="pmcid">PMC139560</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref41">
                <label>41</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Brown</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Santa Maria</surname>
                            <given-names>JP</given-names>
                            <suffix>Jr</suffix>
                        </name>

                        <name name-style="western">
                            <surname>Walker</surname>
                            <given-names>S</given-names>
                        </name>
</person-group>:
                    <article-title>Wall teichoic acids of gram-positive bacteria.</article-title>
                    <source>

                        <italic toggle="yes">Annu Rev Microbiol.</italic>
</source>
                    <year>2013</year>;<volume>67</volume>:<fpage>313</fpage>&#x2013;<lpage>336</lpage>.
                    <pub-id pub-id-type="pmid">24024634</pub-id>
                    <pub-id pub-id-type="doi">10.1146/annurev-micro-092412-155620</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3883102</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref42">
                <label>42</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>D'Elia</surname>
                            <given-names>MA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Millar</surname>
                            <given-names>KE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Beveridge</surname>
                            <given-names>TJ</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Wall teichoic acid polymers are dispensable for cell viability in 
                        <italic toggle="yes">Bacillus subtilis.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">J Bacteriol.</italic>
</source>
                    <year>2006</year>;<volume>188</volume>:<fpage>8313</fpage>&#x2013;<lpage>8316</lpage>.
                    <pub-id pub-id-type="pmid">17012386</pub-id>
                    <pub-id pub-id-type="doi">10.1128/JB.01336-06</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1698200</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref43">
                <label>43</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Henriques</surname>
                            <given-names>AO</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Glaser</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Piggot</surname>
                            <given-names>PJ</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Control of cell shape and elongation by the 
                        <italic toggle="yes">rodA</italic> gene in 
                        <italic toggle="yes">Bacillus subtilis.</italic>
                    </article-title>
                    <source>

                        <italic toggle="yes">Mol Microbiol.</italic>
</source>
                    <year>1998</year>;<volume>28</volume>:<fpage>235</fpage>&#x2013;<lpage>247</lpage>.
                    <pub-id pub-id-type="pmid">9622350</pub-id>
                    <pub-id pub-id-type="doi">10.1046/j.1365-2958.1998.00766.x</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref44">
                <label>44</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lazarevic</surname>
                            <given-names>V</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Abellan</surname>
                            <given-names>F-X</given-names>
                        </name>

                        <name name-style="western">
                            <surname>M&#x00f6;ller</surname>
                            <given-names>SB</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Comparison of ribitol and glycerol teichoic acid genes in 
                        <italic toggle="yes">Bacillus subtilis</italic> W23 and 168: Identical function, similar divergent organization, but different regulation.</article-title>
                    <source>

                        <italic toggle="yes">Microbiology.</italic>
</source>
                    <year>2002</year>;<volume>148</volume>:<fpage>815</fpage>&#x2013;<lpage>824</lpage>.
                    <pub-id pub-id-type="pmid">11882717</pub-id>
                    <pub-id pub-id-type="doi">10.1099/00221287-148-3-815</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref45">
                <label>45</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ahn</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Jun</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ro</surname>
                            <given-names>H-J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Complete genome of 
                        <italic toggle="yes">Bacillus subtilis</italic> subsp. 
                        <italic toggle="yes">subtilis</italic> KCTC 3135
                        <sup>T</sup> and variation in cell wall genes of 
                        <italic toggle="yes">B. subtilis</italic> strains.</article-title>
                    <source>

                        <italic toggle="yes">J Microbiol Biotechnol.</italic>
</source>
                    <year>2018</year>;<volume>28</volume>:<fpage>1760</fpage>&#x2013;<lpage>1768</lpage>.
                    <pub-id pub-id-type="pmid">30196596</pub-id>
                    <pub-id pub-id-type="doi">10.4014/jmb.1712.12006</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref46">
                <label>46</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Sutton</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Fogel</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Abramson</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Horizontal transfer and evolution of wall teichoic acid gene cassettes in 
                        <italic toggle="yes">Bacillus subtilis</italic> [version 1; peer review: awaiting peer review].</article-title>
                    <source>

                        <italic toggle="yes">F1000Res.</italic>
</source>
                    <year>2021</year>.
                    <pub-id pub-id-type="doi">10.12688/f1000research.51874.1</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref47">
                <label>47</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wu</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gao</surname>
                            <given-names>F</given-names>
                        </name>
</person-group>:
                    <article-title>Toward a high-quality pan-genome landscape of Bacillus subtilis by removal of confounding strains.</article-title>
                    <source>

                        <italic toggle="yes">Brief Bioinform.</italic>
</source>
                    <year>2020</year>;<fpage>bbaa013</fpage>.
                    <pub-id pub-id-type="pmid">32065216</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bib/bbaa013</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref48">
                <label>48</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Bindal</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Krishnamurthi</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Seshasayee</surname>
                            <given-names>ASN</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>CRISPR-Cas-mediated gene silencing reveals RacR to be a negative regulator of YdaS and YdaT toxins in 
                        <italic toggle="yes">Escherichia coli</italic> K-12.</article-title>
                    <source>

                        <italic toggle="yes">mSphere.</italic>
</source>
                    <year>2017</year>;<volume>2</volume>:<elocation-id>e00483-17</elocation-id>.
                    <pub-id pub-id-type="pmid">29205229</pub-id>
                    <pub-id pub-id-type="doi">10.1128/mSphere.00483-17</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5700377</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref49">
                <label>49</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kato</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hashimoto</surname>
                            <given-names>M</given-names>
                        </name>
</person-group>:
                    <article-title>Construction of consecutive deletions of the 
                        <italic toggle="yes">Escherichia coli</italic> chromosome.</article-title>
                    <source>

                        <italic toggle="yes">Mol Syst Biol.</italic>
</source>
                    <year>2007</year>;<volume>3</volume>:<fpage>132</fpage>.
                    <pub-id pub-id-type="pmid">17700540</pub-id>
                    <pub-id pub-id-type="doi">10.1038/msb4100174</pub-id>
                    <pub-id pub-id-type="pmcid">PMC1964801</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref50">
                <label>50</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Yang</surname>
                            <given-names>ZK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Luo</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhang</surname>
                            <given-names>Y</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Pan-genomic analysis provides novel insights into the association of 
                        <italic toggle="yes">E. coli</italic> with human host and its minimal genome.</article-title>
                    <source>

                        <italic toggle="yes">Bioinformatics.</italic>
</source>
                    <year>2019</year>;<volume>35</volume>(<issue>12</issue>):<fpage>1987</fpage>&#x2013;<lpage>1991</lpage>.
                    <pub-id pub-id-type="pmid">30418478</pub-id>
                    <pub-id pub-id-type="doi">10.1093/bioinformatics/bty938</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref51">
                <label>51</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Luo</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lin</surname>
                            <given-names>Y</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gao</surname>
                            <given-names>F</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>DEG 10, an update of the database of essential genes that includes both protein-coding genes and noncoding genomic elements.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2014</year>;<volume>42</volume>(<issue>Database issue</issue>):<fpage>D574</fpage>&#x2013;<lpage>D580</lpage>.
                    <pub-id pub-id-type="pmid">24243843</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkt1131</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3965060</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref52">
                <label>52</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gerdes</surname>
                            <given-names>SY</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Scholle</surname>
                            <given-names>MD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Campbell</surname>
                            <given-names>JW</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Experimental determination and system level analysis of essential genes in Escherichia coli MG1655.</article-title>
                    <source>

                        <italic toggle="yes">J Bacteriol.</italic>
</source>
                    <year>2003</year>;<volume>185</volume>(<issue>19</issue>):<fpage>5673</fpage>&#x2013;<lpage>5684</lpage>.
                    <pub-id pub-id-type="pmid">13129938</pub-id>
                    <pub-id pub-id-type="doi">10.1128/jb.185.19.5673-5684.2003</pub-id>
                    <pub-id pub-id-type="pmcid">PMC193955</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref53">
                <label>53</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Chavda</surname>
                            <given-names>KD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Chen</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Fouts</surname>
                            <given-names>DE</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Comprehensive Genome Analysis of Carbapenemase-Producing Enterobacter spp.: New Insights into Phylogeny, Population Structure, and Resistance Mechanisms.</article-title>
                    <source>

                        <italic toggle="yes">mBio.</italic>
</source>
                    <year>2016</year>;<volume>7</volume>(<issue>6</issue>):<elocation-id>e02093-16</elocation-id>.
                    <pub-id pub-id-type="pmid">27965456</pub-id>
                    <pub-id pub-id-type="doi">10.1128/mBio.02093-16</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5156309</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref54">
                <label>54</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>W</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Fang</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Comparative genomics of Mycoplasma: analysis of conserved essential genes and diversity of the pan-genome.</article-title>
                    <source>

                        <italic toggle="yes">PLoS One.</italic>
</source>
                    <year>2012</year>;<volume>7</volume>(<issue>4</issue>):<fpage>e35698</fpage>.
                    <pub-id pub-id-type="pmid">22536428</pub-id>
                    <pub-id pub-id-type="doi">10.1371/journal.pone.0035698</pub-id>
                    <pub-id pub-id-type="pmcid">PMC3335003</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref55">
                <label>55</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Koehorst</surname>
                            <given-names>JJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>van Dam</surname>
                            <given-names>JC</given-names>
                        </name>

                        <name name-style="western">
                            <surname>van Heck</surname>
                            <given-names>RG</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Comparison of 432 
                        <italic toggle="yes">Pseudomonas</italic> strains through integration of genomic, functional, metabolic and expression data.</article-title>
                    <source>

                        <italic toggle="yes">Sci Rep.</italic>
</source>
                    <year>2016</year>;<volume>6</volume>:<fpage>38699</fpage>.
                    <pub-id pub-id-type="pmid">27922098</pub-id>
                    <pub-id pub-id-type="doi">10.1038/srep38699</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5138606</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref56">
                <label>56</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Juhas</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Reu&#x00df;</surname>
                            <given-names>DR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhu</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>
                        <italic toggle="yes">Bacillus subtilis</italic> and 
                        <italic toggle="yes">Escherichia coli</italic> essential genes and minimal cell factories after one decade of genome engineering.</article-title>
                    <source>

                        <italic toggle="yes">Microbiology.</italic>
</source>
                    <year>2014</year>;<volume>160</volume>(<issue>Pt 11</issue>):<fpage>2341</fpage>&#x2013;<lpage>2351</lpage>.
                    <pub-id pub-id-type="pmid">25092907</pub-id>
                    <pub-id pub-id-type="doi">10.1099/mic.0.079376-0</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref57">
                <label>57</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kunin</surname>
                            <given-names>V</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ouzounis</surname>
                            <given-names>CA</given-names>
                        </name>
</person-group>:
                    <article-title>The balance of driving forces during genome evolution in prokaryotes.</article-title>
                    <source>

                        <italic toggle="yes">Genome Res.</italic>
</source>
                    <year>2003</year>;<volume>13</volume>(<issue>7</issue>):<fpage>1589</fpage>&#x2013;<lpage>1594</lpage>.
                    <pub-id pub-id-type="pmid">12840037</pub-id>
                    <pub-id pub-id-type="doi">10.1101/gr.1092603</pub-id>
                    <pub-id pub-id-type="pmcid">PMC403731</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref58">
                <label>58</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Sutton</surname>
                            <given-names>G</given-names>
                        </name>
</person-group>:
                    <bold>PGG Core Genes - Tables F1000 version 2.xlsx.</bold>
                    <italic toggle="yes">figshare</italic>.
                    <italic toggle="yes">Dataset</italic>.<year>2021</year>.
                    <pub-id pub-id-type="doi">10.6084/m9.figshare.15129636.v1</pub-id>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report84280">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.55083.r84280</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Ouzounis</surname>
                        <given-names>Christos</given-names>
                    </name>
                    <xref ref-type="aff" rid="r84280a1">1</xref>
                    <xref ref-type="aff" rid="r84280a2">2</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-0086-8657</uri>
                </contrib>
                <aff id="r84280a1">
                    <label>1</label>Department of Computer Science, Aristotle University of Thessaloniki, Thessalonica, Greece</aff>
                <aff id="r84280a2">
                    <label>2</label>Centre for Research &amp; Technology Hellas, Thessalonica, Greece</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>26</day>
                <month>5</month>
                <year>2021</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2021 Ouzounis C</copyright-statement>
                <copyright-year>2021</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport84280" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.51873.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>This extensive, complex report provides details about a new methodological approach for the detection of &#x2018;core&#x2019; regions of 
                <italic>Bacillus subtilis</italic> and 
                <italic>Escherichia coli</italic>. Core regions are defined within the pan-genome context of conserved genomic loci for the two bacterial species, as a case study. An underlying assumption and implicit goal of the study is that the detected core regions largely correspond to &#x2018;essential&#x2019; genes, as those have been determined by independent experimental methodology, with implications for synthetic engineering of bacteria. For both purposes, namely the detection of core regions and the correspondence of those to essential genes, this report is an important contribution, especially as it resolves the connection of core to essential genes. It brings to the forefront the use of pangenome analysis for synthetic biology &#x2013; a factor that so far has been, to our amazement (!), ignored by biotechnologists. Solid work and a significant contribution to the field.</p>
            <p> </p>
            <p> 
                <bold>Major comments:</bold> 
                <list list-type="order">
                    <list-item>
                        <p>A general stylistic observation is that the manuscript is dense, in particular the Introduction and Methods are quite extensive and discursive, the Introduction containing multiple quotes from previous works. While this is not necessarily a bad thing, some details (&#x201c;in their table 4, etc.&#x201d; and other quoted phrases from cited papers) could be avoided or better summarized. This level of detail is welcome for experts, but non-experts are at risk to miss the main point and the motivations for this study. A more standard style, perhaps for the first paragraph might be useful, in order to address a wider audience.</p>
                    </list-item>
                    <list-item>
                        <p>&#x201c;While it is possible to define &#x201c;essential&#x201d; genes&#x201d;: the definition of &#x2018;essential&#x2019; genes is problematic as it refers to the growth medium and general environmental conditions, as the authors correctly point out. Therefore, &#x2018;essentiality&#x2019; is a functional definition. Core (conserved, species-defining) genes, on the other hand, do not rely on environmental factors but evolutionary history, therefore &#x2018;conservation&#x2019; is a structural definition. Coupling those is always tricky, however as the authors state early on in their paper, the equivalence between core and essential genes is indeed their primary hypothesis (&#x201c;We expect that all truly essential genes for the species/subspecies would be a subset of the core OGCs/regions&#x201d;). This should be more explicitly stated, perhaps in the first paragraph of the Introduction.</p>
                    </list-item>
                    <list-item>
                        <p>What advantage is provided by keeping the directionality of OGCs in the PGG? Is this purely a methodological checkpoint, i.e. improve the detection capability by reducing the number of false positive or negative hits, or is it further used in the analysis and interpretation of the results? Needs to be clarified, as it increases the complexity of the pan-genome turning a set into a graph. There is a passage &#x201c;PGG refinement to ensure consistent annotation&#x201d;, which alludes to the actual role of PGG.</p>
                    </list-item>
                    <list-item>
                        <p>Another general comment connected to the above, esp. major comment 2: the report serves a dual role as a software announcement (update) of JCVI&#x2019;s pan-genome pipeline software suite, with additional elements and certain conceptual advances, as well as the comparison of the core-vs-essential sets for two of the best studied/sampled species pangenomes. This should be a bit more clearly explained perhaps. The correspondence of core to essential genes is a welcome contribution but may not be the main topic of the manuscript, just a conclusion drawn from the analysis.</p>
                    </list-item>
                    <list-item>
                        <p>Following major comment 4: the method does well in identifying core regions and indeed makes a convincing case for an improvement over other methods. Yet, the comparison with essential genes is an addition, but not a comparison against other methods that define core regions. As the authors decided to take this direction, as they improve over their own previous methodology, this point should be qualified appropriately. In other words, the &#x2018;improvement&#x2019; can be shown as an incremental step over a previous protocol and explicitly shown that it is validated against &#x2018;essential&#x2019; gene sets. If this point is not emphasized, the analysis will be seen as lacking a comparison to another &#x2018;gold-standard&#x2019; method (experts know that there is no such thing, yet). Pages 11-12 have some elements of a comparison to another approach, this could be extended by a couple of concluding sentences. A good spot where some concluding remarks can be made might be a short paragraph before the Discussion.</p>
                    </list-item>
                </list> 
                <bold>Minor comments:</bold> 
                <list list-type="order">
                    <list-item>
                        <p>In Introduction: &#x201c;We further define a pan-genome graph (PGG) to be a graph&#x201d;, this should probably follow the paragraph starting &#x201c;Here we present a pan-genome based calculation...&#x201d; ?</p>
                    </list-item>
                    <list-item>
                        <p>&#x201c;For 
                            <italic>E. coli</italic> (and 
                            <italic>Shigella</italic>) we downloaded 1097 complete genomes&#x201d;, start a new paragraph? Using subtitles for Methods might also be a good idea, to break down the dense text into digestible sections.</p>
                    </list-item>
                    <list-item>
                        <p>Following minor comment 2: a mini table with three columns (filtering step, 
                            <italic>B. subtilis</italic>, 
                            <italic>E. coli</italic>) and as many rows as the filtering steps used with the number of genomes at each step might be helpful.</p>
                    </list-item>
                    <list-item>
                        <p>&#x201c;used by Goodall&#x201d; (reference 9? missing).</p>
                    </list-item>
                    <list-item>
                        <p>&#x201c;This is done by blasting&#x201d; - executing BLAST etc. / &#x201c;conflicting blast&#x201d; -&gt; conflicting BLAST...</p>
                    </list-item>
                    <list-item>
                        <p>&#x201c;to not under call core OGCs/edges&#x201d;, i.e. to reduce the number of potentially false negatives. Or, increase coverage.</p>
                    </list-item>
                    <list-item>
                        <p>for 
                            <italic>B. subtilis</italic>: &#x201c;3419 (73.5%) core and present in all 108 genomes&#x201d;: this row in Table 1 should be somehow highlighted, perhaps by color or other means -- it is an important part of the study and a key result.</p>
                    </list-item>
                    <list-item>
                        <p>a word for missing genes in the context of potential gene loss and the possibility of including them in future steps (see PMID: 12840037
                            <sup>
                                <xref ref-type="bibr" rid="rep-ref-84280-1">1</xref>
                            </sup>); this is something we (and possibly others) have been trying to implement for pangenome data, without much success. Something to discuss as a partial explanation for &#x2018;key&#x2019; (essential?) missing genes in certain lineages within the species pedigree, perhaps?</p>
                    </list-item>
                    <list-item>
                        <p>&#x201c;For the 34 protein coding genes&#x201d;... good yet incredibly dense paragraph, a (supplementary) table might help here.</p>
                    </list-item>
                    <list-item>
                        <p>Would the PGG implementation also help future studies in synteny analysis/conservation? Maybe a minor point that can be included in the discussion, with appropriate (1-2) references. A concluding short paragraph following the current one with the WTA region might be a good way to wrap up.</p>
                    </list-item>
                </list>
            </p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Yes</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Yes</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Computational Biology, Biological Computation, Systems Biomedicine, Bioinformatics, Protein Structure</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-84280-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>The balance of driving forces during genome evolution in prokaryotes.</article-title>
                        <source>
                            <italic>Genome Res</italic>
                        </source>.<year>2003</year>;<volume>13</volume>(<issue>7</issue>) :
                        <elocation-id>10.1101/gr.1092603</elocation-id>
                        <fpage>1589</fpage>-<lpage>94</lpage>
                        <pub-id pub-id-type="pmid">12840037</pub-id>
                        <pub-id pub-id-type="doi">10.1101/gr.1092603</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
        <sub-article article-type="response" id="comment7015-84280">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Sutton</surname>
                            <given-names>Granger</given-names>
                        </name>
                        <aff>J. Craig Venter Institute, USA</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>9</day>
                    <month>8</month>
                    <year>2021</year>
                </pub-date>
            </front-stub>
            <body>
                <p>We thank the reviewer for the thoughtful comments and have tried to respond to all of the suggestions including the new table and supplementary table in version two of our manuscript.</p>
            </body>
        </sub-article>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report83244">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.55083.r83244</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Ussery</surname>
                        <given-names>David</given-names>
                    </name>
                    <xref ref-type="aff" rid="r83244a2">2</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-3632-5512</uri>
                </contrib>
                <contrib contrib-type="author">
                    <name>
                        <surname>Abram</surname>
                        <given-names>Kaleb</given-names>
                    </name>
                    <xref ref-type="aff" rid="r83244a1">1</xref>
                    <role>Co-referee</role>
                </contrib>
                <aff id="r83244a1">
                    <label>1</label>Programming Associate, DBMI, University of Arkansas for Medical Sciences, Little Rock, Arkansas, USA</aff>
                <aff id="r83244a2">
                    <label>2</label>Department of Biomedical Informatics, University of Arkansas for Medical Sciences, Little Rock, AR, USA</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>28</day>
                <month>4</month>
                <year>2021</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2021 Abram K and Ussery D</copyright-statement>
                <copyright-year>2021</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport83244" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.51873.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>
                <bold>Introduction:&#x00a0;</bold>
            </p>
            <p> The authors provide an adequate background literature detailing attempts by others in the field to produce minimal genomes. A variety of approaches are covered and drawbacks to these attempts are mentioned. The authors also provide sufficient explanation of the need for their approach in addition to experimental approaches. A good overview of their pan-genome graph approach is presented, along with the reasoning for their design choices for the graph.</p>
            <p> </p>
            <p> Towards the end of the first paragraph in the Introduction - should be _G_ram-negative (named for the Danish microbiologist, Hans Christian Gram).</p>
            <p> </p>
            <p> Last sentence in last paragraph in Introduction - "Our method builds directly upon our previous pan-genome work and includes several improvements: 1) being able to _automatically_ use only complete high-quality genomes..." &#x00a0;Surely the previous methods could also have used only complete high-quality genomes as input? &#x00a0;My understanding is that the advantage of this new method is that it's now taking steps to ensure that 'bad genomes' are filtered out, and only the 'high-quality' ones are left....</p>
            <p> </p>
            <p> 
                <bold>Methods:&#x00a0;</bold>
            </p>
            <p> Figure 1, 2nd line: "Compute genome ANI using Mash". This doesn't make sense, as ANI and Mash are different approaches. Mash does not estimate ANI (as it is a distance). Unless the authors took the distance and subtracted it from 1 before multiplying by 100, they do not have an approximate ANI (see fastANI paper [PMID: 30504855]
                <sup>
                    <xref ref-type="bibr" rid="rep-ref-83244-1">1</xref>
                </sup> or the Mash paper [their ref. 34] where Mash and ANI methods are compared). Further, the authors state they use type strains and ANI (presumably using the Mash derived approximation which is not ANI) to remove very closely related strains but do not specify what criteria/value was used to determine very closely related strains. Since the authors chose to use a program (GGRaSP) that uses ANI matrixes as the input, it can be assumed that either ANI values were calculated by an unspecified method, or they used transformed Mash values to approximate ANI and need to specify this transformation earlier in the methods. Either way, this should be clearly stated in the methods section, and not leave the reviewer to guess how this might have been done.</p>
            <p> </p>
            <p> &#x201c;The 132 genomes were reduced to 109 after removing&#x2026;&#x201d; &#x2013; it is unclear what the condition for removal was. &#x00a0;It would be helpful if this was explicitly stated (presumably an approximate ANI value between 95.73% and 97.28%). Also, the authors state the minimum ANI between B. subtilis was 97.28% and the maximum ANI of any of the 11 other genomes to the 132 was 95.73%. The 11 genomes referenced here are unclear and the maximum ANI for the 132 is not provided. It is important to clearly bound their values, in order to enable comparison to other studies. For E. coli, the parameters used to remove redundancy need to be explicitly stated, as does how the groups are collapsed (e.g. if genome A and genome B have a 99% ANI value, which genome is removed and which genome is retained?). The authors should explicitly state why they added 2 redundant genomes to the E. coli dataset but did not do similar additions for B. subtilis. While the PGG approach seems fairly good, the heavy reliance on RefSeq annotations could be problematic for other species.</p>
            <p> </p>
            <p> 
                <bold>Results:&#x00a0;</bold>
            </p>
            <p> The results are shown in Table 1, and the bottom line is that for both B. subtilis and E. coli, the refined cores are a bit larger (and contain a larger fraction of 'essential genes' for the species). &#x00a0;The E. coli core is about a third larger, going from 2200 to 3100. &#x00a0;The latter number (3100) seems to be more consistent with what's expected for E. coli, based on many different experiments - historically, there have always been roughly 3000 E. coli genes. &#x00a0;So from this perspective, 2218 genes seems a bit too small (and also some of the 'essential genes' were missing from the core.)</p>
            <p> </p>
            <p> I'm curious as to whether a non-RefSeq gene annotation tool (for example, Prokka) could be utilized to improve the consistency of gene calls? The specific results with number breakdowns are very confusing to read on a first pass and require very careful reading to understand the somewhat odd notation being used. This should be cleaned up to enhance readability.&#x00a0;</p>
            <p> </p>
            <p> Figure 2 should have a color key mapping each color to its corresponding track to increase the readability of this figure. (The same applies to Figure 4, for consistency.)&#x00a0;</p>
            <p> </p>
            <p> 
                <bold>Discussion:&#x00a0;</bold>
            </p>
            <p> The discussion surrounding the issue of lab conditions and core regions is good. In addition, the discussion around noncore OGCs/regions also shows how the proposed pan-genome analysis could be used to identify noncore regions that could be removed that experimental results have been unable to identify. It might have been good to have a brief discussion of the phylogroup-specific cores in E. coli [see PMID: 33500552
                <sup>
                    <xref ref-type="bibr" rid="rep-ref-83244-2">2</xref>
                </sup> - disclaimer - this is a recent publication from our group.]</p>
            <p> </p>
            <p> The discussion section overall provides a good wrap up to the paper and summarizes how the PGG approach can be leveraged and the benefits from utilizing this approach.</p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Yes</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Yes</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Comparative genomics</p>
            <p>We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-83244-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>High throughput ANI analysis of 90K prokaryotic genomes reveals clear species boundaries</article-title>.
                        <source>
                            <italic>Nature Communications</italic>
                        </source>.<year>2018</year>;<volume>9</volume>(<issue>1</issue>) :
                        <elocation-id>10.1038/s41467-018-07641-9</elocation-id>
                        <pub-id pub-id-type="doi">10.1038/s41467-018-07641-9</pub-id>
                    </mixed-citation>
                </ref>
                <ref id="rep-ref-83244-2">
                    <label>2</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>Mash-based analyses of Escherichia coli genomes reveal 14 distinct phylogroups.</article-title>
                        <source>
                            <italic>Commun Biol</italic>
                        </source>.<year>2021</year>;<volume>4</volume>(<issue>1</issue>) :
                        <elocation-id>10.1038/s42003-020-01626-5</elocation-id>
                        <fpage>117</fpage>
                        <pub-id pub-id-type="pmid">33500552</pub-id>
                        <pub-id pub-id-type="doi">10.1038/s42003-020-01626-5</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
        <sub-article article-type="response" id="comment7014-83244">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Sutton</surname>
                            <given-names>Granger</given-names>
                        </name>
                        <aff>J. Craig Venter Institute, USA</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>9</day>
                    <month>8</month>
                    <year>2021</year>
                </pub-date>
            </front-stub>
            <body>
                <p>We thank the reviewers for their thoughtful comments and have attempted to address all of the suggestions in version 2 of our manuscript.</p>
            </body>
        </sub-article>
    </sub-article>
</article>
