<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.19427.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Research Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Understanding life sciences data curation practices via user research</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: 1 approved, 1 approved with reservations]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes" equal-contrib="yes">
                    <name>
                        <surname>Venkatesan</surname>
                        <given-names>Aravind</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Project Administration</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-4019-1940</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no" equal-contrib="yes">
                    <name>
                        <surname>Karamanis</surname>
                        <given-names>Nikiforos</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ide-Smith</surname>
                        <given-names>Michele</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Hickford</surname>
                        <given-names>Jonathan</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>McEntyre</surname>
                        <given-names>Johanna</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Funding Acquisition</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>EMBL European Bioinformatics Institute, Cambridge, CB10 1SD, UK</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:avenkat@ebi.ac.uk">avenkat@ebi.ac.uk</email>
                </corresp>
                <fn fn-type="equal" id="FN1">
                    <label>*</label>
                    <p>Equal contributions</p>
                </fn>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>11</day>
                <month>9</month>
                <year>2019</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2019</year>
            </pub-date>
            <volume>8</volume>
            <elocation-id>ELIXIR-1622</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>5</day>
                    <month>9</month>
                    <year>2019</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2019 Venkatesan A et al.</copyright-statement>
                <copyright-year>2019</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/8-1622/pdf"/>
            <abstract>
                <p>
                    <bold>Background:</bold> Manual curation is a cornerstone of public biological data resources. However, it is a time-consuming process that urgently needs supportive technical solutions in the face of rapid data growth. Supporting scalable curation is a part of the mission of the Elixir Data Platform. Thus far, we have established infrastructure capable of ingesting and aggregating text-mined outputs from multiple providers and making these available via an API. This public API is used by Europe PMC to display specific entities and relationships on full text articles (via the SciLite application).</p>
                <p>
                    <bold>Methods:</bold> To ensure that the future development of this infrastructure meets the needs of curators, we carried out a user research project to understand and identify common workflow patterns and practices via an observational study. Building on these outcomes, we then devised a curator community survey to more specifically understand which entity types, sections of a paper and tools are of top priority to address.</p>
                <p>
                    <bold>Results:</bold> The main challenges faced by curators included the following: a) There is a need for ways to prioritise and identify relevant papers for curation as the volume of literature is large; b) Finding specific information can prove difficult; quick ways of filtering articles based on specific entities, such as experimental methods, species and other important entities, such as genes, cell lines and tissue samples, are required; and c) Transferring information from the search/annotation tools to the various curation workflows was also challenging.</p>
                <p>
                    <bold>Conclusions:</bold> This study lays the foundation for identifying actionable items to orient the current infrastructure towards meeting the needs of the curation community, by improving text-mined annotation quality and coverage and other engineering solutions; and reusing text-mined annotations and other metadata in Europe PMC for article triage. Furthermore, this study presents an opportunity to explore customisation of triage/ranking systems to suit different curation contexts.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>Database curation</kwd>
                <kwd>User research</kwd>
                <kwd>Observational study</kwd>
                <kwd>Curator survey</kwd>
                <kwd>Annotation Infrastructure</kwd>
                <kwd>Europe PMC</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1" xlink:href="http://dx.doi.org/10.13039/501100000780">
                    <funding-source>European Commission</funding-source>
                    <award-id>676559</award-id>
                </award-group>
                <funding-statement>This research was part of ELIXIR-EXCELERATE project, funded by the European Commission under Grant Agreement number 676559.</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec sec-type="intro">
            <title>Introduction</title>
            <p>Biological databases play a key role in knowledge discovery in life science research. A major contributor towards the maintenance of these databases is the process of manual curation. Curation is a high value task as experts carefully examine the relevant scientific literature and extract the essential information, such as biological functions and relationships between biological entities, generating the corresponding database records in a structured way. The advances in high-throughput technologies have resulted in tremendous growth of biological data, consequently increasing the number of research papers being published. As a result, the demand for high-quality curation that makes use of these resources has never been higher, but this demand can present challenges for curators in finding and assimilating scientific conclusions described in the literature.</p>
            <p>Text mining, machine learning and analytics promise to provide better ranking of reading lists, classification of articles, and identification of assertions with their biological context and evidence buried within the text of articles. To this end, many life science knowledgebases now include text mining (to varying degrees) in curation workflows. For example, databases, such as neXtProt
                <sup>
                    <xref ref-type="bibr" rid="ref-1">1</xref>,
                    <xref ref-type="bibr" rid="ref-2">2</xref>
                </sup> and FlyBase
                <sup>
                    <xref ref-type="bibr" rid="ref-3">3</xref>
                </sup> have integrated text mining algorithms into their respective curation workflows to retrieve a ranked list of relevant articles and tag entities of interest. Furthermore, tools like PubTator
                <sup>
                    <xref ref-type="bibr" rid="ref-4">4</xref>
                </sup> and TextPresso
                <sup>
                    <xref ref-type="bibr" rid="ref-5">5</xref>
                </sup> are other examples of text mining tools that have been adopted by some curation communities. On the other hand, databases that mainly rely on manual curation, such as IntAct
                <sup>
                    <xref ref-type="bibr" rid="ref-6">6</xref>
                </sup> and DisProt
                <sup>
                    <xref ref-type="bibr" rid="ref-7">7</xref>
                </sup>, are exploring possibilities to leverage text mining approaches to select articles for further curation.</p>
            <p>Broadly speaking, the curation community recognises the potential of text mining in article triage and the identification of entities/concepts for curation. Nevertheless, text mining pipelines adopted thus far have been engineered to cater to specific domains or projects and wide uptake is lacking; curators often continue to use manual curation methods. This mainly stems from the wide variety of very precise information required by curators. The challenge is therefore to produce robust systems that both address the immediate and specific needs of curators as well as scale across multiple curation groups. In order to do this, we need to know the immediate challenges faced by curators with respect to selection and prioritisation of articles to curate. A clear understanding of the requirements will help build new systems and/or re-orient existing systems that cater to the needs of the curation community.</p>
            <p>In this report we describe the outcomes of a user research project, conducted to understand curation practices and priorities for article selection. The project comprised two parts: a) an observational study, to understand how curators proceed with selecting articles to curate, to identify commonalities in curator requirements; and b) a community survey, to specifically identify the immediate priorities of curators, such as entity types and sections of interest in an article, to name a few. The aim of this study is to identify specific actions for the Elixir Data Platform in the future, optimising and extending existing systems and infrastructural components. In the subsequent sections we present the main findings from our investigation.</p>
        </sec>
        <sec sec-type="methods">
            <title>Methods</title>
            <sec>
                <title>Observational study</title>
                <p>We initially drafted an interview guide (list of questions available as 
                    <italic toggle="yes">Extended data</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref-8">8</xref>
                    </sup>) and a preliminary curator persona, reflecting our initial hypotheses about curators and their work practices. Following this we selected five different curation teams. The selection criteria were based on the type of curation the group was involved with, i.e., extracting biological information from scientific literature and integrating it into a biological database, in contrast to groups that process raw data submissions (such as sequencing data). Out of these, two teams were based at EMBL-EBI; the other three teams were based in Norway, Switzerland and Italy. For teams that were situated at EMBL-EBI, the interviews were held in person and for the other teams the interviews were conducted over conference calls. The sessions were conducted over a period of two months: between March and April 2018. We observed three project leaders and four curators from the selected teams. The participants belonged to teams that focused on curating very specific experimental evidence primarily on proteins such as protein-protein interactions, the role of a protein in a complex, protein disruption, human protein functions and transcription factor regulation. One team was focussed on the annotation of human genes relevant to a particular disease and another one on curating publications reporting associations of genetic variants with diseases.</p>
                <p>For each session we followed an iterative user research process
                    <sup>
                        <xref ref-type="bibr" rid="ref-9">9</xref>,
                        <xref ref-type="bibr" rid="ref-10">10</xref>
                    </sup> (as outlined in 
                    <xref ref-type="table" rid="T1">Table 1</xref>):</p>
                <table-wrap id="T1" orientation="portrait" position="anchor">
                    <label>Table 1. </label>
                    <caption>
                        <title>Overview of user research activities.</title>
                        <p>The table provides an overview of the observational study on curation practices.</p>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Activity</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Purpose</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Participants</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Materials</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Outputs</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">Analysis</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Stakeholder
                                    <break/>interviews</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Learn about curation
                                    <break/>practices</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">3 project leads and 4
                                    <break/>curators from 5 different
                                    <break/>curation teams</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Interview guide</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Interview notes</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Observations
                                    <break/>- Patterns -
                                    <break/>Implications</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Follow-up
                                    <break/>interviews</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Clarify certain
                                    <break/>aspects of curation</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">1 project lead and 1
                                    <break/>curator from the same
                                    <break/>team</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Clarification questions</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Interview notes</td>
                                <td colspan="1" rowspan="1"/>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Stakeholder
                                    <break/>workshop</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Validate learnings
                                    <break/>from interviews</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">3 project leads and 4
                                    <break/>curators from 4 different
                                    <break/>curation teams</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Guide and notes</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">HCW themes</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Transcribed
                                    <break/>HCW themes
                                    <break/>and feedback</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">Draft curator persona</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Curator persona
                                    <break/>with feedback</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Revised curator
                                    <break/>persona</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">Draft curation process and
                                    <break/>example screenshots</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Curation process
                                    <break/>with feedback</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Revised curation
                                    <break/>process</td>
                            </tr>
                            <tr>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td colspan="1" rowspan="1"/>
                                <td align="left" colspan="1" rowspan="1" valign="top">Curation
                                    <break/>experience map</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Report on
                                    <break/>curation practice</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Consolidate and
                                    <break/>share validated
                                    <break/>learnings from our
                                    <break/>research</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Previous participants and
                                    <break/>another researcher</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Draft report
                                    <break/>Revised persona &amp;
                                    <break/>curation experience map</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">Revised materials
                                    <break/>incorporated into
                                    <break/>this report</td>
                                <td colspan="1" rowspan="1"/>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <list list-type="bullet">
                    <list-item>
                        <p>The participants were asked to proceed with their daily curation work
                            <sup>
                                <xref ref-type="bibr" rid="ref-11">11</xref>
                            </sup> and were observed on how:</p>
                        <list list-type="bullet">
                            <list-item>
                                <label>&#x25cb;</label>
                                <p>they select the entities (which come either from a spreadsheet or from a partially curated data record) they wish to curate,</p>
                            </list-item>
                            <list-item>
                                <label>&#x25cb;</label>
                                <p>they perform searches (including the query parameters) to retrieve the initial set of publications,</p>
                            </list-item>
                            <list-item>
                                <label>&#x25cb;</label>
                                <p>the criteria they used to either discard or select an article,</p>
                            </list-item>
                            <list-item>
                                <label>&#x25cb;</label>
                                <p>the information from the selected article is transferred to the respective curation platform.</p>
                            </list-item>
                        </list>
                    </list-item>
                </list>
                <list list-type="bullet">
                    <list-item>
                        <p>Using the &#x201c;What? So What? Now What?&#x201d; method
                            <sup>
                                <xref ref-type="other" rid="fn1">1</xref>
                            </sup>, we transcribed our notes from these sessions and identified the most important observations, patterns and their implications.</p>
                    </list-item>
                </list>
                <p>Additionally, we carried out two follow-up interviews with one project lead and one curator from the same team at EMBL-EBI to clarify particular curation tasks.</p>
                <p>Furthermore, we conducted a stakeholder workshop
                    <sup>
                        <xref ref-type="bibr" rid="ref-12">12</xref>
                    </sup> with three project leads and four curators from four different curation teams to validate our main learnings from the interview sessions. As some curators took part both in the interviews and the workshop, overall we have engaged with 12 curators and team leads from seven different teams. The participants were presented with the preliminary curator persona
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>
                    </sup> and a workflow outlining the curation process. The participants were invited to give us their feedback on these drafts and express their challenges or pain points as How Can We (HCW) questions
                    <sup>
                        <xref ref-type="other" rid="fn2">2</xref>
                    </sup>. Their feedback was used to revise the curator persona and the curation process workflow and was consolidated into the curation experience map.</p>
            </sec>
            <sec>
                <title>Community survey</title>
                <p>Based on the interview guide used for the observational study, we formulated questions to understand the immediate challenges. The survey consisted of 15 questions (see 
                    <italic toggle="yes">Extended data</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref-8">8</xref>
                    </sup>) ranging from, for instance, the section of the article the curators were most interested in; the types of biological entities curators look for; and whether it helps to know that a given article has been curated/accounted for in another database. The community survey was conducted online and was developed using 
                    <ext-link ext-link-type="uri" xlink:href="https://www.typeform.com/">Typeform</ext-link>. The survey was promoted via the mailing lists of various consortia, such as ELIXIR, International Society for Biocuration (ISB) and Alliance of Genome Resources. These widely known consortia provide a forum for developers, researchers and curators to streamline and standardise the maintenance of biological resources. The survey was conducted between December 2018 and January 2019.</p>
            </sec>
            <sec>
                <title>Ethical issues</title>
                <p>We confirm that we have obtained consent to use data from the participants as per the 
                    <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/data-protection/privacy-notice/europe-pmc-user-research-training-and-outreach">Europe PMC privacy notice</ext-link>. The privacy notice is formulated in accordance with 
                    <ext-link ext-link-type="uri" xlink:href="http://www.embl-hamburg.de/aboutus/administration/legal-services/data-protection/">EMBL&#x2019;s data protection framework</ext-link>. Consent was part of the survey form, and participants could take the survey upon accepting the terms and conditions for data re-use.</p>
            </sec>
        </sec>
        <sec sec-type="results | discussion">
            <title>Results and discussion</title>
            <sec>
                <title>Observational study</title>
                <p>
                    <bold>
                        <italic toggle="yes">Curator persona</italic>
                    </bold>. We created a persona called Ashley (see 
                    <italic toggle="yes">Underlying data</italic>
                    <sup>
                        <xref ref-type="bibr" rid="ref-8">8</xref>
                    </sup>) to present the curators&#x2019; needs, sentiments, tasks and pain points from their own perspective in more detail to help us empathise with them
                    <sup>
                        <xref ref-type="bibr" rid="ref-13">13</xref>
                    </sup>:</p>
                <p>Ashley curates with 
                    <bold>precision</bold> and 
                    <bold>attention to detail</bold>, while trying to be as efficient as possible. Ashley is looking for 
                    <bold>very specific information</bold> about an experiment that the authors of a paper do not always report in a lot of detail. Ashley appreciates being able to ask a team mate when the &#x201c;detective work&#x201d; bears no fruit.</p>
                <p>Apart from these &#x201c;organic, informal discussions&#x201d;, Ashley works independently during &#x201c;triage&#x201d;, &#x201c;annotation&#x201d; and when filling in the curation record in the Editor. During the latter stage Ashley tries to &#x201c;translate from the author&#x2019;s language to the curator&#x2019;s language&#x201d; using the appropriate 
                    <bold>identifiers</bold> and 
                    <bold>Control Vocabulary (CV) terms</bold> for species, proteins, methods and other important entities so that the curated evidence is referred to 
                    <bold>precisely</bold> and 
                    <bold>consistently</bold> and the annotations in the curation record are self explanatory outside the context of the paper.</p>
                <p>This is cumbersome as a particular type of evidence is not always referred to in the same way and in enough detail in the literature. Moreover, the Editor is not integrated with the search and annotation tools, so Ashley spends a lot of time going back and forth between the paper and the Editor, translating the curatable text from the paper into CV terms, switching browser tabs and consulting notes from the online research and team discussions.</p>
                <p>
                    <bold>
                        <italic toggle="yes">Curation experience map</italic>
                    </bold>. The curation experience map in 
                    <xref ref-type="fig" rid="f1">Figure 1</xref> presents the identified pain points in the context of the main curation activities. As shown in the map, curation consists of four stages:</p>
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>Figure 1. </label>
                    <caption>
                        <title>The curation experience map presents the pain points in the context of the main curation activities.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/21298/009aa614-a176-4994-97ee-078572925bc2_figure1.gif"/>
                </fig>
                <list list-type="bullet">
                    <list-item>
                        <label>a)</label>
                        <p>Deciding which entity (primarily protein in this case) to curate. What to curate often depends on the curator&#x2019;s background and the project that they are working on.</p>
                    </list-item>
                    <list-item>
                        <label>b)</label>
                        <p>&#x201c;Triaging&#x201d; the literature to identify relevant publications.</p>
                    </list-item>
                    <list-item>
                        <label>c)</label>
                        <p>&#x201c;Annotating&#x201d; a relevant publication to identify the precise curatable information in detail, including determining the 
                            <bold>species</bold> and the relevant 
                            <bold>experimental method</bold>.</p>
                    </list-item>
                    <list-item>
                        <label>d)</label>
                        <p>Filling in the curation record based on the curatable information in the publication (which is often done in parallel with annotating the paper).</p>
                    </list-item>
                </list>
                <p>In a typical curation scenario, curators:</p>
                <list list-type="bullet">
                    <list-item>
                        <p>Search PubMed for a protein and scan the titles in the search results to identify relevant 
                            <bold>experimental</bold> papers during &#x201c;triage&#x201d;.</p>
                    </list-item>
                    <list-item>
                        <p>If a 
                            <bold>title</bold> indicates that the paper is relevant (e.g. by mentioning the protein and/or 
                            <bold>species</bold> of interest), then they skim read the 
                            <bold>Methods</bold> and the 
                            <bold>Results</bold> of the paper. They are particularly interested in 
                            <bold>Figures</bold>, 
                            <bold>Tables</bold> and their 
                            <bold>Legends</bold>, which is where they usually find the key (curatable) information.</p>
                    </list-item>
                    <list-item>
                        <p>These sections are read more thoroughly during the &#x201c;annotation&#x201d; stage to identify the 
                            <bold>exact experimental context</bold> that needs to be curated. They may &#x201c;glance through&#x201d; the Abstract or skip it altogether.</p>
                    </list-item>
                </list>
                <p>A pain point during &#x201c;triage&#x201d; is 
                    <bold>identifying relevant publications</bold>: The curators reported that most publications returned in a PubMed search are usually not relevant (i.e. they are false positives). Additionally, because they are often looking for 
                    <bold>very specific</bold> and at times 
                    <bold>underreported</bold> experimental evidence, some searches return very 
                    <bold>few papers</bold> or 
                    <bold>no papers</bold> at all.</p>
                <p>Ambiguities in the paper about 
                    <bold>species</bold>, 
                    <bold>proteins</bold>, and relevant 
                    <bold>experimental methods</bold> may slow down curation significantly during &#x201c;triage&#x201d; and &#x201c;annotation&#x201d;. The curators highlighted identifying the 
                    <bold>species</bold> as their main pain point, as this task may take up to &#x201c;75% of the curation effort&#x201d;, and in the end the species may turn out to be irretrievable from the paper.</p>
                <p>Furthermore, to get clarity on the details of an experiment, curators would look up specific references in the paper and do further research online. If this &#x201c;detective work&#x201d; is not successful, the curators will either not annotate the paper or will provide fewer annotations. The curators would also discuss unresolved questions such as the annotation of unusual data or what to do when there is no curatable data with their teammates during &#x201c;organic, informal discussions&#x201d;. As a last resort they would contact the authors directly; however, authors often do not respond to requests for clarification.</p>
                <p>If there are no matching CV terms to annotate the paper, new ones are requested. This can delay annotating the paper because the ontology staff are often different to those curating the papers and sometimes requesting new terms results in prolonged discussions between curators and ontologists.</p>
                <p>It was observed that curators use different tools at each stage, which are not integrated with each other. During &#x201c;triage&#x201d; they would search PubMed for relevant publications and then look at a particular paper on the publisher&#x2019;s site. Annotating a publication may involve downloading or printing the pdf version of the paper and highlighting curatable text. To fill in the curation record they use a bespoke tool which they call &#x201c;the Editor&#x201d;, which presents a template that needs to be filled in with molecule names and experimental context, supported by standardised identifiers, controlled vocabularies and ontologies as well as free text describing the experimental evidence. Most of the time, the assertions that go in the database are not in the paper in the same words.</p>
                <p>
                    <bold>
                        <italic toggle="yes">Summary of the pain points in curation workflows</italic>
                    </bold>. The identified pain points were formulated as How Can We (HCW) questions as follows:</p>
                <list list-type="bullet">
                    <list-item>
                        <p>HCW identify 
                            <bold>relevant publications</bold> for curation in search results or a list of references during &#x201c;triage&#x201d;?</p>
                    </list-item>
                    <list-item>
                        <p>HCW identify 
                            <bold>species</bold>, 
                            <bold>experimental methods, molecules</bold> (primarily 
                            <bold>proteins</bold>) and other important entities (such as 
                            <bold>cells</bold> and 
                            <bold>tissues</bold>) in a publication during &#x201c;triage&#x201d; and &#x201c;annotation&#x201d;?</p>
                    </list-item>
                    <list-item>
                        <p>HCW help curators fill in the curation record more efficiently?</p>
                    </list-item>
                </list>
            </sec>
            <sec>
                <title>Community survey</title>
                <p>The survey had 42 respondents in total, covering a number of European countries, such as the United Kingdom, France, Italy and Switzerland. The majority of the participants identified themselves as &#x2018;Scientific curator&#x2019; with over 5 years&#x2019; experience in curation. Broadly speaking, respondents mainly curate peer-reviewed articles (43.6%), followed by review (25.5%) and preprint (16%) articles. 
                    <xref ref-type="fig" rid="f2">Figure 2</xref> shows their preference in the type of articles for curation.</p>
                <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                    <label>Figure 2. </label>
                    <caption>
                        <title>The pie chart shows the article types that are of interest to curators.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/21298/009aa614-a176-4994-97ee-078572925bc2_figure2.gif"/>
                </fig>
                <p>
                    <xref ref-type="fig" rid="f3">Figure 3</xref> shows the section of articles of interest to the curators: the majority look for method sections in articles. The other sections of importance are the figures/tables and their legends. Apart from these, the supplementary data seems to be a section of importance. Furthermore, as shown in 
                    <xref ref-type="fig" rid="f4">Figure 4</xref>, the types of entities curators look for in articles were diverse with preference given mainly to: genes/protein curation and their functions, database accession numbers, experimental methods and gene mutations.</p>
                <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                    <label>Figure 3. </label>
                    <caption>
                        <title>The figure shows the article sections that are of interest to curators.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/21298/009aa614-a176-4994-97ee-078572925bc2_figure3.gif"/>
                </fig>
                <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                    <label>Figure 4. </label>
                    <caption>
                        <title>The graph provides an overview of the entity types of interest.</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/21298/009aa614-a176-4994-97ee-078572925bc2_figure4.gif"/>
                </fig>
                <p>Respondents were asked if it was useful to know a given article was already curated by another database: the results indicate that it was useful (see 
                    <xref ref-type="fig" rid="f5">Figure 5</xref>). A follow-up question asked why (if useful) it was useful to know that an article had already been annotated by another database. The majority of the responses cited avoiding duplication if the curators belong to the same consortium, a means of validation (in the case of ontology terms), and consistency in annotations.</p>
                <fig fig-type="figure" id="f5" orientation="portrait" position="float">
                    <label>Figure 5. </label>
                    <caption>
                        <title>The figure shows the response for the question: How useful is it to know if the article has been curated by another database?</title>
                    </caption>
                    <graphic orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/21298/009aa614-a176-4994-97ee-078572925bc2_figure5.gif"/>
                </fig>
            </sec>
            <sec>
                <title>Outcomes of the user research project</title>
                <p>Text mining approaches are sophisticated and play a vital role in addressing big data questions, the results of which can contribute to supplying &#x201c;leads&#x201d; on key papers for curation. However, curators require a wide variety of very precise information. Addressing each of those specific requirements will be a complex task, but text mining systems can certainly provide underlying services based on broad commonalities in the requirements that can prove useful to curators. To this end, this effort has proved to be useful in terms of understanding the main challenges faced by curators. While the sample size of the community survey was small, when analysed in conjunction with the observational study, we found significant commonalities in the work practices. For instance, in the survey when the respondents were asked for their biggest challenge while curating, the majority of responses indicated finding relevant papers, and identifying specific information that includes genes and species.</p>
                <p>Our research on curation practices so far indicates a need to better support curators in the following areas:</p>
                <list list-type="bullet">
                    <list-item>
                        <p>Identify 
                            <bold>relevant papers</bold> for curation during &#x201c;triage&#x201d;. An efficient way towards article selection, where search results could be prioritised based on a set of parameters.</p>
                    </list-item>
                    <list-item>
                        <p>Identify 
                            <bold>species</bold>, relevant 
                            <bold>experimental methods, molecules</bold> (primarily 
                            <bold>proteins</bold>) and other important entities (such as 
                            <bold>cells</bold> and 
                            <bold>tissues</bold>) in a publication during &#x201c;triage&#x201d; and &#x201c;annotation&#x201d;.</p>
                    </list-item>
                    <list-item>
                        <p>Retrieve certain sections of articles such as Methods, Figures or Results.</p>
                    </list-item>
                    <list-item>
                        <p>Integrate triage systems into the various curation workflows.</p>
                    </list-item>
                </list>
            </sec>
        </sec>
        <sec sec-type="conclusions">
            <title>Conclusion</title>
            <p>Contributions made by manual curation are vital to the maintenance of biological databases. To maximise the impact of this critically important process, the latest technological advancements need to be leveraged. Under the Elixir Data platform, we have established infrastructural elements to support scalable curation, which include automated systems to ingest and aggregate annotations from various sources, APIs to redistribute the annotations and an application called SciLite to display annotations on articles. However, a key challenge for scalable curation is to make use of such core components across different curation teams, whose requirements and workflows can be highly precise and vary widely. Consequently, this requires engagement with the curation community to derive actionable insights that may contribute towards service delivery. Therefore, this project lays the foundation needed to understand the commonalities shared among various curation workflows. Going forward, we will use the results of the project to feed into improvements to text-mined annotation quality and coverage, triage and browsing systems or other engineering solutions.</p>
        </sec>
        <sec>
            <title>Data availability</title>
            <p>The interview responses from the observational study have not been made public to protect the participants&#x2019; privacy. Please contact the corresponding author to apply for access to the data, providing details of the information required and the intended use of the data. Access to the data will be granted once permission from participants to share the data has been obtained.</p>
            <sec>
                <title>Underlying data</title>
                <p>Zenodo: Results of user research project to understand data curation practices. 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3209658">https://doi.org/10.5281/zenodo.3209658</ext-link>
                    <sup>
                        <xref ref-type="bibr" rid="ref-8">8</xref>
                    </sup>.</p>
                <p>This project contains the following underlying data:</p>
                <list list-type="bullet">
                    <list-item>
                        <p>Curator persona.docx (the curator persona generated during the first part of the study).</p>
                    </list-item>
                    <list-item>
                        <p>Curator survey results.xlsx (raw data taken from the survey given to each participant).</p>
                    </list-item>
                </list>
            </sec>
            <sec>
                <title>Extended data</title>
                <p>Zenodo: Results of user research project to understand data curation practices. 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3209658">https://doi.org/10.5281/zenodo.3209658</ext-link>
                    <sup>
                        <xref ref-type="bibr" rid="ref-8">8</xref>
                    </sup>.</p>
                <p>This project contains the following extended data:</p>
                <list list-type="bullet">
                    <list-item>
                        <p>Observation study - interview guide.docx (interview guide outlines the type of questions to be asked).</p>
                    </list-item>
                    <list-item>
                        <p>Curator survey questions.docx (questionnaire given to each participant in the community survey).</p>
                    </list-item>
                </list>
                <p>Data are available under the terms of the 
                    <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International license</ext-link> (CC-BY 4.0).</p>
            </sec>
        </sec>
    </body>
    <back>
        <ack>
            <title>Acknowledgements</title>
            <p>We are grateful to our participants, to Francisco Talo for his involvement and to Ane M&#x00f8;ller Gabrielsen for her comments.</p>
        </ack>
        <fn-group>
            <fn id="fn1">
                <p>
                    <sup>1</sup>
                    <ext-link ext-link-type="uri" xlink:href="http://www.myddelton.co.uk/blog/what-so-what-now-what">http://www.myddelton.co.uk/blog/what-so-what-now-what</ext-link>
                </p>
            </fn>
            <fn id="fn2">
                <p>
                    <sup>2</sup>
                    <ext-link ext-link-type="uri" xlink:href="http://www.designkit.org/methods/3">http://www.designkit.org/methods/3</ext-link>
                </p>
            </fn>
        </fn-group>
        <ref-list>
            <ref id="ref-1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Lane</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Argoud-Puy</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Britan</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>neXtProt: a knowledge platform for human proteins.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2012</year>;<volume>40</volume>(<issue>Database issue</issue>):<fpage>D76</fpage>&#x2013;<lpage>D83</lpage>.
                    <pub-id pub-id-type="pmid">22139911</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkr1179</pub-id>
                    <pub-id pub-id-type="pmcid">3245017</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Mottin</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gobeill</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pasche</surname>
                            <given-names>E</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>neXtA5: accelerating annotation of articles via automated approaches in neXtProt.</article-title>
                    <source>

                        <italic toggle="yes">Database (Oxford).</italic>
</source>
                    <year>2016</year>;<volume>2016</volume>: pii: baw098.
                    <pub-id pub-id-type="pmid">27374119</pub-id>
                    <pub-id pub-id-type="doi">10.1093/database/baw098</pub-id>
                    <pub-id pub-id-type="pmcid">4930835</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Karamanis</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Seal</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lewin</surname>
                            <given-names>I</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Natural language processing in aid of FlyBase curators.</article-title>
                    <source>

                        <italic toggle="yes">BMC Bioinformatics.</italic>
</source>
                    <year>2008</year>;<volume>9</volume>:<fpage>193</fpage>.
                    <pub-id pub-id-type="pmid">18410678</pub-id>
                    <pub-id pub-id-type="doi">10.1186/1471-2105-9-193</pub-id>
                    <pub-id pub-id-type="pmcid">2375127</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wei</surname>
                            <given-names>CH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kao</surname>
                            <given-names>HY</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lu</surname>
                            <given-names>Z</given-names>
                        </name>
</person-group>:
                    <article-title>PubTator: a web-based text mining tool for assisting biocuration.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2013</year>;<volume>41</volume>(<issue>Web Server issue</issue>):<fpage>W518</fpage>&#x2013;<lpage>22</lpage>.
                    <pub-id pub-id-type="pmid">23703206</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkt441</pub-id>
                    <pub-id pub-id-type="pmcid">3692066</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>M&#x00fc;ller</surname>
                            <given-names>HM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kenny</surname>
                            <given-names>EE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sternberg</surname>
                            <given-names>PW</given-names>
                        </name>
</person-group>:
                    <article-title>Textpresso: An Ontology-Based Information Retrieval and Extraction System for Biological Literature.</article-title>
                    <source>

                        <italic toggle="yes">PLoS Biol.</italic>
</source>
                    <year>2004</year>;<volume>2</volume>(<issue>11</issue>):<fpage>e309</fpage>.
                    <pub-id pub-id-type="pmid">15383839</pub-id>
                    <pub-id pub-id-type="doi">10.1371/journal.pbio.0020309</pub-id>
                    <pub-id pub-id-type="pmcid">517822</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Orchard</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ammari</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Aranda</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The MIntAct project&#x2014;IntAct as a common curation platform for 11 molecular interaction databases.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2014</year>;<volume>42</volume>(<issue>Database issue</issue>):<fpage>D358</fpage>&#x2013;<lpage>63</lpage>.
                    <pub-id pub-id-type="pmid">24234451</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkt1115</pub-id>
                    <pub-id pub-id-type="pmcid">3965093</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Piovesan</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tabaro</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Mi&#x010d;eti&#x0107;</surname>
                            <given-names>I</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>DisProt 7.0: a major update of the database of disordered proteins.</article-title>
                    <source>

                        <italic toggle="yes">Nucleic Acids Res.</italic>
</source>
                    <year>2017</year>;<volume>45</volume>(<issue>D1</issue>):<fpage>D219</fpage>&#x2013;<lpage>D227</lpage>.
                    <pub-id pub-id-type="pmid">27899601</pub-id>
                    <pub-id pub-id-type="doi">10.1093/nar/gkw1056</pub-id>
                    <pub-id pub-id-type="pmcid">5210544</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-8">
                <label>8</label>
                <mixed-citation publication-type="data">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Venkatesan</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Karamanis</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ide-Smith</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Results of user research project to understand data curation practices</article-title>. [Data set]. Zenodo. <year>2019</year>.
                    <ext-link ext-link-type="uri" xlink:href="http://www.doi.org/10.5281/zenodo.3209659">http://www.doi.org/10.5281/zenodo.3209659</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gothelf</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Seiden</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Lean UX: Designing Great Products with Agile Teams</article-title>. O&#x2019;Reilly Media, Inc. <year>2016</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://books.google.co.in/books?id=QnQNDQAAQBAJ&amp;printsec=frontcover&amp;source=gbs_ge_summary_r&amp;cad=0#v=onepage&amp;q&amp;f=false">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Karamanis</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pignatelli</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Carvalho-Silva</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Designing an intuitive web application for drug discovery scientists.</article-title>
                    <source>

                        <italic toggle="yes">Drug Discov Today.</italic>
</source>
                    <year>2018</year>;<volume>23</volume>(<issue>6</issue>):<fpage>1169</fpage>&#x2013;<lpage>1174</lpage>.
                    <pub-id pub-id-type="pmid">29337199</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.drudis.2018.01.032</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref-11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Beyer</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Holtzblatt</surname>
                            <given-names>K</given-names>
                        </name>
</person-group>:
                    <article-title>Contextual Design: Defining Customer-Centered Systems</article-title>. Morgan Kaufmann.<year>1997</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/citation.cfm?id=2821566">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gray</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Brown</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Macanufo</surname>
                            <given-names>J</given-names>
                        </name>
</person-group>:
                    <article-title>Gamestorming: A Playbook for Innovators, Rulebreakers and Changemakers</article-title>. O&#x2019;Reilly Media, Inc.<year>2010</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://books.google.co.in/books?id=_-xnEDNPxwYC&amp;printsec=frontcover&amp;source=gbs_ge_summary_r&amp;cad=0#v=onepage&amp;q&amp;f=false">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref-13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Cooper</surname>
                            <given-names>A</given-names>
                        </name>
</person-group>:
                    <article-title>The Inmates Are Running the Asylum: Why High Tech Products Drive Us Crazy and How to Restore the Sanity</article-title>. Sams-Pearson Education.<year>2004</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/citation.cfm?id=984201">Reference Source</ext-link>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report53748">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.21298.r53748</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Arighi</surname>
                        <given-names>Cecilia N.</given-names>
                    </name>
                    <xref ref-type="aff" rid="r53748a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-0803-4817</uri>
                </contrib>
                <aff id="r53748a1">
                    <label>1</label>Center for Bioinformatics and Computational Biology, University of Delaware, Newark, DE, USA</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>14</day>
                <month>10</month>
                <year>2019</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2019 Arighi CN</copyright-statement>
                <copyright-year>2019</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport53748" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.19427.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>This work presents a study about the biocuration community and its literature-based curation practices. The work intends to identify pain points and commonalities in the curation workflow where ePMC infrastructure could work to assist this community. 
                <list list-type="bullet">
                    <list-item>
                        <p>The study includes the detailed observation of 5 curation groups following their regular literature curation work and a survey targeted to the biocuration community to learn about the literature-based curation tasks. The approach is appropriate and well designed. However, it seems that the curation groups observed are biased toward protein/gene-centric curation, whereas there are other workflows, such as those for model organisms, chemicals, that may have a completely different approach to the literature curation. In fact, some model organism databases (like MGI) first do triage to select articles about the organism, then classify articles based on specific curation topics (phenotype, GO, etc). Then the conclusion from their workflow analysis could be different. Has anything come up from the survey indicating that other groups were represented in this work?</p>
                        <p> For example, in the survey there is a question about database resource &#x201c;Which data resource(s) do you curate?&#x201d; But the result of this, which is important to learn about the databases represented, is not shown in the survey result document. I understand that the information in the survey result may be hidden to protect the privacy of participants, but showing the distribution of databases and/or type of databases represented (model organism vs. specific domain, like structure, PPI, etc) can shed light on bias or non-bias toward one type of curation. &#x00a0;</p>
                    </list-item>
                    <list-item>
                        <p>Another question is about the groups observed. In the introduction, a couple of groups are mentioned that have integrated text mining pipelines in their curation work. It would be important for the study to indicate if any of the curation groups that were observed or curators surveyed used text mining tools for their work, or to include in the observation study some group that does use text mining, to see in what capacity it is used and whether the bottlenecks in literature curation are the same.</p>
                    </list-item>
                    <list-item>
                        <p>The manuscript would be enriched by describing previous work done in this area (biocuration workflows and bottlenecks in literature curation) and comparing the conclusions in this manuscript with those of others. There are a few papers from BioCreative that looked into this matter.
                            <sup>
                                <xref ref-type="bibr" rid="rep-ref-53748-1">1</xref>
                            </sup>
                            <sup>,</sup>
                            <sup>&#x00a0;</sup>
                            <sup>
                                <xref ref-type="bibr" rid="rep-ref-53748-2">2</xref>
                            </sup>
                            <sup>,</sup>&#x00a0;
                            <sup>
                                <xref ref-type="bibr" rid="rep-ref-53748-3">3</xref>
                            </sup>
                        </p>
                    </list-item>
                    <list-item>
                        <p>Finally, please consider using the term expert curation instead of manual curation. I think using the concept of manual curation in databases is not appropriate for modern times: as curators use tools to help them do all or part of their work, it is not completely manual.</p>
                    </list-item>
                    <list-item>
                        <p>Minor: Figures 2-5 should indicate the number of participants who responded to the questions over the total number of survey participants. The X-axis in Figures 3-4 needs a label, as does the Y-axis in Figure 5.</p>
                    </list-item>
                </list>
            </p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Yes</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>My area of expertise is in biocuration, usability and text mining applied to biocuration.</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-53748-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>Text mining for the biocuration workflow.</article-title>
                        <source>
                            <italic>Database (Oxford)</italic>
                        </source>.<year>2012</year>;<volume>2012</volume>:
                        <elocation-id>10.1093/database/bas020</elocation-id>
                        <fpage>bas020</fpage>
                        <pub-id pub-id-type="pmid">22513129</pub-id>
                        <pub-id pub-id-type="doi">10.1093/database/bas020</pub-id>
                    </mixed-citation>
                </ref>
                <ref id="rep-ref-53748-2">
                    <label>2</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>Biocuration workflows and text mining: overview of the BioCreative 2012 Workshop Track II.</article-title>
                        <source>
                            <italic>Database (Oxford)</italic>
                        </source>.<year>2012</year>;<volume>2012</volume>:
                        <elocation-id>10.1093/database/bas043</elocation-id>
                        <fpage>bas043</fpage>
                        <pub-id pub-id-type="pmid">23160416</pub-id>
                        <pub-id pub-id-type="doi">10.1093/database/bas043</pub-id>
                    </mixed-citation>
                </ref>
                <ref id="rep-ref-53748-3">
                    <label>3</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>Pressing needs of biomedical text mining in biocuration and beyond: opportunities and challenges.</article-title>
                        <source>
                            <italic>Database (Oxford)</italic>
                        </source>.<year>2016</year>;<volume>2016</volume>:
                        <elocation-id>10.1093/database/baw161</elocation-id>
                        <pub-id pub-id-type="pmid">28025348</pub-id>
                        <pub-id pub-id-type="doi">10.1093/database/baw161</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report53750">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.21298.r53750</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Hirschman</surname>
                        <given-names>Lynette</given-names>
                    </name>
                    <xref ref-type="aff" rid="r53750a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-8442-0720</uri>
                </contrib>
                <aff id="r53750a1">
                    <label>1</label>The MITRE Corporation, Bedford, MA, USA</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>7</day>
                <month>10</month>
                <year>2019</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2019 Hirschman L</copyright-statement>
                <copyright-year>2019</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport53750" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.19427.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>This is a well-designed and informative examination of data curation practices in support of the Elixir Data Platform, with a focus on exploring curator needs and pain points to identify where automated tools might help. The article presents results from a series of studies based on interviews with curators and responses from a questionnaire filled out by over 40 curators from 11 countries.&#x00a0;</p>
            <p> These results are consistent with an earlier article from 2012,&#x00a0;Text Mining for the Biocuration Workflow
                <sup>
                    <xref ref-type="bibr" rid="rep-ref-53750-1">1</xref>
                </sup>, which summarized the findings of a workshop held at the third International Biocuration Conference. Below is a short&#x00a0;excerpt from the 2012 paper:</p>
            <p> </p>
            <p> ***</p>
            <p> 
                <italic>Curators wanted tools that were easy to use, easy to install and easy to maintain by the intended end user (ideally, a developer associated with the curation team, who will not necessarily be an expert in text mining or natural language processing). The tools do not have to be perfect, but they need to complement (not replace) the biocurator's function. A number of curation groups indicated that they would use the tools to do an initial batch processing, followed by biocurator validation, where the biocurator makes a yes/no decision and avoids having to type or look names up in a large database. Another important use was linking mentions of biological entities in text with the correct identifiers in biological databases, as well as linkage to the appropriate ontology terms. A number of curators felt that they would like text mining tools to aid in identifying and prioritizing papers for curation, to avoid wasting time on papers that did not have &#x2018;relevant&#x2019; (e.g. curatable or novel) results. They also wanted tools to identify the sections of full-text papers containing curatable information.</italic>
            </p>
            <p> ***</p>
            <p> </p>
            <p> It would be informative to do a comparison of the findings from the 2012 paper with this paper, to see whether curator needs have changed &#x2013; and to what extent automated tools have been able to address some of the curator needs.&#x00a0;</p>
            <p> One of the paper&#x2019;s most interesting findings from the curator feedback is the need to identify&#x00a0;
                <underline>species</underline>. This was flagged as a particular pain point, taking &#x2018;up to 75% of the curation effort&#x2019;! This has also been a consistent stumbling block for text mining systems, because identification of species is essential to link a mention of a gene or protein to the correct accession number in databases such as UniProt or EntrezGene. This turns out to be a hard problem for text mining systems (as well as for curators) for several reasons: 1) mentions of species are often given as background information and may well not be mentioned in the same sentence (or even section) as mentions of the gene or protein being studied; 2) authors may want to generalize findings to other species (especially to humans) even when the experiments have been done on other species; 3) information about the specific experimental constructs may be buried in the methods section &#x2013; and may involve inserting a gene from one organism into the genome of a different organism.&#x00a0;&#x00a0;</p>
            <p> One interesting omission is a discussion of the need for interactive curation tools. This has been a major theme of recent BioCreative evaluations (see, e.g.,&#x00a0;Overview of the interactive task in BioCreative V
                <sup>
                    <xref ref-type="bibr" rid="rep-ref-53750-2">2</xref>
                </sup>). If an interactive system could show the curator a prioritized list of candidates, this could speed up several curation activities. Specifically, an interactive system could, e.g., show a ranked list of candidate papers for curation, with evidence highlighted; or show a paper (section) with a gene or protein mention highlighted together with a selectable list of candidate species, so the curator could quickly select the correct species in order to link to the correct accession number; or show highlighted evidence sentences supporting protein-protein interaction, for quick validation or rejection by the curator. Given a goal of providing tools to speed the curation workflow, interactive systems offer a promising approach by putting the human in the loop to augment text mining, where automated tools do not, on their own, provide sufficient accuracy.</p>
            <p> Finally, in the Conclusion section, the authors discuss the need to tailor capabilities for different curation tasks or workflows. Identifying commonalities is indeed key, as the authors note; but there is also an urgent need to develop methods to quickly tailor tools to new tasks or specific requirements in a curation workflow. This is an underexplored area, but may be key to more widespread adoption of text mining tools.&#x00a0;&#x00a0;</p>
            <p> Specific comments: 
                <list list-type="bullet">
                    <list-item>
                        <p>Add references to some of the older background work in this area.&#x00a0;&#x00a0;</p>
                    </list-item>
                </list> 
                <list list-type="bullet">
                    <list-item>
                        <p>The Conclusions section of the Abstract mentions &#x201c;actionable items&#x201d; but doesn&#x2019;t provide specifics. The list on p. 9, col 1 top is informative, and could be included in the abstract, e.g., prioritizing papers; filtering articles based on specific entity types; and retrieving specific sections of articles.&#x00a0;</p>
                    </list-item>
                    <list-item>
                        <p>A table summarizing the different interactions with curation teams and curators would be helpful, along with the specific types of curation being done by the teams. This information is presented in the text at different points, but it is hard to keep track without a summary, since there were several rounds and types of interactions.&#x00a0;</p>
                    </list-item>
                </list> 
                <list list-type="bullet">
                    <list-item>
                        <p>In Figure 3, it would be useful to know the denominator, as well as the actual number of curators identifying specific sections.&#x00a0; &#x00a0;</p>
                    </list-item>
                </list> 
                <list list-type="bullet">
                    <list-item>
                        <p>Figure 4 is very useful, but hard to read.&#x00a0;In addition, it would be interesting to know what specific controlled vocabularies are in use for each of these types of information.&#x00a0; &#x00a0;</p>
                    </list-item>
                </list> 
                <list list-type="bullet">
                    <list-item>
                        <p>Figure 5 &#x2013; again, it would be useful to know the denominator (the total number of respondents for that question).&#x00a0;&#x00a0;</p>
                    </list-item>
                </list>
            </p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Yes</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Evaluation of text mining for biomedical applications, particularly curation.</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-53750-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>Text mining for the biocuration workflow.</article-title>
                        <source>
                            <italic>Database (Oxford)</italic>
                        </source>.<year>2012</year>;<volume>2012</volume>:
                        <elocation-id>10.1093/database/bas020</elocation-id>
                        <fpage>bas020</fpage>
                        <pub-id pub-id-type="pmid">22513129</pub-id>
                        <pub-id pub-id-type="doi">10.1093/database/bas020</pub-id>
                    </mixed-citation>
                </ref>
                <ref id="rep-ref-53750-2">
                    <label>2</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author"/>:
                        <article-title>Overview of the interactive task in BioCreative V.</article-title>
                        <source>
                            <italic>Database (Oxford)</italic>
                        </source>.<year>2016</year>;<volume>2016</volume>:
                        <elocation-id>10.1093/database/baw119</elocation-id>
                        <pub-id pub-id-type="pmid">27589961</pub-id>
                        <pub-id pub-id-type="doi">10.1093/database/baw119</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
    </sub-article>
</article>
