<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="other" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.179775.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Study Protocol</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Artificial intelligence tools for automating assessments of reporting guideline adherence: a protocol for a systematic review</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: 2 approved with reservations]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Zeng</surname>
                        <given-names>Minyan</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-7294-2599</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Liu</surname>
                        <given-names>Shiwei</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0009-0006-9382-1538</uri>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Clark</surname>
                        <given-names>David PQ</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>McDonald</surname>
                        <given-names>Steve</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-2832-5205</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Mayo-Wilson</surname>
                        <given-names>Evan</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ying</surname>
                        <given-names>Xiangji</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Menke</surname>
                        <given-names>Joe</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Lan</surname>
                        <given-names>Mengfei</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Jiang</surname>
                        <given-names>Lan</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ninan</surname>
                        <given-names>Kiran</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Oberste</surname>
                        <given-names>Jean-Pierre</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0009-0003-2075-5267</uri>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>McKenzie</surname>
                        <given-names>Joanne E</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-3534-1641</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Kilicoglu</surname>
                        <given-names>Halil</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Page</surname>
                        <given-names>Matthew J</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-4242-7526</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Methods in Evidence Synthesis Unit, Monash University School of Public Health and Preventive Medicine, Melbourne, Victoria, Australia</aff>
                <aff id="a2">
                    <label>2</label>School of Information Sciences, University of Illinois Urbana-Champaign, Champaign, USA</aff>
                <aff id="a3">
                    <label>3</label>Department of Epidemiology, University of North Carolina Gillings School of Global Public Health, Chapel Hill, USA</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:minyan.zeng@monash.edu">minyan.zeng@monash.edu</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>28</day>
                <month>4</month>
                <year>2026</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2026</year>
            </pub-date>
            <volume>15</volume>
            <elocation-id>626</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>10</day>
                    <month>4</month>
                    <year>2026</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2026 Zeng M et al.</copyright-statement>
                <copyright-year>2026</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/15-626/pdf"/>
            <abstract>
                <sec>
                    <title>Background</title>
                    <p>Complete reporting of health-related research is necessary for users to understand, appraise, and apply research results appropriately. Reporting guidelines have been developed to support complete reporting. However, assessments of reporting guideline adherence remain inconsistent, time-consuming, and difficult to scale. Artificial intelligence (AI) tools, such as traditional natural language processing models and large language models, might provide a potential solution. While numerous AI tools have been developed, no comprehensive synthesis has been undertaken to investigate what they assess, how they are implemented and perform, and their potential utility.</p>
                </sec>
                <sec>
                    <title>Objective</title>
                    <p>This systematic review aims to synthesise the characteristics and findings of studies evaluating AI tools developed to assist or automate assessments of reporting guideline adherence.</p>
                </sec>
                <sec>
                    <title>Methods</title>
                    <p>We will search MEDLINE, Embase, Scopus, Europe PMC, ACM Digital Library, IEEE Xplore, arXiv and Cochrane Colloquium Abstracts, with no restrictions on date, language, or publication type. We will include studies that evaluate AI tools to assess adherence of health-related papers to any reporting guidelines. Two authors will independently screen records, extract data and assess risk of bias. We will extract study characteristics, AI tool details, how reporting guidelines are operationalised for AI assessment, AI implementation details, comparison details, and evaluation outcomes including agreement metrics, classification performance metrics, and utility indicators. We will present and summarise results through structured tables and plots, stratified by reporting guideline and AI tool type.</p>
                </sec>
                <sec>
                    <title>Discussion</title>
                    <p>This systematic review will provide a comprehensive synthesis of AI tools developed to automate assessments of reporting guideline adherence. It will provide interest holders with insights into what AI tools have been used, their implementation approaches, which AI tool types perform well, and any improvements that can be made to AI tools automating assessments of reporting guideline adherence in the future.</p>
                </sec>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>Reporting guidelines</kwd>
                <kwd>Artificial intelligence</kwd>
                <kwd>Adherence</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1">
                    <funding-source>Monash University Early Career Research Excellence Program (ECREP) grant</funding-source>
                </award-group>
                <award-group id="fund-2">
                    <funding-source>National Health and Medical Research Council Investigator Grant</funding-source>
                    <award-id>GNT2009612</award-id>
                </award-group>
                <award-group id="fund-3">
                    <funding-source>National Health and Medical Research Council Investigator Grant</funding-source>
                    <award-id>GNT2033917</award-id>
                </award-group>
                <funding-statement>This research was supported by a Monash University Early Career Research Excellence Program (ECREP) grant. MJP is supported by a National Health and Medical Research Council Investigator Grant (GNT2033917). JEM is supported by a National Health and Medical Research Council Investigator Grant (GNT2009612). The funders had no role in the study design, decision to publish, or preparation of the manuscript.</funding-statement>
                <funding-statement>
                    <italic>The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</italic>
                </funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec id="sec5" sec-type="intro">
            <title>Introduction</title>
            <p>Complete reporting of health-related research is necessary for users to understand, appraise, and apply research results appropriately. Reporting guidelines provide recommendations on what should be reported and why it should be reported, and include exemplars of complete reporting to guide authors and other interest holders (e.g., peer reviewers, editors).
                <sup>
                    <xref ref-type="bibr" rid="ref1">1</xref>
                </sup> Reporting guidelines have been developed for different types of research, such as PRISMA (preferred reporting items for systematic reviews and meta-analyses) for systematic reviews,
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> CONSORT (consolidated standards of reporting trials) for randomised trials,
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> TRIPOD (transparent reporting of a multivariable prediction model for individual prognosis or diagnosis) for prediction models,
                <sup>
                    <xref ref-type="bibr" rid="ref4">4</xref>
                </sup> STROBE (strengthening the reporting of observational studies in epidemiology) for observational studies
                <sup>
                    <xref ref-type="bibr" rid="ref5">5</xref>
                </sup> and STARD (standards for reporting of diagnostic accuracy studies) for diagnostic studies.
                <sup>
                    <xref ref-type="bibr" rid="ref6">6</xref>
                </sup> Many of these &#x201c;core&#x201d; reporting guidelines have multiple extensions that provide additional reporting recommendations for specific aspects not covered in the core statement (e.g., types of outcomes, specific designs, analytic methods).</p>
            <p>Routine assessments of reporting guideline adherence have been performed manually by authors, editors, and reviewers to judge whether reporting recommendations have been met. Because reporting guidelines do not specify criteria for evaluating adherence, researchers have had to develop their own assessment criteria and methods.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>,
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> Researchers must also decide whether to assess all checklist items/recommendations or only a subset, and meta-research studies suggest that most have chosen to focus on selected items.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>,
                    <xref ref-type="bibr" rid="ref9">9</xref>,
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> These decisions have led to considerable variability in what is assessed and how it is assessed.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>,
                    <xref ref-type="bibr" rid="ref9">9</xref>,
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> Also, manual evaluation is time-consuming and resource-intensive.
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup> Additionally, research questions such as what characteristics (e.g., time, discipline, journal) predict better or worse reporting are difficult to address at scale with a large body of literature using a manual evaluation approach. Therefore, more efficient, consistent, and scalable methods are needed.</p>
            <p>Artificial intelligence (AI), defined as computational systems capable of performing tasks that typically require human intelligence, such as learning, reasoning, and decision-making, might provide a potential solution. Early attempts to automate assessments of reporting guideline adherence relied on traditional natural language processing (NLP) models. Examples include CONSORT-NLP,
                <sup>
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup> which combines rule-based and machine learning-based approaches to automatically complete the CONSORT checklist from randomised clinical trial reports, and the SPIRIT-CONSORT-TM,
                <sup>
                    <xref ref-type="bibr" rid="ref13">13</xref>
                </sup> an annotated corpus designed to train NLP models to automatically assess adherence to reporting recommendations in clinical trial protocols and result publications. However, these traditional NLP systems generally require substantial guideline-specific annotated datasets for development, and are applicable only to the particular guideline for which they were designed. Moreover, most systems focus on detecting local text segments, which could limit their utility for end-to-end evaluation in long research publications with multimodal data components (e.g., text, tables, and figures).</p>
            <p>The advent of large language models (LLMs) and vision language models (VLMs), such as GPT and Gemini, provides another opportunity to scale up assessments of reporting guideline adherence. Trained on extensive data from articles, books and other online sources,
                <sup>
                    <xref ref-type="bibr" rid="ref14">14</xref>
                </sup> these models are capable of processing complex data components, extracting information, summarising evidence, and generating outputs that are relevant to reporting guideline items. Several studies have used these models to assess reporting guideline adherence.
                <sup>
                    <xref ref-type="bibr" rid="ref15">15</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref17">17</xref>
                </sup>
            </p>
            <p>However, the outputs of LLMs and VLMs are sensitive to how they are implemented. Data preprocessing, prompts, and model inference settings might all influence model performance on specific tasks. For example, empirical work has shown that different prompt templates and formatting can substantially influence LLM outputs, though advanced models (e.g., GPT-4 compared to GPT-3.5-turbo) may demonstrate more robustness to such variations.
                <sup>
                    <xref ref-type="bibr" rid="ref18">18</xref>
                </sup> More importantly, because of the variability in assessment criteria and methods for evaluating adherence, researchers might use different prompts to ask subtly different questions for reporting guideline items (e.g., whether a guideline item is reported or whether it is reported adequately or fully). Additionally, even with identical prompts, fixed model parameters and a fixed random seed, models may occasionally generate different outputs across runs due to hardware-level randomness. This leads to difficulties in achieving strict reproducibility. The &#x201c;black-box&#x201d; nature of LLMs and VLMs also limits transparency in the process of decision-making, and model hallucinations, although an area of active improvement, may also challenge reliability in high-stakes fields such as health-related research.</p>
            <p>While numerous AI systems and prototypes have been developed to automate assessment of reporting guideline adherence,
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref13">13</xref>,
                    <xref ref-type="bibr" rid="ref15">15</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref17">17</xref>
                </sup> no comprehensive synthesis has been undertaken to investigate what they assess, how they are implemented and perform, and their potential utility in research and publication workflows.</p>
            <sec id="sec6">
                <title>Objective</title>
                <p>This systematic review aims to summarise and synthesise the characteristics and findings of studies evaluating AI tools developed to assist or automate assessments of reporting guideline adherence.</p>
            </sec>
        </sec>
        <sec id="sec7" sec-type="methods">
            <title>Methods</title>
            <p>We have reported this protocol in accordance with the Preferred Reporting Items for Systematic Review and Meta-Analysis Protocols (PRISMA-P) statement
                <sup>
                    <xref ref-type="bibr" rid="ref19">19</xref>
                </sup> and with consideration of the methods items in the more recent PRISMA 2020 statement.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> We have not registered the review.</p>
            <sec id="sec8">
                <title>Eligibility criteria</title>
                <p>

                    <list list-type="bullet">
                        <list-item>
                            <label>&#x2022;</label>
                            <p>

                                <italic toggle="yes">Study designs</italic>
                            </p>
                            <p>We will include studies of any design that evaluate the performance of AI tools developed to assess adherence of health-related research papers to reporting guidelines. Eligible study designs include diagnostic accuracy studies, validation studies, and trials comparing AI tool and human performance, as well as methodological studies comparing different AI approaches. Studies will be included regardless of language, publication date, or publication type (e.g., journal article, conference proceeding).</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>

                                <italic toggle="yes">Reporting guidelines</italic>
                            </p>
                            <p>We will include studies regardless of the reporting guideline evaluated, such as PRISMA, CONSORT, TRIPOD, STROBE, and STARD, and any of their extensions. By &#x201c;reporting guideline&#x201d;, we mean any document presenting reporting items that should appear in a research paper (regardless of whether presented as a checklist or structured text) and in which the authors explain how the items were developed.
                                <sup>
                                    <xref ref-type="bibr" rid="ref20">20</xref>
                                </sup>
                            </p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>

                                <italic toggle="yes">AI tools and comparator</italic>
                            </p>
                            <p>We will include any AI application, tool, or algorithm that (i) makes judgements about reporting guideline adherence, or (ii) identifies relevant text about reporting guideline adherence in a paper without making a judgement about adherence. Eligible systems could include any models that learn patterns from text with/without imaging data in the research papers, such as traditional natural language processing models (e.g., rule-based and BERT-like models) as well as LLMs and VLMs (e.g., GPT-5.2 and Gemini 3). We will include studies that compare AI tools with human assessment and studies that compare multiple AI tools with each other. Studies without an explicit comparator will also be eligible.</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>

                                <italic toggle="yes">Outcomes</italic>
                            </p>
                            <p>We will include studies regardless of the outcomes assessed or reported. Outcomes of interest to this review include: (i) agreement (overall and for each item/recommendation) between the AI tool and human assessors using raw and chance-corrected agreement metrics (e.g., Cohen&#x2019;s kappa); (ii) classification performance (overall and/or for each item/recommendation) as determined using metrics such as accuracy, F1 score, sensitivity, specificity, positive and negative predictive values, and c-statistic; and (iii) utility indicators (e.g., task completion time, computational/API cost, and token usage across papers).</p>
                        </list-item>
                    </list>
                </p>
            </sec>
            <sec id="sec9">
                <title>Search methods</title>
                <p>We will search bibliographic databases and supplementary sources for eligible studies. Databases include MEDLINE (via Ovid), Embase (via Ovid), Scopus, Europe PMC, ACM Digital Library, and IEEE Xplore. We will not limit searches by date, language, publication status or publication format (except for Europe PMC, which will be restricted to preprints). Europe PMC will be used to search across several preprint servers (e.g., medRxiv, bioRxiv, 
                    <ext-link ext-link-type="uri" xlink:href="http://preprints.org">preprints.org</ext-link>, SSRN) and we will also search the arXiv preprint server, as it is not comprehensively covered by Europe PMC. Additional sources include the abstracts of the Cochrane Colloquium. The final part of the search will involve manual backward citation tracking and forward citation tracking using 
                    <ext-link ext-link-type="uri" xlink:href="http://LENS.org">LENS.org</ext-link> for all studies included in the review.</p>
                <p>An experienced information specialist (SM) designed the search strategies with input from the review team. The search includes terms related to the concepts of AI, adherence, and reporting. Several seed articles (based on articles known to the review team)
                    <sup>
                        <xref ref-type="bibr" rid="ref11">11</xref>,
                        <xref ref-type="bibr" rid="ref13">13</xref>,
                        <xref ref-type="bibr" rid="ref15">15</xref>&#x2013;
                        <xref ref-type="bibr" rid="ref17">17</xref>,
                        <xref ref-type="bibr" rid="ref21">21</xref>&#x2013;
                        <xref ref-type="bibr" rid="ref24">24</xref>
                    </sup> were used to develop the MEDLINE search. The MEDLINE search was then translated and adapted for use in the other sources. The search strategy was iteratively tested to achieve an optimal balance between recall and precision. Full search strategies are available as Extended data (see Data availability section).
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup>
                </p>
            </sec>
            <sec id="sec10">
                <title>Study selection</title>
                <p>All records will first be deduplicated using the built-in functions of the reference management tools we will use (i.e., EndNote and Covidence). Two reviewers (out of MZ, SL, DPQC, JM, ML, LJ, KN, JO) will then independently screen all titles and abstracts, and records that are considered eligible or uncertain by either reviewer will undergo full-text screening, where those reviewers will independently assess the full text of potentially eligible records. Any disagreements will be resolved by discussion or consulting with a third reviewer. Title and abstract screening of bibliographic database records will be conducted using Covidence. For arXiv and Cochrane Colloquium Abstracts, a screening form will be created in Microsoft Excel with the link for each record and the search date.</p>
            </sec>
            <sec id="sec11">
                <title>Data extraction</title>
                <p>Two reviewers (out of MZ, SL, DPQC) will independently conduct the data extraction using a data extraction form (available as Extended data; see Data availability section).
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup> The data extraction form will be piloted by reviewers on a sample of included studies prior to the full data extraction process. Any discrepancies in the data collected between the two reviewers will be resolved via discussion or by consulting with a third reviewer (MJP or JEM). Data extraction will be conducted using a data extraction tool (REDCap version 15.5.30).
                    <sup>
                        <xref ref-type="bibr" rid="ref26">26</xref>
                    </sup> Where necessary and available, additional sources will be consulted to supplement information extracted from the included studies, such as published study protocols, registry entries, or primary dataset documentation. If information remains missing or unclear, we will contact the study authors for further information. The information that will be extracted from each included study is provided in Table 1 (available as Extended data; see Data availability section).
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup>
                </p>
            </sec>
            <sec id="sec12">
                <title>Quality assessment of included studies</title>
                <p>To evaluate the quality of the included studies, two reviewers (out of MZ, SL, DPQC) will independently apply a defined set of quality indicators. These indicators are informed by established tools PROBAST+AI
                    <sup>
                        <xref ref-type="bibr" rid="ref27">27</xref>
                    </sup> and the tool used in a living systematic review of AI tools for risk of bias assessment,
                    <sup>
                        <xref ref-type="bibr" rid="ref28">28</xref>
                    </sup> which offer relevant concepts for assessing AI tools. The quality indicators will cover the following domains:
                    <list list-type="bullet">
                        <list-item>
                            <label>&#x2022;</label>
                            <p>AI tool development</p>
                            <p>Whether the AI tool was developed rigorously (e.g., adequate model training and prompt engineering).</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Reference standard</p>
                            <p>Whether the reference standard assessment was conducted rigorously (e.g., performed by trained assessors, assessed by at least two assessors independently with consensus procedures in place).</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Independence of assessments and risk of data leakage</p>
                            <p>Whether the AI tool was applied to the studies without knowledge of the reference standard assessment and vice versa; whether the AI tool&#x2019;s final performance was evaluated on an independent test set that was not used for model training or prompt development/refinement; whether there was a low risk that the annotation of the test corpus was part of the AI model&#x2019;s training data.</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Study planning</p>
                            <p>Whether the study was based on a publicly available protocol or registration record.</p>
                        </list-item>
                    </list>
                </p>
                <p>Each indicator will be judged as low quality, high quality, or unclear quality. The quality assessment form is available as Extended data (see Data availability section).
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup> A study will be deemed high quality overall if all quality indicators were deemed high quality, low quality overall if at least one indicator was deemed low quality, and unclear quality overall if at least one indicator was deemed unclear quality, but none were deemed low quality. Disagreements between reviewers will be resolved through discussion or adjudication by a third reviewer.</p>
            </sec>
            <sec id="sec13">
                <title>Data syntheses and analyses</title>
                <p>Given the anticipated diversity in AI tools, reporting guidelines, study designs, and outcome measures, formal meta-analysis is unlikely to be feasible across all outcomes. We will therefore present and summarise results of each of the included studies through structured tables and plots.</p>
                <p>We will use structured tables to present study characteristics, reporting guidelines assessed and scope, dataset characteristics, dataset sources and formats, reference annotation for datasets, AI tool details, application of the AI tool, AI implementation details, and comparison details. Tables will be organised by reporting guideline evaluated, and then by the type of AI tool (traditional NLP models versus LLM-based/VLM-based models).</p>
                <p>We will then present AI tool performance and utility findings in tables organised by the reporting guideline evaluated, stratified by the type of AI tool and each outcome category (i.e., classification performance metrics, agreement metrics and utility indicators). Where multiple metrics are reported within the same outcome category, we will extract pre-specified metrics as detailed in the Data extraction form (available as Extended data; see Data availability section).
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup> Where preferred metrics are unavailable, we will consider and note the alternative metrics reported by the study authors. We will summarise outcomes at the overall level using descriptive statistics (e.g., mean, median, range across items) and also present the overall results in forest plots, stratified by reporting guideline and AI models. When item-level/recommendation-level outcomes are also available (e.g., classification performance metrics of adherence for each PRISMA item), we will summarise specific item-level results to facilitate performance interpretation using pre-specified rules, including the items with high and low performance (e.g., top and bottom five items for agreement metrics, accuracy and F1 score). When there are multiple results available for the same outcome across training, validation and test datasets, we will extract and summarise results identified by study authors as primary and/or the results from the most representative evaluation setting. In this circumstance, we will note that multiple results are available and state our reason for selecting the reported result.</p>
                <p>We will finally present and summarise the overall quality of studies by the reporting guideline evaluated, stratified by the type of AI tool.</p>
            </sec>
            <sec id="sec14">
                <title>Dissemination plan</title>
                <p>We plan to disseminate the findings of this systematic review through publication in a peer-reviewed scientific journal. The final manuscript will include all methods, results, and interpretations arising from the review to support transparency and reproducibility. In addition to journal publication, we will present the key findings at relevant academic conferences and seminars to reach researchers and developers working in AI and reporting guidelines. We will also make our data extraction forms, summary tables, and analytical code publicly accessible to facilitate future research in this area.</p>
            </sec>
            <sec id="sec15">
                <title>Study status</title>
                <p>This study is currently at the study selection stage.</p>
            </sec>
        </sec>
        <sec id="sec16" sec-type="discussion">
            <title>Discussion</title>
            <p>Complete reporting of health-related research is important for the usability and trustworthiness of research evidence. Reporting guidelines have been widely used to support complete reporting. However, assessments of reporting guideline adherence remain inconsistent, time-consuming, and difficult to scale. AI tools have the potential to address these limitations. As the AI field continues to evolve rapidly, a rigorous evidence synthesis is timely. This systematic review will be the first to comprehensively summarise and synthesise what AI tools have been developed to automate assessments of reporting guideline adherence. It will provide interest holders with insights into what AI tools have been used, their implementation approaches, which AI tool types perform well, and any improvements that can be made to AI tools automating assessments of reporting guideline adherence in the future.</p>
        </sec>
    </body>
    <back>
        <sec id="sec19" sec-type="data-availability">
            <title>Data availability</title>
            <sec id="sec20">
                <title>Underlying data</title>
                <p>No data are associated with this article.</p>
            </sec>
            <sec id="sec21">
                <title>Extended data</title>
                <p>Open Science Framework: Artificial intelligence tools for automating assessments of reporting guideline adherence: a protocol for a systematic review. DOI: 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.17605/OSF.IO/AYSTK">https://doi.org/10.17605/OSF.IO/AYSTK</ext-link>.
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup>
                </p>
                <p>This project contains the following extended data:
                    <list list-type="bullet">
                        <list-item>
                            <label>&#x2022;</label>
                            <p>Table 1.docx</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>APPENDIX Section 1 Search strategy.docx</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>APPENDIX Section 2 Data extraction form.docx</p>
                        </list-item>
                        <list-item>
                            <label>&#x2022;</label>
                            <p>APPENDIX Section 3 Quality assessment form.docx</p>
                        </list-item>
                    </list>
                </p>
            </sec>
            <sec id="sec22">
                <title>Reporting guidelines</title>
                <p>Open Science Framework: PRISMA-P checklist for Artificial intelligence tools for automating assessments of reporting guideline adherence: a protocol for a systematic review. DOI: 
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.17605/OSF.IO/AYSTK">https://doi.org/10.17605/OSF.IO/AYSTK</ext-link>.
                    <sup>
                        <xref ref-type="bibr" rid="ref25">25</xref>
                    </sup>
                </p>
                <p>Data are available under the terms of the 
                    <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International license (CC-BY 4.0)</ext-link>.</p>
            </sec>
        </sec>
        <ref-list>
            <title>References</title>
            <ref id="ref1">
                <label>1</label>
                <mixed-citation publication-type="other">
                    <collab>EQUATOR Network - What is a reporting guideline</collab>: 
(accessed on 11 Feb 2026).
                    <ext-link ext-link-type="uri" xlink:href="https://www.equator-network.org/about-us/what-is-a-reporting-guideline/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Page</surname>
                            <given-names>MJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>McKenzie</surname>
                            <given-names>JE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bossuyt</surname>
                            <given-names>PM</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The PRISMA 2020 statement: an updated guideline for reporting systematic reviews.</article-title>
                    <source>

                        <italic toggle="yes">BMJ.</italic>
</source>
                    <year>2021</year>;<volume>372</volume>:<fpage>n71</fpage>.</mixed-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hopewell</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Chan</surname>
                            <given-names>A-W</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Collins</surname>
                            <given-names>GS</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>CONSORT 2025 statement: updated guideline for reporting randomised trials.</article-title>
                    <source>

                        <italic toggle="yes">BMJ.</italic>
</source>
                    <year>2025</year>;<volume>389</volume>:<fpage>e081123</fpage>.
                    <pub-id pub-id-type="pmid">40228833</pub-id>
                    <pub-id pub-id-type="doi">10.1136/bmj-2024-081123</pub-id>
                    <pub-id pub-id-type="pmcid">PMC11995449</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Collins</surname>
                            <given-names>GS</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Reitsma</surname>
                            <given-names>JB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Altman</surname>
                            <given-names>DG</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Transparent reporting of a multivariable prediction model for individual prognosis or diagnosis (TRIPOD): the TRIPOD statement.</article-title>
                    <source>

                        <italic toggle="yes">Br J Surg.</italic>
</source>
                    <year>2015</year>;<volume>102</volume>(<issue>3</issue>):<fpage>148</fpage>&#x2013;<lpage>158</lpage>.</mixed-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Von Elm</surname>
                            <given-names>E</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Altman</surname>
                            <given-names>DG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Egger</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statement: guidelines for reporting observational studies.</article-title>
                    <source>

                        <italic toggle="yes">The Lancet.</italic>
</source>
                    <year>2007</year>;<volume>370</volume>(<issue>9596</issue>):<fpage>1453</fpage>&#x2013;<lpage>1457</lpage>.
                    <pub-id pub-id-type="doi">10.1016/S0140-6736(07)61602-X</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Bossuyt</surname>
                            <given-names>PM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Reitsma</surname>
                            <given-names>JB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bruns</surname>
                            <given-names>DE</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>STARD 2015: an updated list of essential items for reporting diagnostic accuracy studies.</article-title>
                    <source>

                        <italic toggle="yes">Radiology.</italic>
</source>
                    <year>2015</year>;<volume>277</volume>(<issue>3</issue>):<fpage>826</fpage>&#x2013;<lpage>832</lpage>.
                    <pub-id pub-id-type="doi">10.1148/radiol.2015151516</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hamilton</surname>
                            <given-names>DG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>McKenzie</surname>
                            <given-names>JE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nejstgaard</surname>
                            <given-names>CH</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Evaluation of tools used to assess adherence to PRISMA 2020 reveals inconsistent methods and poor tool implementability: part I of a systematic review.</article-title>
                    <source>

                        <italic toggle="yes">J Clin Epidemiol.</italic>
</source>
                    <year>2026</year>;<fpage>112133</fpage>.</mixed-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Dal Santo</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rice</surname>
                            <given-names>DB</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Amiri</surname>
                            <given-names>LSN</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Methods and results of studies on reporting guideline adherence are poorly reported: a meta-research study.</article-title>
                    <source>

                        <italic toggle="yes">J Clin Epidemiol.</italic>
</source>
                    <year>2023</year>;<volume>159</volume>:<fpage>225</fpage>&#x2013;<lpage>234</lpage>.
                    <pub-id pub-id-type="pmid">37271424</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.jclinepi.2023.05.017</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ivaldi</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Burgos</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Oltra</surname>
                            <given-names>G</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Adherence to PRISMA 2020 statement assessed through the expanded checklist in systematic reviews of interventions: A meta-epidemiological study.</article-title>
                    <source>

                        <italic toggle="yes">Cochrane Evidence Synthesis and Methods.</italic>
</source>
                    <year>2024</year>;<volume>2</volume>(<issue>5</issue>):<fpage>e12074</fpage>.
                    <pub-id pub-id-type="pmid">40476264</pub-id>
                    <pub-id pub-id-type="doi">10.1002/cesm.12074</pub-id>
                    <pub-id pub-id-type="pmcid">PMC11795886</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Turner</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Shamseer</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Altman</surname>
                            <given-names>DG</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Consolidated standards of reporting trials (CONSORT) and the completeness of reporting of randomised controlled trials (RCTs) published in medical journals.</article-title>
                    <source>

                        <italic toggle="yes">Cochrane Database Syst Rev.</italic>
</source>
                    <year>2012</year>;<volume>11</volume>(<issue>11</issue>):<fpage>MR000030</fpage>.</mixed-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Woelfle</surname>
                            <given-names>T</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Hirt</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Janiaud</surname>
                            <given-names>P</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Benchmarking Human&#x2013;AI collaboration for common evidence appraisal tools.</article-title>
                    <source>

                        <italic toggle="yes">J Clin Epidemiol.</italic>
</source>
                    <year>2024</year>;<volume>175</volume>:<fpage>111533</fpage>.
                    <pub-id pub-id-type="pmid">39277058</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.jclinepi.2024.111533</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Schilsky</surname>
                            <given-names>RL</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Page</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Development and Validation of a Natural Language Processing Tool to Generate the CONSORT Reporting Checklist for Randomized Clinical Trials.</article-title>
                    <source>

                        <italic toggle="yes">JAMA Netw Open.</italic>
</source>
                    <year>2020</year>;<volume>3</volume>(<issue>10</issue>):<fpage>e2014661</fpage>.
                    <pub-id pub-id-type="pmid">33030549</pub-id>
                    <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.14661</pub-id>
                    <pub-id pub-id-type="pmcid">PMC7545295</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Jiang</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Vorland</surname>
                            <given-names>CJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ying</surname>
                            <given-names>X</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>SPIRIT-CONSORT-TM: a corpus for assessing transparency of clinical trial protocol and results publications.</article-title>
                    <source>

                        <italic toggle="yes">Scientific Data.</italic>
</source>
                    <year>2025</year>;<volume>12</volume>(<issue>1</issue>):<fpage>355</fpage>.
                    <pub-id pub-id-type="pmid">40021657</pub-id>
                    <pub-id pub-id-type="doi">10.1038/s41597-025-04629-1</pub-id>
                    <pub-id pub-id-type="pmcid">PMC11871027</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Thirunavukarasu</surname>
                            <given-names>AJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ting</surname>
                            <given-names>DSJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Elangovan</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Large language models in medicine.</article-title>
                    <source>

                        <italic toggle="yes">Nature medicine.</italic>
</source>
                    <year>2023</year>;<volume>29</volume>(<issue>8</issue>):<fpage>1930</fpage>&#x2013;<lpage>1940</lpage>.
                    <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wrightson</surname>
                            <given-names>JG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Blazey</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Moher</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>GPT for RCTs? Using AI to determine adherence to clinical trial reporting guidelines.</article-title>
                    <source>

                        <italic toggle="yes">BMJ Open.</italic>
</source>
                    <year>2025</year>;<volume>15</volume>(<issue>3</issue>):<fpage>e088735</fpage>.
                    <pub-id pub-id-type="pmid">40107689</pub-id>
                    <pub-id pub-id-type="doi">10.1136/bmjopen-2024-088735</pub-id>
                    <pub-id pub-id-type="pmcid">PMC11927406</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Chen</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Khoshkish</surname>
                            <given-names>E</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>AutoReporter: Development of an artificial intelligence tool for automated assessment of research reporting guideline adherence.</article-title>
                    <source>

                        <italic toggle="yes">medRxiv.</italic>
</source>
                    <year>2025</year>;<fpage>2025.04.18.25326076</fpage>.</mixed-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Forero</surname>
                            <given-names>DA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Abreu</surname>
                            <given-names>SE</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tovar</surname>
                            <given-names>BE</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Large Language Models and the Analyses of Adherence to Reporting Guidelines in Systematic Reviews and Overviews of Reviews (PRISMA 2020 and PRIOR).</article-title>
                    <source>

                        <italic toggle="yes">Journal of Medical Systems.</italic>
</source>
                    <year>2025</year>;<volume>49</volume>(<issue>1</issue>):<fpage>80</fpage>.
                    <pub-id pub-id-type="pmid">40504403</pub-id>
                    <pub-id pub-id-type="doi">10.1007/s10916-025-02212-0</pub-id>
                    <pub-id pub-id-type="pmcid">PMC12162794</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>He</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rungta</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Koleczek</surname>
                            <given-names>D</given-names>
                        </name>

                        <etal/>
</person-group>;
                    <article-title>Does prompt formatting have any impact on LLM performance?</article-title>
                    <source>

                        <italic toggle="yes">arXiv preprint arXiv:2411.10541.</italic>
</source>
                    <year>2024</year>.</mixed-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Moher</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Shamseer</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Clarke</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Preferred reporting items for systematic review and meta-analysis protocols (PRISMA-P) 2015 statement.</article-title>
                    <source>

                        <italic toggle="yes">Syst Rev.</italic>
</source>
                    <year>2015</year>;<volume>4</volume>(<issue>1</issue>):<fpage>1</fpage>.
                    <pub-id pub-id-type="doi">10.1186/2046-4053-4-1</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <mixed-citation publication-type="other">
                    <collab>EQUATOR Network - How to develop a reporting guideline</collab>: 
(accessed on 11 Feb 2026).
                    <ext-link ext-link-type="uri" xlink:href="https://www.equator-network.org/toolkits/developing-a-reporting-guideline/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Srinivasan</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Berkowitz</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Friedrich</surname>
                            <given-names>NA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Large Language Model Analysis of Reporting Quality of Randomized Clinical Trial Articles: A Systematic Review.</article-title>
                    <source>

                        <italic toggle="yes">JAMA Network Open.</italic>
</source>
                    <year>2025</year>;<volume>8</volume>(<issue>8</issue>):<fpage>e2529418</fpage>.
                    <pub-id pub-id-type="pmid">40875232</pub-id>
                    <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.29418</pub-id>
                    <pub-id pub-id-type="pmcid">PMC12395317</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Alharbi</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Asiri</surname>
                            <given-names>S</given-names>
                        </name>
</person-group>:
                    <article-title>Automated Assessment of Reporting Completeness in Orthodontic Research Using LLMs: An Observational Study.</article-title>
                    <source>

                        <italic toggle="yes">Applied Sciences.</italic>
</source>
                    <year>2024</year>;<volume>14</volume>(<issue>22</issue>):<fpage>10323</fpage>.
                    <pub-id pub-id-type="doi">10.3390/app142210323</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kataoka</surname>
                            <given-names>Y</given-names>
                        </name>

                        <name name-style="western">
                            <surname>So</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Banno</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Large language models for automated PRISMA 2020 adherence checking.</article-title>
                    <source>

                        <italic toggle="yes">arXiv preprint arXiv:2511.16707.</italic>
</source>
                    <year>2025</year>.</mixed-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>He</surname>
                            <given-names>Z</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bian</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhu</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Evaluating the Ability of Large Language Models to Identify Adherence to CONSORT Reporting Guidelines in Randomized Controlled Trials: A Methodological Evaluation Study.</article-title>
                    <source>

                        <italic toggle="yes">arXiv preprint arXiv:2511.13107.</italic>
</source>
                    <year>2025</year>.</mixed-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Zeng</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Clark</surname>
                            <given-names>DP</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Artificial intelligence tools for automating assessments of reporting guideline adherence: a protocol for a systematic review.</article-title>
                    <source>

                        <italic toggle="yes">OSF.</italic>
</source>
                    <year>2026</year>.
                    <pub-id pub-id-type="doi">10.17605/OSF.IO/AYSTK</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Harris</surname>
                            <given-names>PA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Taylor</surname>
                            <given-names>R</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Thielke</surname>
                            <given-names>R</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Research electronic data capture (REDCap) - a metadata-driven methodology and workflow process for providing translational research informatics support.</article-title>
                    <source>

                        <italic toggle="yes">Journal of Biomedical Informatics.</italic>
</source>
                    <year>2009</year>;<volume>42</volume>(<issue>2</issue>):<fpage>377</fpage>&#x2013;<lpage>381</lpage>.
                    <pub-id pub-id-type="pmid">18929686</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.jbi.2008.08.010</pub-id>
                    <pub-id pub-id-type="pmcid">PMC2700030</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Moons</surname>
                            <given-names>KGM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Damen</surname>
                            <given-names>JAA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kaul</surname>
                            <given-names>T</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>PROBAST+AI: an updated quality, risk of bias, and applicability assessment tool for prediction models using regression or artificial intelligence methods.</article-title>
                    <source>

                        <italic toggle="yes">BMJ.</italic>
</source>
                    <year>2025</year>;<volume>388</volume>:<fpage>e082505</fpage>.
                    <pub-id pub-id-type="pmid">40127903</pub-id>
                    <pub-id pub-id-type="doi">10.1136/bmj-2024-082505</pub-id>
                    <pub-id pub-id-type="pmcid">PMC11931409</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Albarqouni</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sondrup</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ostengaard</surname>
                            <given-names>L</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Artificial Intelligence tools for Risk of Bias assessment in systematic reviews (AI4RoB): a protocol for a living systematic review.</article-title>
                    <source>

                        <italic toggle="yes">OSF.</italic>
</source>
                    <year>2025</year>.
                    <pub-id pub-id-type="doi">10.17605/OSF.IO/RDEZ3</pub-id>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report480816">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.198323.r480816</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Marques-Cruz</surname>
                        <given-names>Manuel</given-names>
                    </name>
                    <xref ref-type="aff" rid="r480816a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-9827-2551</uri>
                </contrib>
                <aff id="r480816a1">
                    <label>1</label>University of Porto, Porto, Porto District, Portugal</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>9</day>
                <month>6</month>
                <year>2026</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2026 Marques-Cruz M</copyright-statement>
                <copyright-year>2026</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport480816" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.179775.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>Dear authors,</p>
            <p> First of all, I would like to congratulate you on a (unsurprisingly) well-designed protocol for a systematic review.</p>
            <p> I do not see any major flaws with the study design you presented. However, I do have some thoughts I would like to share regarding some methodological choices, that I would like you to consider.</p>
            <p> </p>
            <p> 1. One difficulty in defining AI is where to draw the line between deterministic approaches and &#x201c;real&#x201d; &#x201c;computational systems capable of performing tasks that typically require human intelligence&#x201d;. Some rule-based models could therefore be classified as either AI or not-AI. I would suggest the authors constrain a little more the methods that they are considering as AI.</p>
            <p> </p>
            <p> 2. Building on the first point, the search strategy may need to be revised: (i) not all NLP methods will have been described as &#x201c;NLP&#x201d; or equivalent anywhere on the records to retrieve; (ii) if you imply that some authors may be defining the company/chatbot/commercial model used (gpt, claude, gemini) instead of the use of LLMs as methodology, then you must acknowledge the existence of other models that are equally LLMs (such as mistral, deepseek, llama, qwen,&#x2026;); (iii) not all rule-based (not-NLP) methods will fit on the &#x201c;machine learning or deep learning or supervised learning or unsupervised learning&#x201d; either.</p>
            <p> </p>
            <p> 3. Regarding the concept of reporting guidelines, there is an analogous situation. While defining a reporting guideline as &#x201c;any document presenting reporting items that should appear in a research paper&#x201d;, you may not have exhausted all possible descriptors for this in the search strategy.</p>
            <p> </p>
            <p> 4. Concept 2 of the search strategy may also lead to loss of important records. There are other terms that may be used to define the use of automated methods to assess reporting guidelines, such as: &#x201c;application&#x201d;, &#x201c;implementation&#x201d;, &#x2026;</p>
            <p> </p>
            <p> 5. The main concern I tried to express is that you may end up not fulfilling your objective of doing a systematic review (which should be the synthesis of all available evidence) on AI (which was ill-defined) use on reporting guidelines (also ill-defined). I would be more keen on reporting these shortcomings in defining AI and reporting guidelines by assuming a focus on specific AI methods and specific reporting guidelines (which the search strategy already shows).</p>
            <p> </p>
            <p> 6. A second high concern is that, while aiming to synthesise all evidence, you may end up doing not a systematic review but a more superficial metadata analysis of these records (in line with a scoping, rather than a systematic, review). I am not convinced that it will not happen, based on the proposed data extraction form.</p>
            <p>Is the study design appropriate for the research question?</p>
            <p>Yes</p>
            <p>Is the rationale for, and objectives of, the study clearly described?</p>
            <p>Yes</p>
            <p>Are sufficient details of the methods provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Are the datasets clearly presented in a useable and accessible format?</p>
            <p>Not applicable</p>
            <p>Reviewer Expertise:</p>
            <p>Health Data Science, Machine Learning, Artificial Intelligence (Large Language Models), Evidence Synthesis.</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report482643">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.198323.r482643</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Celestin</surname>
                        <given-names>Mbonigaba</given-names>
                    </name>
                    <xref ref-type="aff" rid="r482643a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-7381-8888</uri>
                </contrib>
                <aff id="r482643a1">
                    <label>1</label>Brainae University, Delaware, USA</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>25</day>
                <month>5</month>
                <year>2026</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2026 Celestin M</copyright-statement>
                <copyright-year>2026</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport482643" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.179775.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>
                <bold>1. Abstract</bold>
            </p>
            <p> 
                <bold>1.1: Absence of protocol registration creates transparency concern</bold>
            </p>
            <p> The abstract omits the fact that the review was not prospectively registered. This omission weakens methodological transparency.</p>
            <p> The manuscript later states:</p>
            <p> </p>
            <p> &#x201c;We have not registered the review.&#x201d;</p>
            <p> </p>
            <p> For a high impact systematic review protocol, failure to register in PROSPERO, OSF registration before commencement, or INPLASY reduces confidence in protocol immutability and selective reporting control.</p>
            <p> </p>
            <p> Authors should explicitly justify non-registration in the abstract and methods.</p>
            <p> </p>
            <p> 
                <bold>2. Introduction</bold>
            </p>
            <p> 
                <bold>2.1: Conceptual distinction between &#x201c;reporting adherence&#x201d; and &#x201c;reporting quality&#x201d; is insufficiently clarified</bold>
            </p>
            <p> The manuscript repeatedly treats adherence and quality as closely related constructs without explicitly distinguishing them.</p>
            <p> This is problematic because: 
                <list list-type="bullet">
                    <list-item>
                        <p>reporting completeness &#x2260; methodological quality</p>
                    </list-item>
                    <list-item>
                        <p>AI tools may identify textual presence without assessing epistemic adequacy</p>
                    </list-item>
                </list> This conceptual distinction is essential in meta-research methodology.</p>
            <p> The introduction should explicitly define: 
                <list list-type="bullet">
                    <list-item>
                        <p>reporting adherence</p>
                    </list-item>
                    <list-item>
                        <p>reporting completeness</p>
                    </list-item>
                    <list-item>
                        <p>reporting quality</p>
                    </list-item>
                    <list-item>
                        <p>reporting transparency</p>
                    </list-item>
                </list> and explain their boundaries.</p>
            <p> 
                <bold>2.3: Overstatement of scalability benefits without acknowledging computational limitations</bold>
            </p>
            <p> The manuscript strongly promotes scalability benefits of AI but does not sufficiently discuss: 
                <list list-type="bullet">
                    <list-item>
                        <p>API cost barriers</p>
                    </list-item>
                    <list-item>
                        <p>GPU dependency</p>
                    </list-item>
                    <list-item>
                        <p>token limitations</p>
                    </list-item>
                    <list-item>
                        <p>multimodal processing failures</p>
                    </list-item>
                    <list-item>
                        <p>hallucination-induced false positives</p>
                    </list-item>
                </list> A balanced discussion requires both opportunities and structural limitations.</p>
            <p> 
                <bold>3. Methods</bold>
            </p>
            <p> 
                <bold>3.1 PRISMA and Protocol Registration</bold>
            </p>
            <p> 
                <bold>3.1.1: Non-registration is a major methodological weakness</bold>
            </p>
            <p> The statement:</p>
            <p> &#x201c;We have not registered the review.&#x201d;</p>
            <p> is a serious concern for a high-impact systematic review protocol.</p>
            <p> This creates risk regarding: 
                <list list-type="bullet">
                    <list-item>
                        <p>protocol deviation</p>
                    </list-item>
                    <list-item>
                        <p>outcome switching</p>
                    </list-item>
                    <list-item>
                        <p>selective inclusion</p>
                    </list-item>
                    <list-item>
                        <p>post hoc methodological adaptation</p>
                    </list-item>
                </list> At minimum, the authors should: 
                <list list-type="bullet">
                    <list-item>
                        <p>register retrospectively on OSF</p>
                    </list-item>
                    <list-item>
                        <p>provide timestamped protocol freeze</p>
                    </list-item>
                    <list-item>
                        <p>justify why registration was omitted</p>
                    </list-item>
                </list>
            </p>
            <p> 
                <bold>3.2 Eligibility Criteria</bold>
            </p>
            <p> 
                <bold>3.2.1: Inclusion criteria for AI systems are excessively broad</bold>
            </p>
            <p> The manuscript includes: 
                <list list-type="bullet">
                    <list-item>
                        <p>rule-based systems</p>
                    </list-item>
                    <list-item>
                        <p>BERT-like models</p>
                    </list-item>
                    <list-item>
                        <p>LLMs</p>
                    </list-item>
                    <list-item>
                        <p>VLMs</p>
                    </list-item>
                    <list-item>
                        <p>tools without explicit comparator</p>
                    </list-item>
                    <list-item>
                        <p>tools without judgment generation</p>
                    </list-item>
                </list> This breadth creates severe heterogeneity risk.</p>
            <p> The authors must define: 
                <list list-type="bullet">
                    <list-item>
                        <p>minimum AI capability threshold</p>
                    </list-item>
                    <list-item>
                        <p>operational definition of &#x201c;AI tool&#x201d;</p>
                    </list-item>
                    <list-item>
                        <p>distinction between extraction systems and evaluative systems</p>
                    </list-item>
                </list> Otherwise, synthesis validity may become compromised.</p>
            <p> 
                <bold>3.2.2: No restriction on publication type introduces high risk of low-quality evidence inclusion</bold>
            </p>
            <p> Including: 
                <list list-type="bullet">
                    <list-item>
                        <p>preprints</p>
                    </list-item>
                    <list-item>
                        <p>conference abstracts</p>
                    </list-item>
                    <list-item>
                        <p>non-peer-reviewed studies</p>
                    </list-item>
                    <list-item>
                        <p>arXiv manuscripts</p>
                    </list-item>
                </list> without a weighting strategy threatens evidentiary consistency.</p>
            <p> The protocol lacks: 
                <list list-type="bullet">
                    <list-item>
                        <p>publication quality stratification</p>
                    </list-item>
                    <list-item>
                        <p>sensitivity analysis excluding preprints</p>
                    </list-item>
                    <list-item>
                        <p>risk weighting by peer-review status</p>
                    </list-item>
                </list> This should be added.</p>
            <p> 
                <bold>3.2.3: Comparator definition is methodologically vague</bold>
            </p>
            <p> The manuscript states:</p>
            <p> &#x201c;Studies without an explicit comparator will also be eligible.&#x201d;</p>
            <p> This creates a major evaluation problem because: 
                <list list-type="bullet">
                    <list-item>
                        <p>tool performance becomes uninterpretable</p>
                    </list-item>
                    <list-item>
                        <p>no benchmark validity exists</p>
                    </list-item>
                    <list-item>
                        <p>internal claims cannot be verified</p>
                    </list-item>
                </list> The authors should justify how performance validity will be interpreted in non-comparator studies.</p>
            <p> 
                <bold>3.3 Search Strategy</bold>
            </p>
            <p> 
                <bold>3.3.1: AI terminology search coverage may be insufficient</bold>
            </p>
            <p> The protocol does not clearly indicate whether search terms include: 
                <list list-type="bullet">
                    <list-item>
                        <p>generative AI</p>
                    </list-item>
                    <list-item>
                        <p>foundation models</p>
                    </list-item>
                    <list-item>
                        <p>transformer models</p>
                    </list-item>
                    <list-item>
                        <p>GPT</p>
                    </list-item>
                    <list-item>
                        <p>Gemini</p>
                    </list-item>
                    <list-item>
                        <p>Claude</p>
                    </list-item>
                    <list-item>
                        <p>retrieval augmented generation</p>
                    </list-item>
                    <list-item>
                        <p>prompt engineering</p>
                    </list-item>
                </list> Given the rapid evolution of terminology, missing these terms risks retrieval bias.</p>
            <p> 
                <bold>3.4 Study Selection</bold>
            </p>
            <p> 
                <bold>3.4.1: No mention of calibration exercises for reviewers</bold>
            </p>
            <p> The protocol does not specify: 
                <list list-type="bullet">
                    <list-item>
                        <p>pilot screening agreement</p>
                    </list-item>
                    <list-item>
                        <p>calibration threshold</p>
                    </list-item>
                    <list-item>
                        <p>kappa agreement target</p>
                    </list-item>
                    <list-item>
                        <p>reviewer training procedures</p>
                    </list-item>
                </list> This weakens reproducibility and consistency assurance.</p>
            <p> 
                <bold>3.5 Data Extraction</bold>
            </p>
            <p> 
                <bold>3.5.1: Extraction framework is underdeveloped for AI reproducibility assessment</bold>
            </p>
            <p> The planned extraction omits critical AI reproducibility variables such as: 
                <list list-type="bullet">
                    <list-item>
                        <p>model version</p>
                    </list-item>
                    <list-item>
                        <p>inference temperature</p>
                    </list-item>
                    <list-item>
                        <p>random seed</p>
                    </list-item>
                    <list-item>
                        <p>API version</p>
                    </list-item>
                    <list-item>
                        <p>hardware dependency</p>
                    </list-item>
                    <list-item>
                        <p>context window size</p>
                    </list-item>
                    <list-item>
                        <p>prompt chaining</p>
                    </list-item>
                    <list-item>
                        <p>retrieval augmentation use</p>
                    </list-item>
                </list> These variables are essential for AI methodological interpretation.</p>
            <p> 
                <bold>3.5.2: No extraction of dataset governance characteristics</bold>
            </p>
            <p> The protocol ignores: 
                <list list-type="bullet">
                    <list-item>
                        <p>dataset licensing</p>
                    </list-item>
                    <list-item>
                        <p>annotation provenance</p>
                    </list-item>
                    <list-item>
                        <p>copyright status</p>
                    </list-item>
                    <list-item>
                        <p>benchmark contamination risks</p>
                    </list-item>
                </list> These are highly important in AI evaluation research.</p>
            <p> 
                <bold>3.7 Data Synthesis</bold>
            </p>
            <p> 
                <bold>3.7.1: Statistical synthesis plan is underdeveloped</bold>
            </p>
            <p> The protocol states that meta-analysis is unlikely to be feasible but does not specify: 
                <list list-type="bullet">
                    <list-item>
                        <p>criteria for determining feasibility</p>
                    </list-item>
                    <list-item>
                        <p>heterogeneity thresholds</p>
                    </list-item>
                    <list-item>
                        <p>subgroup analysis plan</p>
                    </list-item>
                    <list-item>
                        <p>meta-regression possibilities</p>
                    </list-item>
                    <list-item>
                        <p>publication bias assessment</p>
                    </list-item>
                </list> This creates analytical incompleteness.</p>
            <p> 
                <bold>3.7.2: Forest plots are proposed without clear effect size harmonization strategy</bold>
            </p>
            <p> The manuscript proposes forest plots despite highly heterogeneous metrics: 
                <list list-type="bullet">
                    <list-item>
                        <p>accuracy</p>
                    </list-item>
                    <list-item>
                        <p>F1</p>
                    </list-item>
                    <list-item>
                        <p>sensitivity</p>
                    </list-item>
                    <list-item>
                        <p>agreement</p>
                    </list-item>
                    <list-item>
                        <p>kappa</p>
                    </list-item>
                </list> Without standardization, pooled visual interpretation may become misleading.</p>
            <p> The authors should define: 
                <list list-type="bullet">
                    <list-item>
                        <p>standardized performance metrics</p>
                    </list-item>
                    <list-item>
                        <p>transformation methods</p>
                    </list-item>
                    <list-item>
                        <p>normalization approach</p>
                    </list-item>
                </list>
            </p>
            <p> 
                <bold>4. Discussion</bold>
            </p>
            <p> 
                <bold>4.1: Ethical and governance implications are insufficiently explored</bold>
            </p>
            <p> Major missing themes: 
                <list list-type="bullet">
                    <list-item>
                        <p>AI replacing peer reviewers</p>
                    </list-item>
                    <list-item>
                        <p>editorial accountability</p>
                    </list-item>
                    <list-item>
                        <p>bias amplification</p>
                    </list-item>
                    <list-item>
                        <p>automated gatekeeping risks</p>
                    </list-item>
                    <list-item>
                        <p>transparency obligations in AI-assisted review</p>
                    </list-item>
                </list> These issues are central to responsible AI deployment.</p>
            <p> 
                <bold>5. References and Citation Audit</bold>
            </p>
            <p> 
                <bold>5.1: Several references appear problematic or potentially unverifiable</bold>
            </p>
            <p> The manuscript contains references that require verification because DOI, indexing, or stable retrieval evidence is unclear.</p>
            <p> Potentially problematic references include:</p>
            <p> 
                <bold>Reference 16</bold>
            </p>
            <p> &#x201c;Chen D, Li P, Khoshkish E, et al.: AutoReporter...&#x201d;</p>
            <p> medRxiv. 2025; 2025.04.18.25326076.</p>
            <p> Needs verification: 
                <list list-type="bullet">
                    <list-item>
                        <p>DOI not shown</p>
                    </list-item>
                    <list-item>
                        <p>unstable preprint identification format</p>
                    </list-item>
                    <list-item>
                        <p>unclear peer-review status</p>
                    </list-item>
                </list>
            </p>
            <p> 
                <bold>Reference 18</bold>
            </p>
            <p> &#x201c;He J, Rungta M, Koleczek D, et al.; Does prompt formatting have any impact on llm performance?. arXiv preprint arXiv:241110541.&#x201d;</p>
            <p> Problems: 
                <list list-type="bullet">
                    <list-item>
                        <p>arXiv identifier formatting appears incorrect</p>
                    </list-item>
                    <list-item>
                        <p>should likely contain decimal structure</p>
                    </list-item>
                    <list-item>
                        <p>title formatting inconsistent</p>
                    </list-item>
                    <list-item>
                        <p>capitalization inconsistent</p>
                    </list-item>
                </list> This reference requires correction and verification.</p>
            <p> 
                <bold>Reference 23</bold>
            </p>
            <p> &#x201c;Kataoka Y, So R, Banno M, et al.: Large language models for automated PRISMA 2020 adherence checking. arXiv preprint arXiv:251116707. 2025.&#x201d;</p>
            <p> Potential issue: 
                <list list-type="bullet">
                    <list-item>
                        <p>arXiv identifier format likely invalid</p>
                    </list-item>
                </list>
            </p>
            <p> 
                <bold>Reference 24</bold>
            </p>
            <p> &#x201c;He Z, Bian M, Zhu J, et al.: Evaluating the Ability of Large Language Models...&#x201d;</p>
            <p> arXiv preprint arXiv:251113107. 2025.</p>
            <p> Potential issue: 
                <list list-type="bullet">
                    <list-item>
                        <p>malformed arXiv identifier</p>
                    </list-item>
                </list>
            </p>
            <p> 
                <bold>5.2: Inconsistent citation formatting</bold>
            </p>
            <p> Several references show inconsistent formatting regarding: 
                <list list-type="bullet">
                    <list-item>
                        <p>title capitalization</p>
                    </list-item>
                    <list-item>
                        <p>journal style</p>
                    </list-item>
                    <list-item>
                        <p>DOI presentation</p>
                    </list-item>
                    <list-item>
                        <p>punctuation</p>
                    </list-item>
                    <list-item>
                        <p>URL presentation</p>
                    </list-item>
                </list> The reference list requires full standardization.</p>
            <p>Is the study design appropriate for the research question?</p>
            <p>Partly</p>
            <p>Is the rationale for, and objectives of, the study clearly described?</p>
            <p>Yes</p>
            <p>Are sufficient details of the methods provided to allow replication by others?</p>
            <p>Partly</p>
            <p>Are the datasets clearly presented in a useable and accessible format?</p>
            <p>Not applicable</p>
            <p>Reviewer Expertise:</p>
            <p>AI</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
    </sub-article>
</article>
