<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.130042.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Research Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Predicting diabetic ketoacidosis in pediatric patients using machine learning</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: 1 approved with reservations]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Eid</surname>
                        <given-names>Waad Mohammed</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-3874-0667</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Alharthi</surname>
                        <given-names>Hana</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="corresp" rid="c2">b</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Aslam</surname>
                        <given-names>Nida</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Abdur rab</surname>
                        <given-names>Irfan Ullah</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Madani</surname>
                        <given-names>Alaa</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Resources</role>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Department of Health Information Management Technology, College of Public Health, Imam Abdulrahman Bin Faisal University, Dammam, Saudi Arabia</aff>
                <aff id="a2">
                    <label>2</label>Department of Computer Science, College of Computer Science and Information Technology, Imam Abdulrahman Bin Faisal University, Dammam, Saudi Arabia</aff>
                <aff id="a3">
                    <label>3</label>Department of Health Education, King Fahad Medical City, Riyadh, Saudi Arabia</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:waadeid@gmail.com">waadeid@gmail.com</email>
                </corresp>
                <corresp id="c2">
                    <label>b</label>
                    <email xlink:href="mailto:halharthi@iau.edu.sa">halharthi@iau.edu.sa</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>6</day>
                <month>6</month>
                <year>2023</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2023</year>
            </pub-date>
            <volume>12</volume>
            <elocation-id>611</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>14</day>
                    <month>2</month>
                    <year>2023</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2023 Eid WM et al.</copyright-statement>
                <copyright-year>2023</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/12-611/pdf"/>
            <abstract>
                <p>Background</p>
                <p>Machine learning is a powerful tool to define relationships between large data variables through computing algorithms. In medicine, machine learning can find the association between a given disease and disease-related complications such as the relationship between Diabetes and development of diabetic ketoacidosis (DKA). The aim of this study is to develop and evaluate a predicting model for diabetic ketoacidosis among pediatric cases to define the leading factors that can predict diabetic ketoacidosis.</p>
                <p>Methods</p>
                <p>We evaluated the medical records of 3737 pediatric patients between the ages of 0 and 18 years who attended diabetic clinics and were diagnosed with diabetes. After the initial data preprocessing, we used Orange, an open source software, for data visualization, and machine learning for data analysis. The study used six prediction models: Decision Tree, Random Forest, kNN, Gradient Boosting, CN2 rule inducer and AdaBoost. Data imbalance was managed using oversampling technique. Variables analyzed included age, sex, hemoglobin A1C level, visits to the diabetic education clinic, and number of appointments to diabetic clinic. Models were evaluated based on the Area under the Curve (AUC), accuracy, precision, recall and F1-score using the stratified 5-fold cross validation technique.</p>
                <p>Results</p>
                <p>The results show that the Random Forest is the highest performance classifier (AUC=0.98; F1 score=0.92; and recall=0.93). Furthermore, HbA1c was the most contributing factor to the prediction model.</p>
                <p>Conclusion</p>
                <p>This study shows the importance and effectiveness of machine learning modeling to predict the association between diabetes and the development of DKA. Flagging those patients who are at a higher risk of developing DKA provides a better point of care for these patients.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>Predictive analytics</kwd>
                <kwd>classifiers</kwd>
                <kwd>Diabetes</kwd>
                <kwd>DKA</kwd>
            </kwd-group>
            <funding-group>
                <funding-statement>The author(s) declared that no grants were involved in supporting this work.</funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec id="sec1" sec-type="intro">
            <title>Introduction</title>
            <p>Type 1 diabetes, an autoimmune disease of insulin resistance, is predicted to affect one person per 10 individuals in the world by the year 2040.
                <sup>
                    <xref ref-type="bibr" rid="ref1">1</xref>
                </sup> It was formerly known as juvenile diabetes because it is typically diagnosed during childhood. Complications with diabetes can adversely affect multiple organs such as the heart, the brain, the kidneys, eyes, and even the limbs, such as diabetic foot ulcers that can lead to foot amputations. Uncontrolled diabetes increases the risk of Alzheimer&#x2019;s disease.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> One of the most serious complications of type 1 diabetes is ketoacidosis (DKA). DKA occurs when the body has high levels of sugar for a long period of time and the body then produces blood acids called ketones. Ketoacidosis can disrupt the normal body workflow which causes serious complications such as pulmonary and cerebral edema, hypokalemia and organ damages.
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> DKA can cause neurocognitive impairment in children, such as memory loss, poor concentration, and/or deficits in learning and emotional connection.
                <sup>
                    <xref ref-type="bibr" rid="ref4">4</xref>
                </sup> High occurrence of DKA can also increase patients&#x2019; admission to the hospital which results in higher management cost which creates an economic burden on the healthcare system.
                <sup>
                    <xref ref-type="bibr" rid="ref5">5</xref>
                </sup> Machine learning is the scientific branch of artificial intelligence that focuses on how computers learn from data to define relationships between data variables through computing algorithms.
                <sup>
                    <xref ref-type="bibr" rid="ref6">6</xref>
                </sup> In medicine, machine learning can be used to study diagnosis and disease patterns in large patient datasets. For example, machine learning can predict how fast a disease can develop. Also, it can predict which patients are at a higher risk of developing a condition or disease progression. These predictions can support physicians in their point of care decisions, whether it is preventive care or disease management, to provide a high level of care to these patients to improve healthcare outcomes.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> As such, machine learning can be used to flag patients with health risks and enable the healthcare team to provide the best course of treatment for their patients. One study used machine learning to predict the likelihood of diabetes occurrence in patients. Specifically, three classification algorithms, decision tree (DT), support vector machine (SVM), and naive Bayes (NB), were employed. The data used was a diabetes dataset named PIDD, which was taken from the UCI machine learning repository. The data included 768 female patients, each labeled with one of two values: 1 as positive for diabetes and 0 as negative. In addition, multiple attributes, or risk factors, were included, such as number of times pregnant, plasma glucose concentration, diastolic blood pressure, skinfold thickness, 2-hour serum insulin level, BMI ratio, diabetes pedigree function and age. Moreover, the researchers tested the three algorithms&#x2019; performance by evaluating precision, accuracy, F-measure and recall. The results showed that NB had the highest accuracy level, 76.30%, in comparison with the other algorithms.
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> However, the attributes included in the study did not include known diagnostic tests such as hemoglobin A1C (a form of hemoglobin that is chemically linked to a sugar), random blood glucose, and fasting blood glucose; this requires further research using the same algorithms and models. Another study used National Health and Nutrition Examination Survey (NHANES) data to predict patients at risk of diabetes and cardiovascular diseases.
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> NHANES is a comprehensive national program in the United States to assess the status of health and nutrition among its population. Data from NHANES were used to predict diabetes and cardiovascular diseases. In the study, scientists used different models, such as SVM, RF, GBT and WEM, to classify patients at risk of diabetes and cardiovascular diseases. They provided the program with training data that contained the observations and the labels for the category of each observation. This gives the algorithm the ability to predict the output label associated with a new observation presented to the program. Results showed that machine learning models based on the survey data can provide an automated identification method for patients at risk for diabetes and cardiovascular diseases, and they were also able to identify major contributors to the prediction results.
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> Given that this study was based on extraction of variables from a national survey rather than electronic health records data, the findings underscore the challenges of dataset selection for machine learning, as data from surveys can point to findings that are different from those based on data extracted from electronic health records. Additional studies aimed to create a prediction program that can detect high-risk groups who are more likely to develop type 2 diabetes. One in particular used the Synthetic Minority Over-sampling Technique to balance the dataset and included six features (body mass index (BMI), diet, smoking, blood pressure, sex and geographic region). The study evaluated the algorithms using the balanced data with nine classifiers: Logistic Regression (LR), Average Perceptron (AP), Na&#x00ef;ve Bayes (NB), Neural Network (NN), Support Vector Machine (SVM), LD, Decision Jungle (DJ), Decision Forest (DF), and Boosted Decision Tree (BDT). The Decision Forest (DF) model had better performance than the other classifiers, with an accuracy rate of 83%. The results of this study can help to establish a web-based service to assess disease risk in preventative medical care.
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> Another study aimed to detect diabetic retinopathy using various classifiers, such as RF, kNN, SVM, LDA and RRF. The RF model showed the best performance among the classifiers, with an accuracy of 86%.
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup> Collectively, these studies underscore the potential of machine learning to be used in preventative medicine as well as in assistive decision making to improve healthcare. 
                <xref ref-type="table" rid="T1">Table 1</xref> summarizes the literature gap of machine learning in diabetic field.</p>
            <table-wrap id="T1" orientation="portrait" position="float">
                <label>Table 1. </label>
                <caption>
                    <title>Summary of the literature gap of machine learning in diabetic field.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Study (arranged in reference number)</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Models used</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Highest accuracy</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Number of features</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Sample size</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Target</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">8</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">DT, SVM, NB</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">76.30%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">9</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">768</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">Diabetes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">9</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">SVM, RF, GBT, WEM</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">73.7%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">9</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">5000</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">Diabetes</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">LR, AP, NB, NN, SVM, LD, DJ, DF, BDT</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">83%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">9</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">4896</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">Type 2 Diabetes Mellitus</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">11</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">LDA, SVM, KNN, RF, and Ranger Random Forest (RRF)</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">86%</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">14</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">327</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">Diabetic retinopathy</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <p>A project was conducted at Texas Children&#x2019;s Hospital in the United States to provide the best care for high-risk patients with type 1 diabetes. The hospital developed a model using machine learning classifiers that can predict the occurrence of DKA. The project aimed to reduce the number of hospitalizations related to diabetes or DKA from 9.5% to 5% by the year 2018 and to reduce the admissions of DKA by at least 1% every year to reach a goal of maximum 5% DKA admissions per year. A predicting risk model for DKA was developed. The model used data such as risk index for poor glycemic control (RIPGC), socioeconomic status, and clinical data such as fasting blood glucose level, hemoglobin A1C, and number of clinical visits per year. The team then proceeded with developing a risk stratification tool and divided patients into four tiers: high risk, moderate risk, mild risk, and lowest risk. They then provided care according to their risk prediction model. This targeted approach decreased recurrent DKA admissions by 30.9% per year and resulted in a higher documentation rate of RIPGC in the electronic system, in addition to a risk index for DKA for all the patients.
                <sup>
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup>
            </p>
            <p>In this study we used machine learning as a tool to predict DKA occurrence among a pediatric population and identify the most important factors in predicting DKA.</p>
        </sec>
        <sec id="sec2" sec-type="methods">
            <title>Methods</title>
            <p>This research was ethically reviewed and approved by the institutional review board at Imam Abdulrahman Bin Faisal University (IRB-PGS-2020-03-431). It was also approved by the institutional review board at King Fahad Medical City (IRB Log Number: 21-186E). This study is an experimental study that aimed to create a prediction model for diabetic ketoacidosis among pediatric cases and to find the most important factors predicting diabetic ketoacidosis. The target variable is DKA and the attributes are sex, age, HbA1c level, number of patient appointments in the diabetic clinic, number of patient appointments in the health education clinic, and the number of appointments at the health education clinic that the patient did not attend. The dataset included the medical records of pediatric patients aged 18 and younger who attended the diabetic clinic at King Fahad Medical City from January 2018 to December 2020. We excluded any patients who were above 18 at the time of data collection and patients who did not have any laboratory results registered in the system. The total sample size was 1537 patients.</p>
            <sec id="sec3">
                <title>Data pre-processing</title>
                <p>The dataset was received in Excel format from the King Fahad Medical City health information system in Riyadh, Saudi Arabia. It was divided into four sections. The first section was the list of appointments in the pediatric diabetes clinic, which also included whether the patient attended the appointment or was registered as a no-show, and the demographics (
                    <italic toggle="yes">e.g.</italic>, nationality, sex). The first section data size was 3737. The second section was the laboratory results of the patients, which contained their hemoglobin A1c levels. The third section was the list of patients who were diagnosed with DKA. The fourth section was the list of patients&#x2019; ages. All patient identifiers were removed to ensure patients&#x2019; privacy and confidentiality. We created a new spreadsheet to consolidate this information. It included the lab results, where duplicated medical record numbers (MRNs) were removed using the Remove Duplicates function in Excel. We used the VLOOKUP function, which looks up a value in the columns of a table and returns the value in the same row from a column which the user specifies. Using the patient&#x2019;s MRN, we matched the patients&#x2019; age, sex and DKA diagnosis to their laboratory results and the data size was reduced to 1543 data records. We used the COUNTIFS function in Excel, which counts the number of cells specified by a given set of conditions or criteria, to count the number of appointment visits 
                    <italic toggle="yes">versus</italic> no-shows. The variables assessed included sex, age, HbA1c level, number of appointments, number of health education clinic appointments and number of no-shows to health education clinic appointments. The target variable was DKA status, with two values: yes and no.</p>
            </sec>
            <sec id="sec4">
                <title>Model development and evaluation</title>
                <p>To analyze the data we used Orange Data Mining (RRID:SCR_019811) V3.30. Orange is an open source software which is used for data visualization, machine learning and data mining purposes. Several classifiers are available in Orange. Random Forest creates a set of decision trees. Every tree is created from a small sample from the training data. When the classifier develops an individual tree, a random subset of attributes is drawn and then the best attribute is selected. The final model is based on the majority vote of the individually developed trees in the forest. kNN uses an algorithm to search for the closest training examples in feature space and uses their average to form the prediction. AdaBoost is an algorithm that combines weak learners and adapts to each training sample. The CN2 rule inducer is a classification technique that produces simple, comprehensible rules.
                    <sup>
                        <xref ref-type="bibr" rid="ref13">13</xref>
                    </sup> Tree is a simple algorithm that splits the data into nodes. It is similar to Random Forest. Gradient Boosting is a technique that produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees.</p>
            </sec>
            <sec id="sec5">
                <title>Model evaluation</title>
                <p>To analyze descriptive statistics for the variables, we used the feature statistics tool in Orange. We also ranked the attributes to identify the factor contributing most to DKA among pediatric patients.</p>
            </sec>
            <sec id="sec6">
                <title>Data sampler</title>
                <p>The dataset showed an imbalance between DKA cases (17.5%) and non-DKA cases (82.5%). To balance the dataset, the data sampler tool in Orange V3.30 was used. This tool is used to develop different types of complementary samples from the input data. The fixed sample size method draws a set number of data instances with replacement, which means that it always samples from the entire dataset and does not exclude instances that are already in the sample. We also kept the sampling reproducible by checking the replicable sampling setting in the data sampler. This technique was used to oversample the DKA instances. The positive DKA instances were replicated to 1308 to equal the 1308 negative DKA instances and overcome the data imbalance. Furthermore, Python was used to oversample the data as a comparison method with the data sampler in Orange. The oversampling technique has been used in research which aimed to evaluate the performance of supervised learning algorithms on imbalanced class datasets.
                    <sup>
                        <xref ref-type="bibr" rid="ref14">14</xref>
                    </sup>
                </p>
            </sec>
            <sec id="sec7">
                <title>Cross validation</title>
                <p>We used stratified 5-fold cross validation, which is the default setting in Orange. This technique splits the dataset into N folds. In each of the N iterations, one fold is used for testing while the remaining N-1 folds are used for training. In the current study the dataset was divided into 5 stratified folds, and each fold contained an approximately equal number of samples for each class.</p>
            </sec>
            <sec id="sec8">
                <title>Confusion matrix</title>
                <p>This matrix shows the number and proportion of instances in the predicted and actual class. This allows the reporting of cases that were misclassified or were accurately classified.</p>
            </sec>
            <sec id="sec9">
                <title>Rank</title>
                <p>It scores variables that can be calculated using the information from the confusion matrix.</p>
            </sec>
            <sec id="sec10">
                <title>Area under the curve (AUC)</title>
                <p>AUC is a model performance evaluation metric that indicates the ability of the classifier to distinguish between classes. The higher the AUC score, the better the performance of the classifier in distinguishing between true positives and true negatives.</p>
            </sec>
            <sec id="sec11">
                <title>Classification accuracy (CA)</title>
                <p>Classification accuracy is a measure used to evaluate the performance of a classifier, calculated as the number of correct predictions divided by the total number of predictions.</p>
            </sec>
            <sec id="sec12">
                <title>Precision</title>
                <p>Precision is the proportion of true positives among the instances classified as positive.</p>
            </sec>
            <sec id="sec13">
                <title>Recall</title>
                <p>Recall is the proportion of correctly predicted positive-class samples among all the positive-class samples in the dataset.</p>
            </sec>
            <sec id="sec14">
                <title>F1-Score</title>
                <p>The F1-score represents the harmonic mean of precision and recall.</p>
            </sec>
        </sec>
        <sec id="sec15" sec-type="results">
            <title>Results</title>
            <p>Several Orange classifiers were used to predict the incidences of DKA among a pediatric cohort. 
                <xref ref-type="fig" rid="f1">Figure 1</xref> illustrates the workflow performed in Orange. The workflow begins with imported data followed by outlier extraction. After the extraction, we balanced the data using the data sampler widget. The data were then fed into the six classifiers and evaluated by area under the curve (AUC) (test and score widget). A total of 1536 patient data points were imported into the program. The data were imbalanced, with approximately 82% of instances in the non-DKA class. To overcome this imbalance, the data sampler tool in the Orange program was used. Moreover, Python was additionally used as another oversampling technique, which showed similar performance results to the Orange data sampler tool, with Random Forest being the best predicting model with an AUC higher than 0.9. Female and male distributions were approximately equal, as shown in 
                <xref ref-type="table" rid="T2">Table 2</xref> and 
                <xref ref-type="fig" rid="f2">Figure 2</xref>. The distribution of DKA incidence was balanced after applying the oversampling technique, as shown in 
                <xref ref-type="fig" rid="f3">Figure 3</xref>. For age, the youngest patient was 2 years old and the oldest was 18 years old with a mean of 12 years old. For the HbA1c the maximum level was 16.3 and the minimum was 4 with a mean of 9.99 as shown in 
                <xref ref-type="table" rid="T2">Table 2</xref>. To test the prediction performance on our data, we used six classifiers: Random Forest, AdaBoost, CN2 rule inducer, kNN, Gradient Boosting and Decision Tree. Using the test and score feature, we evaluated the classifiers&#x2019; prediction performance through cross-validation and the AUC score, as it is a highly reliable measure of performance. The results showed that Random Forest had the highest performance with an AUC score of 0.98, followed by AdaBoost and CN2 rule inducer with scores of 0.97 and 0.93, respectively, as shown in 
                <xref ref-type="table" rid="T3">Table 3</xref>. The confusion matrix for the performance of the Random Forest classifier is shown in 
                <xref ref-type="table" rid="T4">Table 4</xref>. The Random Forest classifier made 86.5% (n=885) correct predictions and 27.16% (n=330) false predictions for no incidence of DKA, and 88.6% (n=1077) correct predictions and 11.35% (n=138) false predictions for the incidence of DKA. Additionally, the HbA1c level was the attribute contributing most to the occurrence of DKA, followed by the health education appointments, as shown in 
                <xref ref-type="table" rid="T5">Table 5</xref>. Furthermore, sex was the least contributing attribute (
                <xref ref-type="table" rid="T5">Table 5</xref>).
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>Figure 1. </label>
                    <caption>
                        <title>Workflow of the prediction model in Orange.</title>
                    </caption>
                    <graphic id="gr1" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_figure1.gif"/>
                </fig>
            </p>
            <table-wrap id="T2" orientation="portrait" position="float">
                <label>Table 2. </label>
                <caption>
                    <title>Descriptive statistics of the results.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Name</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Distribution</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Mean</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Median</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Dispersion</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Min.</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Max.</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Missing</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">HbA1C</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR01.gif"/>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">9.992</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">10.5</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.285</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">4.0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">16.3</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">113 (4%)</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Appointment number</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR02.gif"/>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">4.80</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">5</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.49</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">10</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0 (0%)</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Health education appointments</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR03.gif"/>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1.35</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1.19</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">6</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0 (0%)</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Health education appointments no-show</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR04.gif"/>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.07</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">3.68</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">2</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0 (0%)</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Age</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR05.gif"/>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">12.01</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">13</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.33</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">2</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">20</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0 (0%)</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Sex</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR06.gif"/>
                            </td>
                            <td colspan="1" rowspan="1"/>
                            <td align="left" colspan="1" rowspan="1" valign="top">Female</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.691</td>
                            <td colspan="1" rowspan="1"/>
                            <td colspan="1" rowspan="1"/>
                            <td align="left" colspan="1" rowspan="1" valign="top">0 (0%)</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">DKA</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <inline-graphic xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_GR07.gif"/>
                            </td>
                            <td colspan="1" rowspan="1"/>
                            <td align="left" colspan="1" rowspan="1" valign="top">No</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.693</td>
                            <td colspan="1" rowspan="1"/>
                            <td colspan="1" rowspan="1"/>
                            <td align="left" colspan="1" rowspan="1" valign="top">0 (0%)</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                <label>Figure 2. </label>
                <caption>
                    <title>Sex distribution, showing a balanced distribution between males and females.</title>
                </caption>
                <graphic id="gr2" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_figure2.gif"/>
            </fig>
            <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                <label>Figure 3. </label>
                <caption>
                    <title>Incidence of DKA distribution after balancing data.</title>
                </caption>
                <graphic id="gr3" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/142769/3f6c4beb-6c7c-4b85-a53f-29ba3f5091fb_figure3.gif"/>
            </fig>
            <table-wrap id="T3" orientation="portrait" position="float">
                <label>Table 3. </label>
                <caption>
                    <title>Stratified 5-fold cross-validation for the prediction models.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">Model</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">AUC</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">CA</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">F1</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Precision</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Recall</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Tree</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.916</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.891</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.891</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.894</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.891</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Random Forest</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.983</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.930</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.930</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.934</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.930</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">kNN</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.898</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.823</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.820</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.842</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.823</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Gradient Boosting</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.884</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.807</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.806</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.815</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.807</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">CN2 rule inducer</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.936</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.922</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.922</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.931</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.922</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">AdaBoost</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.978</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.949</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.948</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.952</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.949</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <table-wrap id="T4" orientation="portrait" position="float">
                <label>Table 4. </label>
                <caption>
                    <title>Confusion matrix for Random Forest (showing number of instances).</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="2" valign="bottom">Actual</th>
                            <th align="left" colspan="3" rowspan="1" valign="top">Predicted</th>
                        </tr>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top">No</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Yes</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">&#x2211;</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <bold>No</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">885</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">330</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1215</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <bold>Yes</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">138</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1077</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1215</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">
                                <bold>&#x2211;</bold>
                            </td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1023</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1407</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">2430</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
            <table-wrap id="T5" orientation="portrait" position="float">
                <label>Table 5. </label>
                <caption>
                    <title>Attributes ranking.</title>
                </caption>
                <table content-type="article-table" frame="hsides">
                    <thead>
                        <tr>
                            <th align="left" colspan="1" rowspan="1" valign="top"/>
                            <th align="left" colspan="1" rowspan="1" valign="top">#</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Gain ratio</th>
                            <th align="left" colspan="1" rowspan="1" valign="top">Gini</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">HbA1C</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">1</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0987</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.1246</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Health Education appointments</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">2</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0175</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0217</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Health Education appointments no-show</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">3</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0109</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0026</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Appointment number</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">4</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0052</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0071</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">AGE</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">5</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0039</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0053</td>
                        </tr>
                        <tr>
                            <td align="left" colspan="1" rowspan="1" valign="top">Sex</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">6</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0029</td>
                            <td align="left" colspan="1" rowspan="1" valign="top">0.0020</td>
                        </tr>
                    </tbody>
                </table>
            </table-wrap>
        </sec>
        <sec id="sec16" sec-type="discussion">
            <title>Discussion</title>
            <p>Recurrent admission of patients due to DKA can be prevented with proper education and targeted strategic programs.
                <sup>
                    <xref ref-type="bibr" rid="ref15">15</xref>
                </sup>
                <sup>,</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref16">16</xref>
                </sup> In this research, we focused on developing a prediction model which can detect high-risk groups for DKA to decrease admission costs to the healthcare organization and prevent the readmission of patients. These predictions can support better patient quality of life and help to develop better education programs for these patients. Our data showed an 82% imbalance. To overcome this imbalance, a data sampler tool in the Orange program was used. This tool was used to oversample the DKA incidence by replicating the positive DKA instances to equalize them with the non-DKA instances. Moreover, Python was used as an additional oversampling technique, which showed performance results similar to those of the Orange data sampler tool, with Random Forest being the best predicting model with an AUC higher than 0.9. This study evaluated the ability of different machine learning prediction models to predict DKA incidence using data imported from the electronic medical records. We found that the HbA1c level was the attribute contributing most to the occurrence of DKA, followed by the health education appointments. Although six attributes were used in this research, other similar studies have used other clinical attributes such as fasting blood glucose, HbA1c, vital signs, and injection therapy vs pump therapy. They also included sex, race and ethnicity, BMI, healthy diet, and smoking status.
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup>
                <sup>,</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref17">17</xref>
                </sup> However, not all of these attributes, such as race and ethnicity, smoking status, and healthy diet, could be found in the electronic medical records used in our study. Our data are consistent with those of others.
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup>
                <sup>&#x2013;</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup> Similar research has been conducted previously which explored similar models and their performances on predicting DKA. All demonstrated an AUC level of 0.7 and higher.
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup>
                <sup>,</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref17">17</xref>
                </sup>
                <sup>,</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref18">18</xref>
                </sup> On the other hand, the highest AUC in the current study was achieved by the Random Forest model with a score of 0.98, followed by AdaBoost and CN2 rule inducer with scores of 0.97 and 0.93, respectively. Lastly, although this research showed promising results in the prediction performance, it had some limitations which could be improved in future studies. To refine the prediction model, additional clinical attributes should be included, such as fasting blood glucose, BMI, medication adherence, and type of treatment.</p>
        </sec>
        <sec id="sec17" sec-type="conclusions">
            <title>Conclusions</title>
            <p>DKA is considered to be a serious complication of diabetes which can be prevented with proper education and targeted strategic health care delivery. We aimed to create a prediction model for diabetic ketoacidosis among pediatric cases. Therefore, a real dataset was collected from the King Fahad Medical City health information system in Riyadh, Saudi Arabia. Several machine learning models were used, including Random Forest (RF), Decision Tree (DT), kNN, Gradient Boosting (GB), CN2 rule inducer and AdaBoost. Furthermore, several preprocessing and data sampling techniques were applied. We found that the Random Forest model achieved the highest performance, with an AUC of 0.98. Furthermore, HbA1c was the factor contributing most to the prediction model. Further research is required to refine the prediction model with additional clinical attributes such as fasting blood glucose, BMI, medication adherence and type of treatment. Moreover, the model&#x2019;s performance needs to be tested with a multi-center, balanced patient sample.</p>
        </sec>
        <sec id="sec18">
            <title>Author contributions</title>
            <p>Waad Eid: conception and design of the paper and data analysis.</p>
            <p>Hana Alharthi: conception and design of the paper, data analysis, critical and final revision of the article.</p>
            <p>Nida Aslam: data analysis and model framework.</p>
            <p>Irfan Ullah Abdur rab: data analysis and model framework.</p>
            <p>Alaa Madani: acquisition of data.</p>
        </sec>
        <sec id="sec19">
            <title>Consent</title>
            <p>The authors report that all patient data used in this research are anonymous; thus, no consent for publication was required, and no alterations were made that would distort scientific meaning.</p>
        </sec>
    </body>
    <back>
        <sec id="sec22" sec-type="data-availability">
            <title>Data availability statement</title>
            <p>The data that support the findings of this study are available from King Fahad Medical City but restrictions apply to the availability of these data, which were used under license for the current study, and so are not publicly available. Data are however available from the authors (Waad Eid, Email: 
                <email xlink:href="mailto:wmeid@iau.edu.sa">wmeid@iau.edu.sa</email> &amp; 
                <email xlink:href="mailto:waadeid@gmail.com">waadeid@gmail.com</email>) upon reasonable request and with permission of King Fahad Medical City.</p>
        </sec>
        <ref-list>
            <title>References</title>
            <ref id="ref1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Zou</surname>
                            <given-names>Q</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kaiyang</surname>
                            <given-names>Q</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Luo</surname>
                            <given-names>Y</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Predicting diabetes mellitus with machine learning techniques.</article-title>
                    <source>

                        <italic toggle="yes">Front. Genet.</italic>
</source>
                    <year>2018</year>;<volume>9</volume>:<fpage>11</fpage>.
                    <pub-id pub-id-type="pmid">30459809</pub-id>
                    <pub-id pub-id-type="doi">10.3389/fgene.2018.00515</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6232260</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <mixed-citation publication-type="other">
                    <collab>Mayo Clinic Staff</collab>:
                    <article-title>Diabetic ketoacidosis.</article-title>
                    <year>2019</year>.</mixed-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <mixed-citation publication-type="other">
                    <collab>Mayo Clinic Staff</collab>:
                    <article-title>Type 1 diabetes in children.</article-title>
                    <year>2020</year>.</mixed-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ghetti</surname>
                            <given-names>SPD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>JKBA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sims</surname>
                            <given-names>CEBA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Diabetic ketoacidosis and memory dysfunction in children with type 1 diabetes.</article-title>
                    <source>

                        <italic toggle="yes">J. Pediatr.</italic>
</source>
                    <year>2010</year>;<volume>156</volume>(<issue>1</issue>):<fpage>109</fpage>&#x2013;<lpage>114</lpage>.
                    <pub-id pub-id-type="pmid">19833353</pub-id>
                    <pub-id pub-id-type="doi">10.1016/j.jpeds.2009.07.054</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="https://go.exlibris.link/SgcGsg1x">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Maldonado</surname>
                            <given-names>MR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Chong</surname>
                            <given-names>ER</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Oehl</surname>
                            <given-names>MA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Economic impact of diabetic ketoacidosis in a multiethnic indigent population.</article-title>
                    <source>

                        <italic toggle="yes">Diabetes Care.</italic>
</source>
                    <year>2003</year>;<volume>26</volume>(<issue>4</issue>):<fpage>1265</fpage>&#x2013;<lpage>1269</lpage>.
                    <pub-id pub-id-type="pmid">12663608</pub-id>
                    <pub-id pub-id-type="doi">10.2337/diacare.26.4.1265</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="http://care.diabetesjournals.org/content/26/4/1265.abstract">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Deo</surname>
                            <given-names>RC</given-names>
                        </name>
</person-group>:
                    <article-title>Machine learning in medicine.</article-title>
                    <source>

                        <italic toggle="yes">Circulation.</italic>
</source>
                    <year>2015</year>;<volume>132</volume>(<issue>20</issue>):<fpage>1920</fpage>&#x2013;<lpage>1930</lpage>.
                    <pub-id pub-id-type="pmid">26572668</pub-id>
                    <pub-id pub-id-type="doi">10.1161/CIRCULATIONAHA.115.001593</pub-id>
                    <pub-id pub-id-type="pmcid">PMC5831252</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Rajkomar</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Dean</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kohane</surname>
                            <given-names>I</given-names>
                        </name>
</person-group>:
                    <article-title>Machine learning in medicine.</article-title>
                    <source>

                        <italic toggle="yes">N. Engl. J. Med.</italic>
</source>
                    <year>2019</year>;<volume>380</volume>(<issue>14</issue>):<fpage>1347</fpage>&#x2013;<lpage>1358</lpage>. Copyright &#x00a9; 2019 Massachusetts Medical Society. All rights reserved.
                    <pub-id pub-id-type="pmid">30943338</pub-id>
                    <pub-id pub-id-type="doi">10.1056/NEJMra1814259</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="https://library.iau.edu.sa/docview/2213944150?accountid=136546">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Sisodia</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sisodia</surname>
                            <given-names>DS</given-names>
                        </name>
</person-group>:
                    <article-title>Prediction of diabetes using classification algorithms.</article-title>
                    <source>

                        <italic toggle="yes">Procedia Comput. Sci.</italic>
</source>
                    <year>2018</year>;<volume>132</volume>:<fpage>1578</fpage>&#x2013;<lpage>1585</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.procs.2018.05.122</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="http://www.sciencedirect.com/science/article/pii/S1877050918308548">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Dinh</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Miertschin</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Young</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A data-driven approach to predicting diabetes and cardiovascular disease with machine learning.</article-title>
                    <source>

                        <italic toggle="yes">BMC Med. Inform. Decis. Mak.</italic>
</source>
                    <year>2019</year>;<volume>19</volume>(<issue>1</issue>):<fpage>211</fpage>.
                    <issn>1472-6947</issn>.
                    <pub-id pub-id-type="pmid">31694707</pub-id>
                    <pub-id pub-id-type="doi">10.1186/s12911-019-0918-5</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6836338</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Syed</surname>
                            <given-names>AH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Khan</surname>
                            <given-names>T</given-names>
                        </name>
</person-group>:
                    <article-title>Machine learning-based application for predicting risk of type 2 diabetes mellitus (T2DM) in Saudi Arabia: A retrospective cross-sectional study.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2020</year>;<volume>8</volume>:<fpage>199539</fpage>&#x2013;<lpage>199561</lpage>.
                    <issn>2169-3536</issn>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3035026</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Alabdulwahhab</surname>
                            <given-names>KM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sami</surname>
                            <given-names>W</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Mehmood</surname>
                            <given-names>T</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Automated detection of diabetic retinopathy using machine learning classifiers.</article-title>
                    <source>

                        <italic toggle="yes">Eur. Rev. Med. Pharmacol. Sci.</italic>
</source>
                    <year>2021</year>;<volume>25</volume>(<issue>2</issue>):<fpage>583</fpage>&#x2013;<lpage>590</lpage>.
                    <pub-id pub-id-type="doi">10.26355/eurrev_202101_24615</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="https://go.exlibris.link/mhZzdxZw">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <mixed-citation publication-type="other">
                    <collab>Health Catalyst</collab>:
                    <article-title>Texas Children&#x2019;s take the reins in preventing DKA in high risk pediatric patients.</article-title>
                    <year>2016</year>.</mixed-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <mixed-citation publication-type="other">
                    <collab>Orange</collab>:
                    <article-title>widget-catalog.</article-title>
                    <year>2021</year>.
                    <ext-link ext-link-type="uri" xlink:href="https://orangedatamining.com/">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kaur</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Kumari</surname>
                            <given-names>V</given-names>
                        </name>
</person-group>:
                    <article-title>Predictive modelling and analytics for diabetes using a machine learning approach.</article-title>
                    <source>

                        <italic toggle="yes">Appl. Comput. Inform.</italic>
</source>
                    <year>2018</year>;<volume>18</volume>:<fpage>90</fpage>&#x2013;<lpage>100</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.aci.2018.12.004</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vellanki</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Umpierrez</surname>
                            <given-names>GE</given-names>
                        </name>
</person-group>:
                    <article-title>Increasing hospitalizations for DKA: A need for prevention programs.</article-title>
                    <source>

                        <italic toggle="yes">Diabetes Care.</italic>
</source>
                    <year>2018</year>;<volume>41</volume>(<issue>9</issue>):<fpage>1839</fpage>&#x2013;<lpage>1841</lpage>.
                    <pub-id pub-id-type="pmid">30135197</pub-id>
                    <pub-id pub-id-type="doi">10.2337/dci18-0004</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6105328</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="http://care.diabetesjournals.org/content/41/9/1839.abstract">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Dhatariya</surname>
                            <given-names>KK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Nunney</surname>
                            <given-names>I</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Higgins</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>National survey of the management of diabetic ketoacidosis (DKA) in the UK in 2014.</article-title>
                    <source>

                        <italic toggle="yes">Diabet. Med.</italic>
</source>
                    <year>2016</year>;<volume>33</volume>(<issue>2</issue>):<fpage>252</fpage>&#x2013;<lpage>260</lpage>.
                    <pub-id pub-id-type="pmid">26286235</pub-id>
                    <pub-id pub-id-type="doi">10.1111/dme.12875</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Williams</surname>
                            <given-names>DD</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Dass</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bass</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>1303-P: Comparative performance of a recurrent neural network (RNN) and logistic regression (LR) model to predict diabetic ketoacidosis (DKA) among youth postdiagnosis with type 1 diabetes (T1D).</article-title>
                    <source>

                        <italic toggle="yes">Diabetes.</italic>
</source>
                    <year>2020</year>;<volume>69</volume>(<issue>Supplement 1</issue>):<fpage>1303-P</fpage>.
                    <pub-id pub-id-type="doi">10.2337/db20-1303-P</pub-id>
                    <ext-link ext-link-type="uri" xlink:href="http://diabetes.diabetesjournals.org/content/69/Supplement_1/1303-P.abstract">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Li</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lee</surname>
                            <given-names>C-C</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Zhou</surname>
                            <given-names>FL</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Performance assessment of different machine learning approaches in predicting diabetic ketoacidosis in adults with type 1 diabetes using electronic health records data.</article-title>
                    <source>

                        <italic toggle="yes">Pharmacoepidemiol. Drug Saf.</italic>
</source>
                    <year>2021</year>;<volume>30</volume>(<issue>5</issue>):<fpage>610</fpage>&#x2013;<lpage>618</lpage>.
                    <pub-id pub-id-type="pmid">33480091</pub-id>
                    <pub-id pub-id-type="doi">10.1002/pds.5199</pub-id>
                    <pub-id pub-id-type="pmcid">PMC8049019</pub-id>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report249422">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.142769.r249422</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Cichosz</surname>
                        <given-names>Simon</given-names>
                    </name>
                    <xref ref-type="aff" rid="r249422a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-3484-7571</uri>
                </contrib>
                <aff id="r249422a1">
                    <label>1</label>Aalborg University, Aalborg, Denmark</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>30</day>
                <month>8</month>
                <year>2024</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2024 Cichosz S</copyright-statement>
                <copyright-year>2024</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport249422" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.130042.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The manuscript explores the potential for risk stratification of pediatric patients based on their likelihood of developing Diabetic Ketoacidosis (DKA), a model with significant clinical implications. However, several important concerns warrant attention:</p>
            <p> </p>
            <p> The authors employ oversampling techniques on both the training and test datasets, resulting in an unrealistic performance assessment for real-world applications. Oversampling to address imbalanced datasets should be limited to the training data exclusively.</p>
            <p> </p>
            <p> Authors are strongly encouraged to adhere to &amp; submit the TRIPOD checklist or a similar guideline to ensure essential information is included in the manuscript.</p>
            <p> </p>
            <p> The discussion section requires expansion, including a dedicated segment on the limitations of the study.</p>
            <p> </p>
            <p> The manuscript could benefit from a discussion of its findings in the context of related studies concerning the prediction of adverse events in diabetes, such as DKA. For instance: (Cichosz SL, et al., 2024) (Ref-1)</p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>No</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>No</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>No</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>No</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>No</p>
            <p>Reviewer Expertise:</p>
            <p>Diabetes, technology, machine learning</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-249422-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
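                        <person-group person-group-type="author">
                            <name name-style="western">
                                <surname>Cichosz</surname>
                                <given-names>SL</given-names>
                            </name>
                            <etal/>
                        </person-group>: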
                        <article-title>Development of Machine Learning Models for the Identification of Elevated Ketone Bodies During Hyperglycemia in Patients with Type 1 Diabetes.</article-title>
                        <source>
                            <italic>Diabetes Technol Ther</italic>
                        </source>.<year>2024</year>;
                        <elocation-id>10.1089/dia.2023.0531</elocation-id>
                        <pub-id pub-id-type="pmid">38456910</pub-id>
                        <pub-id pub-id-type="doi">10.1089/dia.2023.0531</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
    </sub-article>
</article>
