<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="methods-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.73613.1</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Method Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Smartic: A smart tool for Big Data analytics and IoT</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 1; peer review: 2 approved]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Sayeed</surname>
                        <given-names>Shohel</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-0052-4870</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Ahmad</surname>
                        <given-names>Abu Fuad</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Peng</surname>
                        <given-names>Tan Choo</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-2350-7755</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Faculty of Information Science and Technology, Multimedia University, Melaka, Melaka, 75450, Malaysia</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:shohel.sayeed@mmu.edu.my">shohel.sayeed@mmu.edu.my</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>7</day>
                <month>1</month>
                <year>2022</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2022</year>
            </pub-date>
            <volume>11</volume>
            <elocation-id>17</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>23</day>
                    <month>11</month>
                    <year>2021</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2022 Sayeed S et al.</copyright-statement>
                <copyright-year>2022</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/11-17/pdf"/>
            <abstract>
                <p>The Internet of Things (IoT) is leading the physical and digital world of technology to converge. Real-time and massive scale connections produce a large amount of versatile data, where Big Data comes into the picture. Big Data refers to large, diverse sets of information with dimensions that go beyond the capabilities of widely used database management systems, or standard data processing software tools to manage within a given limit. Almost every big dataset is dirty and may contain missing data, mistyping, inaccuracies, and many more issues that impact Big Data analytics performances. One of the biggest challenges in Big Data analytics is to discover and repair dirty data; failure to do this can lead to inaccurate analytics results and unpredictable conclusions. We experimented with different missing value imputation techniques and compared machine learning (ML) model performances with different imputation methods. We propose a hybrid model for missing value imputation combining ML and sample-based statistical techniques. Furthermore, we continued with the best missing value imputed dataset, chosen based on ML model performance for feature engineering and hyperparameter tuning. We used k-means clustering and principal component analysis. Accuracy, the evaluated outcome, improved dramatically and proved that the XGBoost model gives very high accuracy at around 0.125 root mean squared logarithmic error (RMSLE). To overcome overfitting, we used K-fold cross-validation.</p>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>IoT</kwd>
                <kwd>Big Data Analytics</kwd>
                <kwd>Data Cleaning</kwd>
                <kwd>Data Imputation</kwd>
                <kwd>Feature Engineering</kwd>
            </kwd-group>
            <funding-group>
                <award-group id="fund-1" xlink:href="http://dx.doi.org/10.13039/100012024">
                    <funding-source>Multimedia University</funding-source>
                </award-group>
                <funding-statement>The author(s) declared that no grants were involved in supporting this work.</funding-statement>
            </funding-group>
        </article-meta>
    </front>
    <body>
        <sec id="sec1" sec-type="intro">
            <title>Introduction</title>
            <p>The Internet of Things (IoT) is reshaping communication with technologies and is becoming a vital part of the development of a smart environment dedicated to making our lives convenient and comfortable.
                <sup>
                    <xref ref-type="bibr" rid="ref1">1</xref>
                </sup> Several IoT application sectors like smart homes, smart cities,
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>
                </sup> smart healthcare, assisted driving, smart retail, and consumer goods like wearables and smartphones are already available.
                <sup>
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup>
                <sup>&#x2013;</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref5">5</xref>
                </sup> IoT is built with electronics hardware, software, and connectivity, which enables device interaction and transfer of data. The IoT ecosystem generates massive amounts of data. This data could be analyzed to make business decisions,
                <sup>
                    <xref ref-type="bibr" rid="ref6">6</xref>
                </sup> predict consumer behavior, or to bring solutions to problems that might exist.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>
                </sup> Big Data offers the solutions to handle various types of data on a large scale.</p>
            <p>Big Data extends the possibility to conduct extensive and rich analyses utilizing a vast amount of data.
                <sup>
                    <xref ref-type="bibr" rid="ref4">4</xref>
                </sup>
                <sup>,</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> Standard data processing tools are limited in data management capacity, where Big Data goes beyond the capabilities of traditional database management systems (DBMS).
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> Big Data comprises a large volume of information that is complex (structured and unstructured) in nature. Data are often being generated in real-time and can be of uncertain provenance.
                <sup>
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup> New Big Data technologies are being developed to meet the demands for processing massive amounts of heterogeneous data. Big Data management benefits are significant and sometimes far-reaching, and many companies have started operating with Big Data to translate a large amount of data into valuable insights.
                <sup>
                    <xref ref-type="bibr" rid="ref11">11</xref>
                </sup>
            </p>
            <p>The bulky and heterogeneous nature of Big Data requires investigation using Big Data Analytics (BDA). These data will yield meaningful outcomes by using methods of dissection in BDA,
                <sup>
                    <xref ref-type="bibr" rid="ref9">9</xref>
                </sup> which help to discover concealed patterns, unknown relationships, trends of the current market situation, consumer preferences and other aspects of data that can assist institutes and companies to make up-to-date, faster and better decisions for their business. However, the biggest issue with available datasets is the data quality itself. The data quality issues differ depending on the data source; they could be duplicated records, spelling errors or more complex issues relating to unit misuse. A mixture of clean and dirty records in data can lead to the well-known Simpson&#x2019;s Paradox
                <sup>
                    <xref ref-type="bibr" rid="ref12">12</xref>
                </sup>
                <sup>,</sup>
                <sup>
                    <xref ref-type="bibr" rid="ref13">13</xref>
                </sup> in which a pattern appears in a particular dataset but disappears or reverses when datasets are combined. A mixture of dirty and clean data could poorly fit an ML model; 
                <xref ref-type="fig" rid="f1">Figure 1</xref> shows the different ML models fitting with different sets of mixed data. This would lead to unreliable analysis results. Hence, data pre-processing is an important factor in the data analysis process.</p>
            <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                <label>Figure 1. </label>
                <caption>
                    <title>Simpson&#x2019;s Paradox.</title>
                </caption>
                <graphic id="gr1" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure1.gif"/>
            </fig>
            <p>To make the data suitable for analysis we must clean it. Identifying dirty records and cleaning data sometimes require manual data inspection, which is time-consuming and can be costly. Pre-processing includes several steps, for example, (1) loading the data from the file, (2) cleaning it to fix inconsistencies or errors, (3) encoding the numeric and categorical data types, and finally, (4) the missing value imputation. Missing values can be handled in different ways. Columns or rows containing missing values can be dropped or a value can be imputed in each cell with a missing or improper value. Sometimes, crowdsourcing is used to correct some types of errors, which costs a significant amount of time and human effort. Some researchers have used statistical computing such as mean, median, sum, among others, and approximate query processing to pre-process data. Some researchers have used sample-based cleaning techniques which can gradually improve data quality. Machine learning is an expanding research area, and is being used in some cases of data cleaning. We propose a hybrid model of the data pre-processing technique called Smartic, which is the combination of sample-based statistical techniques and ML. While sample-based statistical techniques lead to faster execution, ML models provide great accuracy. Our research contribution on Smartic will mitigate challenges related to dirty data cleaning and imputing missing values with better performance accuracy, within a reasonable time frame.</p>
            <p>In this paper we present a tool for IoT data preparation and BDA with ML. After data pre-processing, we carried out some feature engineering. This consists in checking which features are highly informative and which are less informative, and then considering features for the analytic purpose. Highly informative features will usually have the most benefits during feature development, while uninformative features can lead to overfitting. The main sections of this study are listed below:
                <list list-type="bullet">
                    <list-item>
                        <label>&#x2022;</label>
                        <p>A review of the recent literature</p>
                    </list-item>
                    <list-item>
                        <label>&#x2022;</label>
                        <p>Presentation of a BDA framework</p>
                    </list-item>
                    <list-item>
                        <label>&#x2022;</label>
                        <p>Discussion of the data preparation issues and solution</p>
                    </list-item>
                    <list-item>
                        <label>&#x2022;</label>
                        <p>Presentation of techniques to improve analysis performance</p>
                    </list-item>
                    <list-item>
                        <label>&#x2022;</label>
                        <p>Comparison of different solutions and discussion of the results</p>
                    </list-item>
                    <list-item>
                        <label>&#x2022;</label>
                        <p>Conclusions and future research directions</p>
                    </list-item>
                </list>
            </p>
        </sec>
        <sec id="sec2">
            <title>Related work</title>
            <p>Ahmad 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref14">14</xref>
                </sup> reviewed the recent literature on IoT and BDA. Massive data production in IoT environments, and the versatile nature of the data, make Big Data a suitable solution for IoT systems. They discussed the opportunities for organizations to get valuable insights about their customers and help predict upcoming trends. BDA and ML
                <sup>
                    <xref ref-type="bibr" rid="ref15">15</xref>
                </sup> tools like classification, clustering and predictive modeling, provide data mining solutions that create many more opportunities to expose variability, improve decision-making habits and boost performance.
                <sup>
                    <xref ref-type="bibr" rid="ref16">16</xref>
                </sup> Cross-domain data gathered from different IoT appliances can be fed into BDA that can provide efficient solutions for different domains.</p>
            <p>To overcome the challenges of collecting, processing, and examining the massive-scale, real-time data produced by smart homes, Bashir and Gill
                <sup>
                    <xref ref-type="bibr" rid="ref17">17</xref>
                </sup> offered an analytical framework composed of IoT, Big Data administration, and data analytics. The purpose of data analytics in their study was to automatically maintain the oxygen level consistency, detect hazardous gases or smoke, and control light conditions or quality. The work scheme was executed in the Cloudera Hadoop distribution platform, where 
                <ext-link ext-link-type="uri" xlink:href="https://spark.apache.org/docs/latest/api/python/index.html">PySpark</ext-link>
                <sup>
                    <xref ref-type="bibr" rid="ref18">18</xref>
                </sup> was used for big data analysis. The outcomes revealed that the proposed scheme could be used for smart building management with BDA.</p>
            <p>Idrees 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref19">19</xref>
                </sup> proposed a two-step data cleaning method, using Big Data on a network of IoT wireless sensor devices. They attempted to minimize communication cost, save energy, and expand the lifespan of sensors by cleaning and reducing the redundant data. Their proposed two-level data reduction and cleaning approach in IoT wireless sensor networks includes a sensor level and an aggregator level. The aggregator level merged near-similar data sets by implementing a divide and conquer technique. The reduced data sets were retransmitted to the sink, then the leader cluster algorithm-based cleaning method was applied to remove redundant data.</p>
            <p>Salloum 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref20">20</xref>
                </sup> proposed a Random Sample Partition (RSP) Explore technique, to explore Big Data iteratively on small computing clusters. Their work included three main tasks: statistical estimation, error detection, and data cleaning. They partitioned the entire data into ready-to-use RSP blocks using an RSP-distributed data model. To get samples of clean data, they used block-level samples to understand the data and detect any potential value errors. Their experimental results showed that a sample RSP block cleaning is enough to get an estimation of the statistical properties of any dataset, and the approximate results from RSP-Explore can rapidly converge toward the true values.</p>
            <p>Garc&#x00ed;a-Gil 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref21">21</xref>
                </sup> worked on data pre-processing to transform raw data into high-quality, clean data. The quality of the data used in any knowledge discovery process directly impacts the output. They experimented with classification problems due to the presence of noise affecting data quality, particularly a very disruptive feature of data known as incorrect labelling of training dataset. They proposed two Big Data pre-processing techniques with a special emphasis on their scalability and performance traits. The filters they used to remove noisy data were a homogeneous ensemble and a heterogeneous ensemble filter. The results from their experiments show that anyone can retain a smart dataset efficiently from any Big Data classification problem using these proposed filters.</p>
            <p>Snineh 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref22">22</xref>
                </sup> proposed a solution that can be performed in real time to handle the frequent errors of Big Data flows. They proposed a repository for each given domain in their two-step model to store the metadata, cleaning and correction algorithms, and an error log. An advisor was appointed to supervise the system for the first step. The advisor could estimate the algorithm corresponding to error cleaning for a given context. At the second step, the system became autonomous in the selection algorithm procedure based on its learning module. That capability was obtained by using a strategy pattern-based approach. The pattern allowed the building of a family of algorithms, which are interchangeable and evolve independently of the context of use.</p>
            <p>Jesmeen 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref23">23</xref>
                </sup> presented a comparison between currently used algorithms and their proposed tool, Auto-CDD, to handle missing values. The developed system improved overall data processing and guaranteed to overcome processing unwanted outcomes in data analysis. Their intelligent tool used Gini index values of random forest for feature selection. Experimental evaluation results showed that the random forest classifier led to a high accuracy on a diabetes dataset from UCI.
                <sup>
                    <xref ref-type="bibr" rid="ref24">24</xref>
                </sup> They also imputed the missing values on a student database and performed logistic regression analysis on students&#x2019; performance.</p>
            <p>Shah 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref25">25</xref>
                </sup> investigated the research gaps in understanding Big Data characteristics generated by industrial IoT sensors, and studied the challenges to processing data analytics. They studied the characteristics of the Big Data generated from an in-house developed IoT-enabled manufacturing testbed. They explored the role of feature engineering for predicting the key process variables in effective machine learning models. Their comparison of different levels or extents of feature engineering between simple statistical learning approaches and complex deep learning approaches shows potential for industrial IoT-enabled manufacturing applications.</p>
            <p>El-Hasnony 
                <italic toggle="yes">et al.</italic>
                <sup>
                    <xref ref-type="bibr" rid="ref26">26</xref>
                </sup> presented challenges in building an optimal feature selection model for Big Data applications, due to the complexity and high dimensionality of the data sets. They used particle swarm optimization and grey wolf optimization to build a new binary variant of a wrapper feature selection. The optimal solution was found with the help of the K-nearest neighbour classifier and Euclidean separation matrices. The overfitting issue was checked using K-fold cross-validation, and the performance and the effectiveness of the model were validated by conducting statistical analyses.</p>
        </sec>
        <sec id="sec3">
            <title>Big data analytics</title>
            <p>BDA follows some steps towards getting meaningful insights. Data analytics start with a non-trivial step of problem definition and evaluation. Research on expected gains and costs for reasonable solutions is needed. Generally, a data analytics framework is defined by five main steps:</p>
            <sec id="sec4">
                <title>Data acquisition</title>
                <p>Data acquisition, the key to the data life cycle, defines the data product profile. At this stage, structured and unstructured data are gathered from different sources and different types of unstructured or dirty data are pre-processed. Short data loading times are crucial for BDA due to its naturally exponential growth rate.</p>
            </sec>
            <sec id="sec5">
                <title>Data mining and cleansing</title>
                <p>The most essential stage of processing Big Data is to implement a method to extract the necessary data from the loaded, unstructured Big Data. A data analyst spends the most time on cleaning dirty data. Analysing dirty data could lead to erroneous results. To get high-quality data, faulty records, duplicates, unwanted records, and outliers need to be removed. Typos must be fixed and the data requires structuring. An exploratory analysis can investigate the initial characteristics of the data and help refine the hypothesis.</p>
            </sec>
            <sec id="sec6">
                <title>Data aggregation and integration</title>
                <p>The cleaned data obtained needs to be aggregated for processing numerical and categorical types of data, followed by data integration. Different types of data in various shapes and sizes obtained from different sources need to be integrated to prepare for analysis. To unify some data features, we may need to convert between formats. For example, one source may collect ratings on a five-star scale, while another source collects only &#x201c;up&#x201d; and &#x201c;down&#x201d; votes. The response variables could then be,
                    <disp-formula id="e1">
                        <mml:math display="block">
                            <mml:mi mathvariant="normal">y</mml:mi>
                            <mml:mo>&#x2208;</mml:mo>
                            <mml:mfenced close="}" open="{" separators=",,,,">
                                <mml:mn>1</mml:mn>
                                <mml:mn>2</mml:mn>
                                <mml:mn>3</mml:mn>
                                <mml:mn>4</mml:mn>
                                <mml:mn>5</mml:mn>
                            </mml:mfenced>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>and
                    <disp-formula id="e2">
                        <mml:math display="block">
                            <mml:mi mathvariant="normal">y</mml:mi>
                            <mml:mo>&#x2208;</mml:mo>
                            <mml:mfenced close="}" open="{" separators=",">
                                <mml:mtext>positive</mml:mtext>
                                <mml:mtext>negative</mml:mtext>
                            </mml:mfenced>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Before integrating the data from both sources, we need to make an equivalent response representation, possibly by converting the first source to the second representation format, considering three stars and above as the positive ratings and the rest of them as the negative ratings. Properly integrated data becomes less complex, more centralized and more valuable.</p>
            </sec>
            <sec id="sec7">
                <title>Data analysis and modelling</title>
                <p>From the perspective of Big Data, the goal is to produce meaningful insights that will be invaluable for business, through the analysis of data which may fluctuate depending on analytics technique and data types. Reports investigating the data must be constructed to help the business make better and faster decisions.</p>
            </sec>
            <sec id="sec8">
                <title>Data interpretation</title>
                <p>Data interpretation allows data to be presented in an understandable format for users, for example, presenting data using analysis and modelling results to make decisions by interpreting the outcomes and extracting knowledge. Data interpretation queries are categorized together and indicate the same table, diagram, graph or other data demonstration options.</p>
            </sec>
        </sec>
        <sec id="sec9">
            <title>Proposed method</title>
            <sec id="sec10">
                <title>Data collection and storage</title>
                <p>We collected data from the UCI repository
                    <sup>
                        <xref ref-type="bibr" rid="ref24">24</xref>
                    </sup> and from publicly available datasets in the Kaggle database.
                    <sup>
                        <xref ref-type="bibr" rid="ref27">27</xref>
                    </sup> We stored these datasets on Kaggle&#x2019;s server and worked on these data on the database&#x2019;s kernels. All collected datasets were in CSV (Comma-separated values) format.</p>
            </sec>
            <sec id="sec11">
                <title>Data preprocessing</title>
                <p>Data preprocessing is an important phase of data analysis. Raw data is manipulated to make it understandable. This is carried out in several steps, such as cleaning, encoding, imputing, among others. We handled these steps separately.</p>
            </sec>
            <sec id="sec12">
                <title>Data cleaning</title>
                <p>For this step, we tried to fix any errors and remove inconsistency. We fixed typos and unified different representations of the same value into a common representation. We used 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:mtext>fuzzy matching</mml:mtext>
                        </mml:math>
                    </inline-formula> or 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:mtext>edit distance</mml:mtext>
                        </mml:math>
                    </inline-formula> algorithms to remove inconsistency. Outlier detection and removal help to get better accuracy. 
                    <xref ref-type="fig" rid="f2">Figure 2</xref> shows summary statistics (the five-number summary) used to represent data, namely the minimum, maximum, median, and quartiles (Q1, Q3). The first quartile (Q1) is the middle value between the smallest value and the median (or the 50
                    <sup>th</sup> percentile, or Q2) of the dataset. A 25% portion of values in the dataset resides below the first quartile.
                    <disp-formula id="e3">
                        <mml:math display="block">
                            <mml:mtext>Interquartile Range</mml:mtext>
                            <mml:mo>,</mml:mo>
                            <mml:mi>IQR</mml:mi>
                            <mml:mo>=</mml:mo>
                            <mml:mi mathvariant="normal">Q</mml:mi>
                            <mml:mn>3</mml:mn>
                            <mml:mo>&#x2212;</mml:mo>
                            <mml:mi mathvariant="normal">Q</mml:mi>
                            <mml:mn>1</mml:mn>
                        </mml:math>
                    </disp-formula>
                    <disp-formula id="e4">
                        <mml:math display="block">
                            <mml:mtext>minimum</mml:mtext>
                            <mml:mo>=</mml:mo>
                            <mml:mi mathvariant="normal">Q</mml:mi>
                            <mml:mn>1</mml:mn>
                            <mml:mo>&#x2212;</mml:mo>
                            <mml:mn>1.5</mml:mn>
                            <mml:mo>&#x2217;</mml:mo>
                            <mml:mi>IQR</mml:mi>
                        </mml:math>
                    </disp-formula>
                    <disp-formula id="e5">
                        <mml:math display="block">
                            <mml:mtext>maximum</mml:mtext>
                            <mml:mo>=</mml:mo>
                            <mml:mi mathvariant="normal">Q</mml:mi>
                            <mml:mn>3</mml:mn>
                            <mml:mo>+</mml:mo>
                            <mml:mn>1.5</mml:mn>
                            <mml:mo>&#x2217;</mml:mo>
                            <mml:mi>IQR</mml:mi>
                        </mml:math>
                    </disp-formula>
                </p>
                <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                    <label>Figure 2. </label>
                    <caption>
                        <title>Outlier detection.
                            <sup>
                                <xref ref-type="bibr" rid="ref28">28</xref>
                            </sup>
                        </title>
                    </caption>
                    <graphic id="gr2" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure2.gif"/>
                </fig>
                <p>IQR, or midspread, or middle 50%, is the statistical dispersion equal to the range from the lower quartile (25th percentile) to the upper quartile (75th percentile). The values that do not reside within the range of the minimum and maximum values are defined as outliers (
                    <xref ref-type="fig" rid="f2">Figure 2</xref>).</p>
            </sec>
            <sec id="sec13">
                <title>Data encoding</title>
                <p>We prepared statistical data values in numerical and categorical values. The standard statistical types such as numeric and categorical had similar representations in Pandas
                    <sup>
                        <xref ref-type="bibr" rid="ref29">29</xref>
                    </sup> and Python (version 3.10). To treat each feature correctly, we encoded each column as its respective type of data, which helps to apply transformations consistently in further analytical processes.</p>
            </sec>
            <sec id="sec14">
                <title>Imputation</title>
                <p>We fixed the missing values in this step. We used 0 as the default value for missing numeric data and &#x2018;None&#x2019; as the default value for missing categorical data. We used different techniques to impute missing values and trained the machine learning model by feeding it these imputed datasets. Based on the model performances, we chose the best imputation technique and used it for the further analytical process. The implemented algorithmic steps were as follows:</p>
                <p>Step 1: Retrieve sample clean dataset (
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mtext mathvariant="italic">Dataset</mml:mtext>
                                <mml:mtext mathvariant="italic">clean</mml:mtext>
                            </mml:msub>
                        </mml:math>
                    </inline-formula>) from the original dataset, excluding missing/incomplete values as much as possible.</p>
                <p>Step 2: Order Features (
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mtext mathvariant="italic">Features</mml:mtext>
                                <mml:mi>o</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula>) based on feature utility scores or mutual scores.</p>
                <p>Step 3: Select top features from 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mtext mathvariant="italic">Features</mml:mtext>
                                <mml:mi>o</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> and apply step 5.</p>
                <p>Step 4: Select the rest of the features from 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mtext mathvariant="italic">Features</mml:mtext>
                                <mml:mi>o</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> and apply step 6.</p>
                <p>Step 5: For a given feature 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mi>F</mml:mi>
                                <mml:mi>i</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> : label 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mi>F</mml:mi>
                                <mml:mi>i</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> as the target and the rest of the column in 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mtext mathvariant="italic">Dataset</mml:mtext>
                                <mml:mtext mathvariant="italic">clean</mml:mtext>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> as features, and train the ML model to obtain missing or incomplete values for the original dataset.</p>
                <p>Step 6: For a given feature 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mi>F</mml:mi>
                                <mml:mi>i</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> : calculate statistical parameters (mean or median) of the 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mi>F</mml:mi>
                                <mml:mi>i</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> column in 
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mtext mathvariant="italic">Dataset</mml:mtext>
                                <mml:mtext mathvariant="italic">clean</mml:mtext>
                            </mml:msub>
                        </mml:math>
                    </inline-formula>, and obtain missing or incomplete values for the original dataset.</p>
            </sec>
        </sec>
        <sec id="sec15">
            <title>Feature engineering</title>
            <p>We used mutual information (
                <xref ref-type="fig" rid="f3">Figure 3</xref>) to determine the importance of a feature. For this step, we created new features as well. We used target encoding for categorical features with higher cardinality. Target encoding involves replacing a categorical feature with the average target value of all data points for that category. We used several other techniques of feature engineering for this purpose.</p>
            <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                <label>Figure 3. </label>
                <caption>
                    <title>Feature utility scores of house prices dataset.
                        <sup>
                            <xref ref-type="bibr" rid="ref27">27</xref>
                        </sup>
                    </title>
                </caption>
                <graphic id="gr3" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure3.gif"/>
            </fig>
            <sec id="sec16">
                <title>Feature utility scores</title>
                <p>Using mutual score is a great way to determine a feature&#x2019;s potential. Feature utility scores help to determine important features and non-important ones as well. Based on scores, we discarded some features for a performance gain.</p>
            </sec>
            <sec id="sec17">
                <title>Feature creation and transformation</title>
                <p>Label encoding can be done to transform categorical features, as we are focusing on the tree-ensemble model; this works for both ordered and unordered data categories. Creating new features can be done in several ways, such as taking the product of two numerical features, taking the square root of a feature, normalizing by applying logarithms, determining the group statistics of a feature, etc.</p>
            </sec>
            <sec id="sec18">
                <title>K-means clustering</title>
                <p>The unsupervised algorithm k-means clustering can be used to create features as well. Cluster labels or the distance of each entity to each cluster can be used as features. Sometimes, these help to untangle complicated relationships between features, engineered features or targets.</p>
            </sec>
            <sec id="sec19">
                <title>Principal component analysis (PCA)</title>
                <p>We can use another unsupervised principal component analysis (PCA) model for feature creation, which can decompose a variational structure. The PCA algorithm gave us loadings which described each component of a variation, and the components which were the transformed datapoints. The loadings can suggest features to create and the components we can directly use as features. Clustering can be done using one or more components.</p>
            </sec>
            <sec id="sec20">
                <title>Target encoding</title>
                <p>Target encoding is an encoding of categorical values into numeric values derived from the target. It resembles a supervised feature engineering technique. We used mean and median values for this purpose.</p>
            </sec>
            <sec id="sec21">
                <title>Hyperparameter tuning</title>
                <p>A great way of boosting performance is carrying out hyperparameter tuning. For our ML model XGBoost, we set max_depth to 6, learning_rate to 0.01, n_estimators to 1000.</p>
            </sec>
            <sec id="sec22">
                <title>Evaluation criteria</title>
                <p>We adopted K-fold cross-validation for performance evaluation. Cross-validation divides the data set into a training and a testing data set, so that the model is trained and its performance is tested using two distinct data sets. Training and testing on the same data creates overfitting issues. To avoid this, we used K-fold cross-validation with a value of 5 for K (
                    <xref ref-type="fig" rid="f4">Figure 4</xref>). All our experimental results were five-fold cross-validated.</p>
                <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                    <label>Figure 4. </label>
                    <caption>
                        <title>K-fold cross-validation (K=5).</title>
                    </caption>
                    <graphic id="gr4" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure4.gif"/>
                </fig>
                <p>The XGBoost model performance was evaluated using the root mean squared logarithmic error (RMSLE) metric. The formula for RMSLE is represented as follows:
                    <disp-formula id="e6">
                        <mml:math display="block">
                            <mml:mtext>RMSLE</mml:mtext>
                            <mml:mo>=</mml:mo>
                            <mml:msqrt>
                                <mml:mrow>
                                    <mml:mspace width="0.25em"/>
                                    <mml:mfrac>
                                        <mml:mn>1</mml:mn>
                                        <mml:mi>n</mml:mi>
                                    </mml:mfrac>
                                    <mml:mspace width="0.25em"/>
                                    <mml:msubsup>
                                        <mml:mo>&#x2211;</mml:mo>
                                        <mml:mrow>
                                            <mml:mi>i</mml:mi>
                                            <mml:mo>=</mml:mo>
                                            <mml:mn>1</mml:mn>
                                        </mml:mrow>
                                        <mml:mi>n</mml:mi>
                                    </mml:msubsup>
                                    <mml:msup>
                                        <mml:mfenced close=")" open="(">
                                            <mml:mrow>
                                                <mml:mo>log</mml:mo>
                                                <mml:mfenced close=")" open="(">
                                                    <mml:mrow>
                                                        <mml:msub>
                                                            <mml:mi>p</mml:mi>
                                                            <mml:mi>i</mml:mi>
                                                        </mml:msub>
                                                        <mml:mo>+</mml:mo>
                                                        <mml:mn>1</mml:mn>
                                                    </mml:mrow>
                                                </mml:mfenced>
                                                <mml:mo>&#x2212;</mml:mo>
                                                <mml:mo>log</mml:mo>
                                                <mml:mfenced close=")" open="(">
                                                    <mml:mrow>
                                                        <mml:msub>
                                                            <mml:mi>a</mml:mi>
                                                            <mml:mi>i</mml:mi>
                                                        </mml:msub>
                                                        <mml:mo>+</mml:mo>
                                                        <mml:mn>1</mml:mn>
                                                    </mml:mrow>
                                                </mml:mfenced>
                                            </mml:mrow>
                                        </mml:mfenced>
                                        <mml:mn>2</mml:mn>
                                    </mml:msup>
                                </mml:mrow>
                            </mml:msqrt>
                        </mml:math>
                    </disp-formula>
                </p>
                <p>Where:</p>
                <p>
                    <italic toggle="yes">n</italic> is the number of observations in the dataset</p>
                <p>
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mi>p</mml:mi>
                                <mml:mi>i</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> is the prediction of target</p>
                <p>
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mi>a</mml:mi>
                                <mml:mi>i</mml:mi>
                            </mml:msub>
                        </mml:math>
                    </inline-formula> is the actual target for 
                    <italic toggle="yes">i</italic>.</p>
                <p>log(
                    <italic toggle="yes">x</italic>) is the natural logarithm of 
                    <italic toggle="yes">x</italic> (
                    <inline-formula>
                        <mml:math display="inline">
                            <mml:msub>
                                <mml:mo>log</mml:mo>
                                <mml:mi>e</mml:mi>
                            </mml:msub>
                            <mml:mfenced close=")" open="(">
                                <mml:mi>x</mml:mi>
                            </mml:mfenced>
                        </mml:math>
                    </inline-formula>).</p>
            </sec>
        </sec>
        <sec id="sec23">
            <title>Experiments</title>
            <sec id="sec24">
                <title>Discarding missing values</title>
                <p>Discarding columns or rows is a technique for handling missing values. Our model performance in RMSLE was 0.14249 after discarding columns with missing values.</p>
            </sec>
            <sec id="sec25">
                <title>Missing value imputation</title>
                <p>We evaluated our XGBoost model using datasets with imputed missing values using different types of imputation techniques. When we filled non-numerical (NaN, not a number) values with a 0, we obtained an RMSLE score of 0.14351, while when filling missing values with the next valid value on the same column, we obtained a score of 0.14348. If we use the statistical mean of a feature column to impute missing values in that column, we notice a performance increment with an RMSLE score of 0.14157.</p>
            </sec>
            <sec id="sec26">
                <title>Feature engineering</title>
                <p>As we carried out feature transformation and target encoding based on feature utility scores, we obtained better performance. The use of K-means clustering and PCA led to a better performance as well. We obtained a value of 0.14044 for the RMSLE score.</p>
            </sec>
            <sec id="sec27">
                <title>Hyperparameter tuning</title>
                <p>Hyperparameter tuning gave a performance boost in the final performance evaluation. 
                    <xref ref-type="fig" rid="f5">Figure 5</xref> shows the performance improvements after feature engineering and hyperparameter tuning. After fine-tuning some parameters, we obtained our best (lowest) RMSLE score of 0.12426.</p>
                <fig fig-type="figure" id="f5" orientation="portrait" position="float">
                    <label>Figure 5. </label>
                    <caption>
                        <title>RMSLE metric score in different stages of analytics.</title>
                    </caption>
                    <graphic id="gr5" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure5.gif"/>
                </fig>
            </sec>
        </sec>
        <sec id="sec28" sec-type="results|discussion">
            <title>Results and discussion</title>
            <p>We calculated the mean RMSLE value of five trials of train/test splits while varying the training dataset size from 0.1 to 0.9 (10% to 90%). In 
                <xref ref-type="fig" rid="f6">Figure 6(b)</xref>, we notice that the ML-based missing value imputation technique outperformed all other traditional imputation methods. Imputing 0 in place of the missing value performed worst in our experiment (see 
                <xref ref-type="fig" rid="f6">Figure 6(a)</xref>). Replacing missing values of any feature column with the median of that column performed slightly better than imputing the mean of that feature column.</p>
            <fig fig-type="figure" id="f6" orientation="portrait" position="float">
                <label>Figure 6. </label>
                <caption>
                    <title>(a) Performance measurement of different techniques of missing value imputation. (b) Closer view of the best imputation techniques.</title>
                </caption>
                <graphic id="gr6" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure6.gif"/>
            </fig>
            <p>As the ML-based imputation technique outperformed state-of-the-art baseline methods, we evaluated missing value imputation performance of different ML models, such as LinearRegression, DecisionTreeRegressor, LinearSVR, GaussianNB, BaggingRegressor, KNeighborsRegressor, AdaBoostRegressor, XGBRegressor, among others. Although all ML models delivered higher accuracy with the continuous increment of training dataset size, we saw (
                <xref ref-type="fig" rid="f7">Figure 7</xref>) more uniform and sheer increasing patterns in XGBRegressor and BaggingRegressor models. It proves that with sufficiently large datasets, the XGBRegressor model can outperform the other ML methods. In addition, we noticed that the XGBRegressor model showed a more stable performance with the varying training data size.</p>
            <fig fig-type="figure" id="f7" orientation="portrait" position="float">
                <label>Figure 7. </label>
                <caption>
                    <title>Mean squared error (MSE) of different machine learning (ML) models.</title>
                </caption>
                <graphic id="gr7" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/77276/90da320d-f3f3-4efb-8dc0-740407cd304f_figure7.gif"/>
            </fig>
        </sec>
        <sec id="sec29" sec-type="conclusion">
            <title>Conclusion</title>
            <p>Almost every data set available may contain missing values, which are essential to analyze and understand the data. Dealing with these types of dirty data is difficult, and obtaining a robust analytical ML model is more challenging. We used statistical methods equipped to fix the datasets, and the sample-based approximate query processing was integrated to alleviate errors in analysis and predictions. The data fixed using different imputation techniques were fed into ML analytical models, and accuracy was compared against different data preparation techniques. Smartic&#x2019;s data value imputation was faster than the ML-based missing value imputation model. The ML model, trained on data cleaned using a sample-based technique, showed a significantly better and more stable performance. In the future, evaluation can be done with data collected directly from IoT environments in real time.</p>
        </sec>
        <sec id="sec30">
            <title>Data availability</title>
            <sec id="sec31">
                <title>Underlying data</title>
                <p>
                    <list list-type="bullet">
                        <list-item>
                            <label>-</label>
                            <p>Ames housing dataset: house sales data in Ames, Iowa between 2006 and 2010. Compiled by Dean De Cock; used for educational purposes. We used a version of that dataset available at 
                                <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data">https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data</ext-link>
                            </p>
                        </list-item>
                        <list-item>
                            <label>-</label>
                            <p>Diabetes dataset: The dataset represents clinical care at 130 US hospitals between the years 1999 and 2008. This dataset was prepared to predict a patient&#x2019;s re-admission. Dataset available from UC Irvine Machine Learning repository, 
                                <ext-link ext-link-type="uri" xlink:href="https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008#">https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008#</ext-link>
                            </p>
                        </list-item>
                    </list>
                </p>
            </sec>
            <sec id="sec32">
                <title>Extended data</title>
                <p>Analysis code available from: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/FuadAhmad/smartic">https://github.com/FuadAhmad/smartic</ext-link>
                </p>
                <p>Archived analysis code at time of publication: 
                    <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/badge/latestdoi/420156995">https://zenodo.org/badge/latestdoi/420156995</ext-link>
                </p>
                <p>License: 
                    <ext-link ext-link-type="uri" xlink:href="https://github.com/FuadAhmad/smartic/blob/main/LICENSE">Apache-2.0 License</ext-link>
                </p>
            </sec>
        </sec>
    </body>
    <back>
        <ref-list>
            <title>References</title>
            <ref id="ref1">
                <label>1</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Risteska Stojkoska</surname>
                            <given-names>BL</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Trivodaliev</surname>
                            <given-names>KV</given-names>
                        </name>
</person-group>:
                    <article-title>A review of Internet of Things for smart home: Challenges and solutions.</article-title>
                    <source>

                        <italic toggle="yes">J. Clean. Prod.</italic>
</source>
                    <year>2017</year>;<volume>140</volume>:<fpage>1454</fpage>&#x2013;<lpage>1464</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.jclepro.2016.10.006</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Zanella</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bui</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Castellani</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Internet of things for smart cities.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Internet Things J.</italic>
</source>
                    <year>2014</year>;<volume>1</volume>:<fpage>22</fpage>&#x2013;<lpage>32</lpage>.
                    <pub-id pub-id-type="doi">10.1109/JIOT.2014.2306328</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Asghari</surname>
                            <given-names>P</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rahmani</surname>
                            <given-names>AM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Javadi</surname>
                            <given-names>HHS</given-names>
                        </name>
</person-group>:
                    <article-title>Internet of Things applications: A systematic review.</article-title>
                    <source>

                        <italic toggle="yes">Comput. Netw.</italic>
</source>
                    <year>2019</year>;<volume>148</volume>:<fpage>241</fpage>&#x2013;<lpage>261</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.comnet.2018.12.008</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hariri</surname>
                            <given-names>RH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Fredericks</surname>
                            <given-names>EM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bowers</surname>
                            <given-names>KM</given-names>
                        </name>
</person-group>:
                    <article-title>Uncertainty in big data analytics: survey, opportunities, and challenges.</article-title>
                    <source>

                        <italic toggle="yes">J. Big Data.</italic>
</source>
                    <year>2019</year>;<volume>6</volume>.
                    <pub-id pub-id-type="doi">10.1186/s40537-019-0206-3</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Sepasgozar</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A systematic content review of artificial intelligence and the internet of things applications in smart home.</article-title>
                    <source>

                        <italic toggle="yes">Appl. Sci.</italic>
</source>
                    <year>2020</year>;<volume>10</volume>.
                    <pub-id pub-id-type="doi">10.3390/app10093074</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Sestino</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Prete</surname>
                            <given-names>MI</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Piper</surname>
                            <given-names>L</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Internet of Things and Big Data as enablers for business digitalization strategies.</article-title>
                    <source>

                        <italic toggle="yes">Technovation.</italic>
</source>
                    <year>2020</year>;<volume>98</volume>:<fpage>102173</fpage>.
                    <pub-id pub-id-type="doi">10.1016/j.technovation.2020.102173</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ahmed</surname>
                            <given-names>E</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>The role of big data analytics in Internet of Things.</article-title>
                    <source>

                        <italic toggle="yes">Comput. Netw.</italic>
</source>
                    <year>2017</year>;<volume>129</volume>:<fpage>459</fpage>&#x2013;<lpage>471</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.comnet.2017.06.013</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Amalina</surname>
                            <given-names>F</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Blending Big Data Analytics: Review on Challenges and a Recent Study.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2020</year>;<volume>8</volume>:<fpage>3629</fpage>&#x2013;<lpage>3645</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2019.2923270</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Marjani</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Big IoT Data Analytics: Architecture, Opportunities, and Open Research Challenges.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2017</year>;<volume>5</volume>:<fpage>5247</fpage>&#x2013;<lpage>5261</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2017.2689040</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vikash</surname>
                            <given-names>LM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Varma</surname>
                            <given-names>S</given-names>
                        </name>
</person-group>:
                    <article-title>Performance evaluation of real-time stream processing systems for Internet of Things applications.</article-title>
                    <source>

                        <italic toggle="yes">Futur. Gener. Comput. Syst.</italic>
</source>
                    <year>2020</year>;<volume>113</volume>:<fpage>207</fpage>&#x2013;<lpage>217</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.future.2020.07.012</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ge</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bangui</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Buhnova</surname>
                            <given-names>B</given-names>
                        </name>
</person-group>:
                    <article-title>Big Data for Internet of Things: A Survey.</article-title>
                    <source>

                        <italic toggle="yes">Futur. Gener. Comput. Syst.</italic>
</source>
                    <year>2018</year>;<volume>87</volume>:<fpage>601</fpage>&#x2013;<lpage>614</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.future.2018.04.053</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Fabris</surname>
                            <given-names>CC</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Freitas</surname>
                            <given-names>AA</given-names>
                        </name>
</person-group>:
                    <article-title>Discovering Surprising Patterns by Detecting Occurrences of Simpson&#x2019;s Paradox.</article-title>
                    <source>

                        <italic toggle="yes">Research and Development in Intelligent Systems XVI.</italic>
</source>
                    <year>2000</year>.</mixed-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Scarsini</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Spizzichino</surname>
                            <given-names>F</given-names>
                        </name>
</person-group>:
                    <article-title>Simpson-type paradoxes, dependence, and ageing.</article-title>
                    <source>

                        <italic toggle="yes">J. Appl. Probab.</italic>
</source>
                    <year>1999</year>;<volume>36</volume>:<fpage>119</fpage>&#x2013;<lpage>131</lpage>.
                    <pub-id pub-id-type="doi">10.1017/S0021900200016892</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Ahmad</surname>
                            <given-names>AF</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sayeed</surname>
                            <given-names>MS</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Tan</surname>
                            <given-names>CP</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A Review on IoT with Big Data Analytics.</article-title>
                    <year>2021</year>.
                    <pub-id pub-id-type="doi">10.1109/ICoICT52021.2021.9527503</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Al-Garadi</surname>
                            <given-names>MA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Mohamed</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Al-Ali</surname>
                            <given-names>AK</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>A Survey of Machine and Deep Learning Methods for Internet of Things (IoT) Security.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Commun. Surv. Tutorials.</italic>
</source>
                    <year>2020</year>;<volume>22</volume>:<fpage>1646</fpage>&#x2013;<lpage>1685</lpage>.
                    <pub-id pub-id-type="doi">10.1109/COMST.2020.2988293</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>L&#x2019;Heureux</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Grolinger</surname>
                            <given-names>K</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Elyamany</surname>
                            <given-names>HF</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Machine Learning with Big Data: Challenges and Approaches.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2017</year>;<volume>5</volume>:<fpage>7776</fpage>&#x2013;<lpage>7797</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2017.2696365</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Bashir</surname>
                            <given-names>MR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gill</surname>
                            <given-names>AQ</given-names>
                        </name>
</person-group>:
                    <article-title>Towards an IoT big data analytics framework: Smart buildings systems.</article-title>
                    <year>2017</year>.
                    <pub-id pub-id-type="doi">10.1109/HPCC-SmartCity-DSS.2016.0188</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <mixed-citation publication-type="other">
                    <collab>PySpark Documentation</collab>.
                    <ext-link ext-link-type="uri" xlink:href="https://spark.apache.org/docs/latest/api/python/index.html">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Idrees</surname>
                            <given-names>AK</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Jaoude</surname>
                            <given-names>CA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Al-Qurabat</surname>
                            <given-names>AKM</given-names>
                        </name>
</person-group>:
                    <article-title>Data reduction and cleaning approach for energy-saving in wireless sensors networks of IoT.</article-title>
                    <year>2020</year>.
                    <pub-id pub-id-type="doi">10.1109/WiMob50308.2020.9253429</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Salloum</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Huang</surname>
                            <given-names>JZ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>He</surname>
                            <given-names>Y</given-names>
                        </name>
</person-group>:
                    <article-title>Exploring and cleaning big data with random sample data blocks.</article-title>
                    <source>

                        <italic toggle="yes">J. Big Data.</italic>
</source>
                    <year>2019</year>;<volume>6</volume>.
                    <pub-id pub-id-type="doi">10.1186/s40537-019-0205-4</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Garc&#x00ed;a-Gil</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Luengo</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Garc&#x00ed;a</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Enabling Smart Data: Noise filtering in Big Data classification.</article-title>
                    <source>

                        <italic toggle="yes">Inf. Sci. (Ny).</italic>
</source>
                    <year>2019</year>;<volume>479</volume>:<fpage>135</fpage>&#x2013;<lpage>152</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.ins.2018.12.002</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Snineh</surname>
                            <given-names>SM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Youssfi</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bouattane</surname>
                            <given-names>O</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Daaif</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Real-Time management model for frequent Big Data errors: Automatic Clean Repository for Big Data (ACR).</article-title>
                    <year>2018</year>.
                    <pub-id pub-id-type="doi">10.1109/ICMCS.2018.8525920</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Jesmeen</surname>
                            <given-names>MZH</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>AUTO-CDD: Automatic cleaning dirty data using machine learning techniques.</article-title>
                    <source>

                        <italic toggle="yes">Telkomnika (Telecommunication Comput. Electron. Control).</italic>
</source>
                    <year>2019</year>;<volume>17</volume>:<fpage>2076</fpage>.
                    <pub-id pub-id-type="doi">10.12928/TELKOMNIKA.v17i4.12780</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Strack</surname>
                            <given-names>B</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Impact of HbA1c measurement on hospital readmission rates: Analysis of 70,000 clinical database patient records.</article-title>
                    <source>

                        <italic toggle="yes">Biomed. Res. Int.</italic>
</source>
                    <year>2014</year>;<volume>2014</volume>:<fpage>1</fpage>&#x2013;<lpage>11</lpage>.
                    <pub-id pub-id-type="pmid">24804245</pub-id>
                    <pub-id pub-id-type="doi">10.1155/2014/781670</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Shah</surname>
                            <given-names>D</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wang</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>He</surname>
                            <given-names>QP</given-names>
                        </name>
</person-group>:
                    <article-title>Feature engineering in big data analytics for IoT-enabled smart manufacturing &#x2013; Comparison between deep learning and statistical learning.</article-title>
                    <source>

                        <italic toggle="yes">Comput. Chem. Eng.</italic>
</source>
                    <year>2020</year>;<volume>141</volume>:<fpage>106970</fpage>.
                    <pub-id pub-id-type="doi">10.1016/j.compchemeng.2020.106970</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref26">
                <label>26</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>El-Hasnony</surname>
                            <given-names>IM</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Barakat</surname>
                            <given-names>SI</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Elhoseny</surname>
                            <given-names>M</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Improved Feature Selection Model for Big Data Analytics.</article-title>
                    <source>

                        <italic toggle="yes">IEEE Access.</italic>
</source>
                    <year>2020</year>;<volume>8</volume>:<fpage>66989</fpage>&#x2013;<lpage>67004</lpage>.
                    <pub-id pub-id-type="doi">10.1109/ACCESS.2020.2986232</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref27">
                <label>27</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>De Cock</surname>
                            <given-names>D</given-names>
                        </name>
</person-group>:
                    <article-title>Ames, Iowa: Alternative to the Boston housing data as an end of semester regression project.</article-title>
                    <source>

                        <italic toggle="yes">J. Stat. Educ.</italic>
</source>
                    <year>2011</year>;<volume>19</volume>.
                    <pub-id pub-id-type="doi">10.1080/10691898.2011.11889627</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref28">
                <label>28</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Galarnyk</surname>
                            <given-names>M</given-names>
                        </name>
</person-group>:
                    <article-title>Understanding Boxplots.</article-title> (accessed Oct. 21, 2021).
                    <ext-link ext-link-type="uri" xlink:href="https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51">Reference Source</ext-link>
                </mixed-citation>
            </ref>
            <ref id="ref29">
                <label>29</label>
                <mixed-citation publication-type="other">
                    <collab>pandas</collab>:
                    <ext-link ext-link-type="uri" xlink:href="https://pandas.pydata.org">Reference Source</ext-link>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report136161">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.77276.r136161</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Mohammed</surname>
                        <given-names>Muzammil Hussain</given-names>
                    </name>
                    <xref ref-type="aff" rid="r136161a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-3577-7210</uri>
                </contrib>
                <aff id="r136161a1">
                    <label>1</label>Department of Information Technology, College of Computers and Information Technology, Taif University, Taif, Saudi Arabia</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>23</day>
                <month>1</month>
                <year>2024</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2024 Mohammed MH</copyright-statement>
                <copyright-year>2024</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport136161" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.73613.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>
                <list list-type="bullet">
                    <list-item>
                        <p>I think this research paper explains the content well.</p>
                    </list-item>
                    <list-item>
                        <p>The abstract is well defined.</p>
                    </list-item>
                    <list-item>
                        <p>All the figures are clear and well understood.</p>
                    </list-item>
                    <list-item>
                        <p>The related work section needs some more points added to improve understanding, but overall it is OK.</p>
                    </list-item>
                    <list-item>
                        <p>The proposed method is well explained with equations.</p>
                    </list-item>
                    <list-item>
                        <p>Results and discussion are also explained well.</p>
                    </list-item>
                </list> </p>
            <p> Overall this paper is well structured and good research work done by the authors.</p>
            <p>Is the rationale for developing the new method (or application) clearly explained?</p>
            <p>Yes</p>
            <p>Is the description of the method technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions about the method and its performance adequately supported by the findings presented in the article?</p>
            <p>Yes</p>
            <p>If any results are presented, are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Are sufficient details provided to allow replication of the method development and its use by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Software Engineering, Cyber Security, Database and Networking</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report136164">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.77276.r136164</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Krishnamoorthy</surname>
                        <given-names>Sujatha</given-names>
                    </name>
                    <xref ref-type="aff" rid="r136164a1">1</xref>
                    <role>Referee</role>
                    <uri content-type="orcid">https://orcid.org/0000-0002-0122-6357</uri>
                </contrib>
                <aff id="r136164a1">
                    <label>1</label>Department of Computer Science, Wenzhou-Kean University, Wenzhou, China</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>30</day>
                <month>5</month>
                <year>2022</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2022 Krishnamoorthy S</copyright-statement>
                <copyright-year>2022</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport136164" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.73613.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>I appreciate the authors for the work that has been carried out, as we are all dealing with the big data day to day in our life, which is very important. Big Data refers to large, diverse sets of information with dimensions that go beyond the capabilities of widely used database management systems, or standard data processing software tools to manage within a given limit. Almost every big dataset is dirty and may contain missing data, mistyping, inaccuracies, and many more issues that impact Big Data analytics performances. One of the biggest challenges in Big Data analytics is to discover and repair dirty data; failure to do this can lead to inaccurate analytics results and unpredictable conclusions. This study experimented with different missing value imputation techniques and compared machine learning (ML) model performances with different imputation methods. The authors have proposed a hybrid model for missing value imputation combining ML and sample-based statistical techniques. The authors have used k-means clustering and principal component analysis. Accuracy, the evaluated outcome, improved dramatically and proved that the Boost model gives very high accuracy at around 0.125 root mean squared logarithmic error (RMSLE). To overcome overfitting, we use K-fold cross-validation.</p>
            <p> </p>
            <p> As per the quality of the work, the code was impressive and the source is also attached to the paper, which is very useful for people who would like to take up research in this field.</p>
            <p> </p>
            <p> The only suggestion from my side is the flow of the paper - the authors have mentioned &#x201c;we&#x201d; in multiple places. Hence, I suggest that they change to the passive voice, which can make the paper more readable.</p>
            <p> </p>
            <p> The data analytics methods are explained in Section 3, and the way they are implemented in the paper is also clearly explained.</p>
            <p> </p>
            <p> The font in the imputation steps could be formatted to give a unified look. Add a proper citation to the K-fold cross-validation since the data is directly given from the experiment.</p>
            <p> </p>
            <p> Overall the paper structure is good enough for indexing.</p>
            <p>Is the rationale for developing the new method (or application) clearly explained?</p>
            <p>Yes</p>
            <p>Is the description of the method technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions about the method and its performance adequately supported by the findings presented in the article?</p>
            <p>Yes</p>
            <p>If any results are presented, are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Are sufficient details provided to allow replication of the method development and its use by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>IoT, Machine learning, Computer vision</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.</p>
        </body>
        <sub-article article-type="response" id="comment10960-136164">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Sayeed</surname>
                            <given-names>Md Shohel</given-names>
                        </name>
                        <aff>Multimedia University, Malaysia</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>27</day>
                    <month>1</month>
                    <year>2024</year>
                </pub-date>
            </front-stub>
            <body>
                <p>The manuscript has been revised based on the reviewer's comments.</p>
            </body>
        </sub-article>
    </sub-article>
</article>
