<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="1.2" xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.12688/f1000research.161073.2</article-id>
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Research Article</subject>
                </subj-group>
                <subj-group>
                    <subject>Articles</subject>
                </subj-group>
            </article-categories>
            <title-group>
                <article-title>Development of a machine learning predictive model for early detection of breast cancer</article-title>
                <fn-group content-type="pub-status">
                    <fn>
                        <p>[version 2; peer review: 1 approved with reservations, 1 not approved]</p>
                    </fn>
                </fn-group>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Rahman</surname>
                        <given-names>Rinsy</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-0906-3350</uri>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Saha</surname>
                        <given-names>Dola</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Original Draft Preparation</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-0560-9109</uri>
                    <xref ref-type="corresp" rid="c1">a</xref>
                    <xref ref-type="aff" rid="a1">1</xref>
                </contrib>
                <contrib contrib-type="author" corresp="yes">
                    <name>
                        <surname>Dkhar</surname>
                        <given-names>Winniecia</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Conceptualization</role>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Supervision</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <uri content-type="orcid">https://orcid.org/0000-0001-5963-3230</uri>
                    <xref ref-type="corresp" rid="c2">b</xref>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Malli</surname>
                        <given-names>Sathyendranath</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Investigation</role>
                    <role content-type="http://credit.niso.org/">Methodology</role>
                    <role content-type="http://credit.niso.org/">Resources</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Validation</role>
                    <role content-type="http://credit.niso.org/">Visualization</role>
                    <uri content-type="orcid">https://orcid.org/0000-0003-1028-4847</uri>
                    <xref ref-type="aff" rid="a3">3</xref>
                </contrib>
                <contrib contrib-type="author" corresp="no">
                    <name>
                        <surname>Barnes Abraham</surname>
                        <given-names>Neil</given-names>
                    </name>
                    <role content-type="http://credit.niso.org/">Data Curation</role>
                    <role content-type="http://credit.niso.org/">Formal Analysis</role>
                    <role content-type="http://credit.niso.org/">Software</role>
                    <role content-type="http://credit.niso.org/">Writing &#x2013; Review &amp; Editing</role>
                    <xref ref-type="aff" rid="a2">2</xref>
                </contrib>
                <aff id="a1">
                    <label>1</label>Department of Health Information Management, Manipal College of Health Professions, Manipal Academy of Higher Education, Manipal, Karnataka, 576104, India</aff>
                <aff id="a2">
                    <label>2</label>Department of Medical Imaging Technology, Manipal College of Health Professions, Manipal Academy of Higher Education, Manipal, Karnataka, 576104, India</aff>
                <aff id="a3">
                    <label>3</label>School of Information Science, Manipal Academy of Higher Education, Manipal, Karnataka, 576104, India</aff>
            </contrib-group>
            <author-notes>
                <corresp id="c1">
                    <label>a</label>
                    <email xlink:href="mailto:dola.saha@manipal.edu">dola.saha@manipal.edu</email>
                </corresp>
                <corresp id="c2">
                    <label>b</label>
                    <email xlink:href="mailto:winniecia.dkhar@manipal.edu">winniecia.dkhar@manipal.edu</email>
                </corresp>
                <fn fn-type="conflict">
                    <p>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>10</day>
                <month>4</month>
                <year>2025</year>
            </pub-date>
            <pub-date pub-type="collection">
                <year>2025</year>
            </pub-date>
            <volume>14</volume>
            <elocation-id>164</elocation-id>
            <history>
                <date date-type="accepted">
                    <day>3</day>
                    <month>4</month>
                    <year>2025</year>
                </date>
            </history>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2025 Rahman R et al.</copyright-statement>
                <copyright-year>2025</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <self-uri content-type="pdf" xlink:href="https://f1000research.com/articles/14-164/pdf"/>
            <abstract>
                <sec>
                    <title>Background</title>
                    <p>Breast cancer remains a significant global health concern, with over 7.8 million cases reported in the last five years. Early detection and accurate classification are crucial for reducing mortality rates and improving outcomes. Machine learning (ML) has emerged as a transformative tool in medical imaging, enabling more efficient and accurate diagnostic processes.</p>
                </sec>
                <sec>
                    <title>Objective</title>
                    <p>This study aims to develop a machine learning-based predictive model for early detection and classification of breast cancer using the Wisconsin Breast Cancer Diagnostic dataset.</p>
                </sec>
                <sec>
                    <title>Methods</title>
                    <p>The dataset, comprising 569 samples and 32 features derived from fine needle aspirate biopsy images, was pre-processed through data cleaning, normalization using the Robust Scaler, and feature selection. Five supervised ML algorithms&#x2014;Logistic Regression, Support Vector Classification (SVC) with linear and radial basis function (RBF) kernels, Decision Tree, and Random Forest&#x2014;were implemented. Models were evaluated using performance metrics, including accuracy, precision, sensitivity, specificity, and F1 scores.</p>
                </sec>
                <sec>
                    <title>Results</title>
                    <p>The SVC-RBF model demonstrated the highest accuracy (98.68%) and balanced performance across other metrics, making it the most effective classifier for distinguishing between benign and malignant tumors. Key features such as texture mean and area (worst) significantly contributed to classification accuracy.</p>
                </sec>
                <sec>
                    <title>Conclusions</title>
                    <p>This study highlights the potential of ML algorithms, particularly SVC-RBF, to revolutionize breast cancer diagnostics through improved accuracy and efficiency. Future research should validate these findings with diverse datasets and explore their integration into clinical workflows to enhance decision-making and patient care.</p>
                </sec>
            </abstract>
            <kwd-group kwd-group-type="author">
                <kwd>Breast cancer</kwd>
                <kwd>Mammography</kwd>
                <kwd>Machine learning</kwd>
                <kwd>Tumor classification</kwd>
                <kwd>Predictive modelling</kwd>
            </kwd-group>
            <funding-group>
                <funding-statement>The author(s) declared that no grants were involved in supporting this work.</funding-statement>
            </funding-group>
        </article-meta>
        <notes>
            <sec sec-type="version-changes">
                <label>Revised</label>
                <title>Amendments from Version 1</title>
                <p>The revised manuscript addresses key reviewer concerns by enhancing the discussion on machine learning approaches in cancer diagnostics, incorporating relevant literature, and strengthening the rationale for the chosen methodology. The limitations section has been expanded to discuss dataset generalizability, feature selection robustness, and model interpretability, with references to alternative dimensionality reduction techniques. The results section now includes validation methods such as ROC curves, confusion matrices, and five-fold cross-validation to ensure repeatability and validity. Additionally, quantitative annotations, including p-values and confidence intervals, have been added to statistical visualizations. Finally, to improve transparency and reproducibility, the complete code and dataset details have been included within the document.</p>
            </sec>
        </notes>
    </front>
    <body>
        <sec id="sec6" sec-type="intro">
            <title>1. Introduction</title>
            <p>Breast cancer is a global health concern that affects millions of women worldwide. The alarming number of diagnoses highlights the importance of proactive measures such as regular screenings, self-examination, and increased awareness. In the last five years alone, a staggering 7.8 million women have been diagnosed with this disease.
                <sup>
                    <xref ref-type="bibr" rid="ref1">1</xref>
                </sup> These numbers underscore the urgent need for increased awareness, early detection, and effective treatment options. The health system must be significantly reinforced to enhance breast cancer outcomes. In order to reduce mortality rates and provide effective treatment, early detection and screening of breast cancer are highly important.
                <sup>
                    <xref ref-type="bibr" rid="ref2">2</xref>,
                    <xref ref-type="bibr" rid="ref3">3</xref>
                </sup> Early detection is therefore essential to ensure the best outcome in treating breast cancer. It is well known that rapid diagnosis with machine learning is highly beneficial considering the rise in breast cancer cases.
                <sup>
                    <xref ref-type="bibr" rid="ref4">4</xref>
                </sup>
            </p>
            <p>The integration of AI in breast cancer detection and diagnosis has the potential to revolutionize the field of oncology.
                <sup>
                    <xref ref-type="bibr" rid="ref5">5</xref>,
                    <xref ref-type="bibr" rid="ref6">6</xref>
                </sup> In recent years, machine learning (ML) algorithms have emerged as powerful tools in the field of medical imaging, offering the potential to enhance the accuracy and efficiency of tumour detection and classification.
                <sup>
                    <xref ref-type="bibr" rid="ref7">7</xref>,
                    <xref ref-type="bibr" rid="ref8">8</xref>
                </sup> Machine learning algorithms can analyse vast amounts of data and identify patterns that may not be apparent to human experts. Machine learning algorithms can be trained to analyse mammograms and provide additional insights to radiologists, helping them make more informed decisions. It is imperative that healthcare providers and researchers continue to explore and harness the power of AI to further enhance breast cancer care.
                <sup>
                    <xref ref-type="bibr" rid="ref8">8</xref>&#x2013;
                    <xref ref-type="bibr" rid="ref10">10</xref>
                </sup>
            </p>
            <p>The aim of this study is to develop a machine learning predictive model specifically designed for early detection and classification of breast cancer. By leveraging ML algorithms, the goal is to improve the accuracy of tumour detection and significantly reduce the time required for cell identification.</p>
        </sec>
        <sec id="sec7" sec-type="methods">
            <title>2. Methods</title>
            <sec id="sec8">
                <title>2.1 Study design and setting</title>
                <p>This research was conducted within the Health Informatics Laboratory, Department of Health Information Management, Manipal College of Health Professions, Manipal Academy of Higher Education, Manipal, over six months (January&#x2013;June 2022). The study aimed to develop and evaluate a machine learning predictive model for early detection and differential diagnosis of benign and malignant breast lesions.</p>
            </sec>
            <sec id="sec9">
                <title>2.2 Data source and inclusion criteria</title>
                <p>The Wisconsin Breast Cancer Diagnostic dataset, available on Kaggle,
                    <sup>
                        <xref ref-type="bibr" rid="ref11">11</xref>
                    </sup> was utilized. This dataset comprises 569 records and 33 features derived from fine needle aspirate (FNA) biopsy images, representing tumor characteristics. Data of female patients aged 18&#x2013;70 years were included. Key features analysed included tumor radius, texture, perimeter, area, smoothness, compactness, concavity, symmetry, and fractal dimension.</p>
            </sec>
            <sec id="sec10">
                <title>2.3 Data preprocessing</title>
                <p>Data preprocessing involved removing missing and null values, followed by normalization using the Robust Scaler method to mitigate outlier effects. Exploratory Data Analysis (EDA) was conducted using Python to visualize data distributions and relationships through violin plots, box plots, and correlation matrices, enabling the selection of significant features for model training.</p>
            </sec>
            <sec id="sec11">
                <title>2.4 Model development</title>
                <p>Five supervised machine learning algorithms were implemented: Logistic Regression, Support Vector Classification (SVC) with linear and radial basis function (RBF) kernels, Decision Tree, and Random Forest. The dataset was split (60:40) into training and testing subsets using Scikit-learn&#x2019;s train_test_split function. Models were trained on the training set and optimized using hyperparameter tuning.</p>
            </sec>
            <sec id="sec12">
                <title>2.5 Performance evaluation</title>
                <p>Model performance was assessed using confusion matrices and metrics, including accuracy, precision, sensitivity (recall), specificity, and F1 scores, calculated using Scikit-learn&#x2019;s classification report function. Among the models, SVC-RBF demonstrated the highest accuracy (99%), proving its efficacy for early detection and differential diagnosis of breast lesions.</p>
            </sec>
            <sec id="sec13">
                <title>2.6 Statistical tools and software</title>
                <p>All analyses were performed using Python 3.8
                    <sup>
                        <xref ref-type="bibr" rid="ref12">12</xref>
                    </sup> in Jupyter Notebook. Libraries used included Pandas (v1.2.4)
                    <sup>
                        <xref ref-type="bibr" rid="ref13">13</xref>
                    </sup> for data manipulation, Numpy (v1.20.3)
                    <sup>
                        <xref ref-type="bibr" rid="ref14">14</xref>
                    </sup> for numerical computations, Matplotlib (v3.4.2)
                    <sup>
                        <xref ref-type="bibr" rid="ref15">15</xref>
                    </sup> and Seaborn (v0.11.1)
                    <sup>
                        <xref ref-type="bibr" rid="ref16">16</xref>
                    </sup> for data visualization, and Scikit-learn (v0.24.2)
                    <sup>
                        <xref ref-type="bibr" rid="ref17">17</xref>
                    </sup> for machine learning.</p>
            </sec>
            <sec id="sec14">
                <title>2.7 Ethical considerations</title>
                <p>The dataset was extracted from the online open-source Wisconsin (Diagnostics) dataset. Study approval was obtained from the Institutional Research Committee of Manipal College of Health Professions, Manipal on the 20
                    <sup>th</sup> of January 2022 (MCHP/Mpl/IRC/PG/2022/04). All procedures adhered to established ethical guidelines for secondary data analysis and data use policies. Consent is not applicable since the data was extracted from the online open-source Wisconsin (Diagnostics) dataset.</p>
            </sec>
        </sec>
        <sec id="sec15" sec-type="results">
            <title>3. Results</title>
            <sec id="sec16">
                <title>3.1 Exploratory Data Analysis (EDA) and data preprocessing</title>
                <p>The breast cancer dataset (569 samples, 32 features) underwent thorough exploratory data analysis (EDA) to assess structure and identify key features. Two redundant columns, &#x201c;id&#x201d; and &#x201c;Unnamed: 32&#x201d; (containing only NaN values), were removed during data cleaning. The target variable, &#x201c;diagnosis,&#x201d; was analyzed, revealing 59% malignant (M) and 41% benign (B) cases. A bar graph (
                    <xref ref-type="fig" rid="f1">
Figure 1</xref>) illustrates this distribution. Following data cleaning, the dataset was divided into feature variables (X) and the target variable (y), ensuring all numeric features remained in X while the categorical &#x201c;diagnosis&#x201d; variable was placed in y.</p>
                <fig fig-type="figure" id="f1" orientation="portrait" position="float">
                    <label>
Figure 1. </label>
                    <caption>
                        <title>Bar graph showing the frequency of diagnosis column.</title>
                        <p>M - Malignant Tumor and B - Benign Tumor.</p>
                    </caption>
                    <graphic id="gr1" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/180235/472dcd33-c0ec-4d9d-83fc-02b2edf60c48_figure1.gif"/>
                </fig>
            </sec>
            <sec id="sec17">
                <title>3.2 Feature extraction and visualization</title>
                <p>

                    <bold>3.2.1 
                        <italic toggle="yes">Violin plots</italic>: -</bold> The distributions of the first thirty features in the dataset were visualized using violin plots to assess their potential for distinguishing between malignant and benign tumors. Key findings include the texture mean, which displayed distinct median values for the tumor types and a wider spread in the kernel density estimate (KDE) for malignant tumors, suggesting its potential as a useful feature for classification. In contrast, the fractal dimension mean showed similar medians for both tumor types, indicating limited discriminative power. Features such as concave points (se) and concavity (se) also exhibited overlapping distributions, making them less valuable for classification. On the other hand, area (se) demonstrated a clear separation between tumor types, highlighting its potential for classification. Similarly, the area (worst) feature showed a distinct separation between benign and malignant tumors, marking it as a strong candidate for classification models, whereas fractal dimension (worst) and concavity (worst) exhibited overlapping distributions, suggesting reduced utility. Overall, texture mean, area (se), and area (worst) emerged as the most promising features for classification, while the others showed limited differentiation between tumor types in the 
                    <xref ref-type="fig" rid="f2">
Figure 2 (A, B, C)</xref>.</p>
                <fig fig-type="figure" id="f2" orientation="portrait" position="float">
                    <label>
Figure 2. </label>
                    <caption>
                        <title>(A) Violin plot for first ten features (B) for second set of features (C) for last set of features (D) Joint plot for finding correlation between concave points worst and concavity worst.</title>
                    </caption>
                    <graphic id="gr2" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/180235/472dcd33-c0ec-4d9d-83fc-02b2edf60c48_figure2.gif"/>
                </fig>
                <p>

                    <bold>3.2.2 
                        <italic toggle="yes">Joint plot</italic>: -</bold> A joint plot was used to analyze the relationship between concavity worst and concave points worst, as their distributions appeared to be similar. The joint plot, which combines scatter plots and histograms, provides a comprehensive view of the data&#x2019;s distribution and the relationship between two variables. The analysis revealed a strong correlation of 0.86 between the two features, accompanied by a statistically significant p-value. This indicates a high degree of linear association between concavity worst and concave points worst, suggesting that they capture similar information regarding the tumor characteristics. Given their strong correlation, retaining only one of these features in the classification model is advisable, as including both would introduce redundancy and not contribute additional discriminative power in 
                    <xref ref-type="fig" rid="f2">Figure 2(D)</xref>.</p>
                <p>

                    <bold>3.2.3 
                        <italic toggle="yes">Box plot</italic>
</bold>: - Box plots were used to visualize the distribution of key features across malignant and benign tumor groups, offering a clear representation of the data&#x2019;s spread, central tendency, and variability. These plots divide the data into quartiles, highlighting the minimum, first quartile, median, third quartile, and maximum values, and can also identify potential outliers. Box plots are useful for comparing feature distributions between groups and identifying differences in spread and central values.</p>
                <p>In this study, box plots were employed to explore the relationship between highly correlated features in the correlation matrix, such as texture mean and texture worst, as well as area mean and area worst. The analysis of these features in relation to the diagnosis column revealed similar distributions for malignant and benign tumors, indicating redundancy in the information they provide. For instance, texture mean and texture worst showed comparable distributions, suggesting that retaining both features in the model would likely result in redundancy. Consequently, one of these highly correlated features can be excluded from the classification process without sacrificing predictive power. These insights were further validated through the visual examination of box plots, which helped clarify how each feature discriminates between malignant and benign groups in 
                    <xref ref-type="fig" rid="f3">Figure 3(A, B, C, D)</xref>.</p>
                <fig fig-type="figure" id="f3" orientation="portrait" position="float">
                    <label>
Figure 3. </label>
                    <caption>
                        <title>(A) Box plot graph of texture mean vs diagnosis of tumor (B) texture worst vs diagnosis of tumor (C) area mean vs diagnosis of tumor (D) area worst vs diagnosis of tumor.</title>
                    </caption>
                    <graphic id="gr3" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/180235/472dcd33-c0ec-4d9d-83fc-02b2edf60c48_figure3.gif"/>
                </fig>
            </sec>
            <sec id="sec18">
                <title>3.3 Label encoding</title>
                <p>Label encoding was employed to handle the categorical data within the dataset, specifically the diagnosis column, which consists of two classes: malignant (M) and benign (B). Label encoding is a technique used to transform categorical variables into numerical values, facilitating their inclusion in machine learning models that require numerical input. In this case, the diagnosis feature was encoded by assigning the value 0 to benign tumors and 1 to malignant tumors. This transformation of categorical data into binary values enables the classification algorithms to process the target variable effectively.</p>
                <p>Label encoding is particularly useful for datasets with binary or ordinal categorical data, as it preserves the inherent order and structure of the classes. This method of encoding ensures that the diagnosis column can be used seamlessly in the machine learning models, enhancing the classification process and improving model performance. The encoded values (0 and 1) were then incorporated into the feature set, with the remaining extracted features, such as tumor radius, texture, perimeter, and others, remaining in their continuous form.</p>
            </sec>
            <sec id="sec19">
                <title>3.4 Dataset splitting and feature scaling</title>
                <p>In this study, the dataset was divided into training and testing sets using the train-test split method to evaluate the performance of machine learning algorithms. The dataset was split with a 60:40 ratio, where 60% of the data was used for training the model, and 40% was reserved for testing. The primary goal of this split is to assess how well the model generalizes to unseen data by training it on the training set and evaluating it on the testing set. The training set allows the model to learn from known data, while the testing set is used exclusively for making predictions, providing an unbiased estimate of model performance.</p>
                <p>The dataset was divided into input features (X) and the target variable (y). The target variable, diagnosis (benign or malignant), was assigned to y, and the remaining features used for classification were assigned to X. Consequently, the dataset was split into four variables: X train, X test, y train, and y test, representing the training and testing sets for both features and target variable.</p>
                <p>Following the train-test split, feature scaling was performed to normalize the features within the dataset. Feature scaling is a preprocessing technique used to transform the features into a uniform scale, improving the performance of machine learning algorithms. In this study, the Robust Scaler was applied, which scales the data based on the interquartile range (IQR) while removing the median. This scaling method ensures that outliers have a minimal effect on the data, which is particularly beneficial when dealing with features that have different scales or units. The scaled data was then used for model training and evaluation, ensuring that all features contribute equally to the learning process.</p>
            </sec>
            <sec id="sec20">
                <title>3.5 Model development</title>
                <p>In this study, various machine learning models were developed and evaluated using different supervised classification algorithms to identify the most accurate model for classifying benign and malignant breast lesions. A classifier algorithm is designed to map input data to specific categories, making it suitable for tasks such as classification of breast lesions. The algorithms utilized in this project include Logistic Regression, Support Vector Classifier (SVC) with a linear kernel, Support Vector Classifier (SVC) with a radial basis function (RBF) kernel, Decision Tree Classifier, and Random Forest Classifier.</p>
                <p>The models were developed using the training dataset, with each classifier being imported from the Scikit-learn library. The models were assigned to variables, and the fit method was used to train each model on the input features (X train) and target variable (y train). This method enabled the models to learn from the data and adjust their parameters accordingly to improve classification performance.</p>
                <p>Following the training process, the accuracy of each model was calculated to assess their performance. The Decision Tree Classifier achieved the highest training accuracy of 1.0, indicating perfect classification performance on the training set. On the other hand, the SVC with the radial basis function kernel exhibited the lowest training accuracy among the classifiers. These results provide an indication of which models performed better in terms of training accuracy and highlight the potential for further model evaluation using additional metrics such as cross-validation, precision, recall, and F1 score to determine the most reliable classifier for the task.</p>
            </sec>
            <sec id="sec21">
                <title>3.6 Performance evaluation</title>
                <p>The evaluation of the classification models was performed to determine their effectiveness in distinguishing between benign and malignant breast lesions. Testing accuracy was calculated using a confusion matrix, which summarizes the performance of the classification models in terms of actual and predicted values. The confusion matrix provided four key metrics: True Positive (TP), True Negative (TN), False Positive (FP), and False Negative (FN) values for each classification algorithm, as shown in 
                    <xref ref-type="table" rid="T1">
Table 1</xref>.</p>
                <table-wrap id="T1" orientation="portrait" position="float">
                    <label>
Table 1. </label>
                    <caption>
                        <title>Confusion matrix values of the classification algorithms.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">Classification Algorithm</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">True Positive (TP) Value</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">True Negative (TN) Value</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">False Positive (FP) Value</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">
False Negative (FN) Value</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Logistic Regression</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">142</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">83</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">SVC Linear</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">139</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">81</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">4</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">4</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">SVC RBF</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">142</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">83</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">1</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">2</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Decision Tree</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">139</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">76</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">4</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">9</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Random Forest</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">138</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">80</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">5</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">5</td>
                            </tr>
                        </tbody>
                    </table>
                </table-wrap>
                <p>Among the classification algorithms, the Support Vector Classifier (SVC) with a Radial Basis Function (RBF) kernel exhibited the highest testing accuracy of 0.9868, indicating its superior ability to predict correctly. In contrast, the Decision Tree Classifier demonstrated the lowest testing accuracy of 0.9429, suggesting room for improvement in its predictive capability.</p>
                <p>The classification model was evaluated based on key performance metrics, including accuracy, precision, recall, and F1-score. Accuracy measures the overall effectiveness of the model in correctly classifying cases, while precision assesses the proportion of correctly identified positive cases out of all predicted positives. Recall, also known as sensitivity, indicates the model&#x2019;s ability to correctly detect positive cases, and the F1-score provides a harmonic mean between precision and recall, ensuring a balanced evaluation.</p>
                <fig fig-type="figure" id="f4" orientation="portrait" position="float">
                    <label>
Figure 4. </label>
                    <caption>
                        <title>Forest plot comparing performance metrics across different classifiers.</title>
                    </caption>
                    <graphic id="gr4" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/180235/472dcd33-c0ec-4d9d-83fc-02b2edf60c48_figure4.gif"/>
                </fig>
                <p>To further assess the model&#x2019;s discriminative power, we generated a Receiver Operating Characteristic (ROC) curve, which illustrates the trade-off between sensitivity and specificity across different classification thresholds. The Area Under the Curve (AUC) value quantifies the model&#x2019;s ability to distinguish between benign and malignant cases, with a higher AUC indicating superior classification performance. 
                    <xref ref-type="fig" rid="f5">
Figure 5</xref> presents the ROC curve, demonstrating the classifier&#x2019;s effectiveness in minimizing false positives while maximizing true positive rates.</p>
                <fig fig-type="figure" id="f5" orientation="portrait" position="float">
                    <label>
Figure 5. </label>
                    <caption>
                        <title>Receiver Operating Characteristic (ROC) curve for the classification model.</title>
                        <p>The curve illustrates the trade-off between sensitivity (true positive rate) and 1-specificity (false positive rate) across different thresholds. The Area Under the Curve (AUC) value indicates the model&#x2019;s ability to distinguish between benign and malignant cases, with higher AUC values representing better classification performance.</p>
                    </caption>
                    <graphic id="gr5" orientation="portrait" position="float" xlink:href="https://f1000research-files.f1000.com/manuscripts/180235/472dcd33-c0ec-4d9d-83fc-02b2edf60c48_figure5.gif"/>
                </fig>
            </sec>
            <sec id="sec22">
                <title>3.7 Additional performance metrics</title>
                <p>To further assess the quality of predictions, additional metrics such as precision, sensitivity (recall), F1 score, and specificity were calculated using the classification report function from the sklearn metrics package. These metrics evaluate the balance between true positive predictions and false positives/negatives, providing a comprehensive assessment of the classification algorithms. The Support Vector Classifier with Radial Basis Function (SVC RBF) demonstrated the highest testing accuracy (0.9868) and consistently high precision, recall, F1 score, and specificity, establishing itself as the most robust classifier in this study. Logistic Regression performed comparably, achieving a testing accuracy of 0.9825, indicating reliable classification performance. In contrast, the Decision Tree Classifier, despite achieving the highest training accuracy (1.0), exhibited the lowest testing accuracy (0.9429), suggesting potential overfitting during training. The Random Forest classifier displayed a balanced performance, with a testing accuracy of 0.9561 and comparable metrics across precision, recall, and F1 score, making it a reliable but less optimal choice than SVC RBF and Logistic Regression in 
                    <xref ref-type="table" rid="T2">
Table 2</xref>.</p>
                <table-wrap id="T2" orientation="portrait" position="float">
                    <label>
Table 2. </label>
                    <caption>
                        <title>Performance evaluation of the classification algorithms.</title>
                    </caption>
                    <table content-type="article-table" frame="hsides">
                        <thead>
                            <tr>
                                <th align="left" colspan="1" rowspan="2" valign="top">Classification Algorithms</th>
                                <th align="left" colspan="1" rowspan="2" valign="top">Training Accuracy</th>
                                <th align="left" colspan="1" rowspan="2" valign="top">Testing Accuracy</th>
                                <th align="left" colspan="1" rowspan="2" valign="top">Final Accuracy</th>
                                <th align="left" colspan="2" rowspan="1" valign="top">Precision</th>
                                <th align="left" colspan="2" rowspan="1" valign="top">Sensitivity/Recall</th>
                                <th align="left" colspan="2" rowspan="1" valign="top">F1 Score</th>
                                <th align="left" colspan="2" rowspan="1" valign="top">Specificity/Support</th>
                            </tr>
                            <tr>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;B&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;M&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;B&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;M&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;B&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;M&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;B&#x201d;</th>
                                <th align="left" colspan="1" rowspan="1" valign="top">&#x201c;M&#x201d;</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Logistic Regression</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9824</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9825</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.98</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.98</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.98</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.98</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">143</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">85</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">SVC Linear</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9883</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9649</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.96</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.95</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.95</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.95</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">143</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">85</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">SVC RBF</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9795</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9868</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.98</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.99</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.98</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">143</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">85</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Decision Tree</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">1.0</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9429</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.94</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.94</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.95</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.89</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.96</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.92</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">143</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">85</td>
                            </tr>
                            <tr>
                                <td align="left" colspan="1" rowspan="1" valign="top">Random Forest</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9941</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.9561</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.96</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.94</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.94</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.97</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">0.94</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">143</td>
                                <td align="left" colspan="1" rowspan="1" valign="top">85</td>
                            </tr>
                        </tbody>
                    </table>
                    <table-wrap-foot>
                        <p>SVC RBF - Support Vector Classifier with Radial Basis Function; B - Benign; M - Malignant.</p>
                    </table-wrap-foot>
                </table-wrap>
            </sec>
        </sec>
        <sec id="sec23" sec-type="discussion">
            <title>4. Discussion</title>
            <p>This study demonstrates the efficacy of machine learning techniques in the early detection and differential diagnosis of benign and malignant breast lesions, with the Support Vector Classifier using a Radial Basis Function (SVC-RBF) kernel emerging as the most accurate model. Achieving a remarkable accuracy of 99% on the Wisconsin Breast Cancer Diagnostic dataset, the SVC-RBF model exhibited superior precision (99% for benign and 98% for malignant cases), sensitivity (99% and 98% for benign and malignant cases, respectively), and specificity, with robust F1 scores for both classes. These results underscore its robustness and reliability in minimizing diagnostic errors, making it highly suited for clinical applications.</p>
            <p>Exploratory data analysis (EDA), including violin plots, joint plots, and correlation matrices, revealed critical features such as texture mean, area (se), and area (worst), which were pivotal for classification. These insights enabled feature selection, improving the model&#x2019;s accuracy while reducing redundancy. Comparatively, features like fractal dimension mean and concavity worst demonstrated limited diagnostic value.</p>
            <p>The findings surpass prior studies in terms of model performance. For instance, M. Tahmooresi et al.
                <sup>
                    <xref ref-type="bibr" rid="ref18">18</xref>
                </sup> reported an SVM accuracy of 94%, while Shen et al.
                <sup>
                    <xref ref-type="bibr" rid="ref19">19</xref>
                </sup> developed a deep learning algorithm for breast cancer detection on mammograms using an &#x201c;end-to-end&#x201d; approach, achieving high accuracy across heterogeneous datasets such as CBIS-DDSM (AUC: 0.91) and INbreast (AUC: 0.98). This improvement is attributed to advanced preprocessing techniques, such as robust scaling and hyperparameter tuning, combined with a comprehensive evaluation framework. Kayode et al.
                <sup>
                    <xref ref-type="bibr" rid="ref20">20</xref>
                </sup>&#x2019;s SVM model achieved a sensitivity of 94.4% and specificity of 91.3%, and Debelee et al.
                <sup>
                    <xref ref-type="bibr" rid="ref21">21</xref>
                </sup> reported 99% accuracy on the BGH dataset. While these results are comparable, this study&#x2019;s comprehensive evaluation, including confusion matrix-derived metrics, adds rigor to the findings. Similarly, Suh et al.
                <sup>
                    <xref ref-type="bibr" rid="ref22">22</xref>
                </sup> explored neural network models, such as DenseNet-169 and EfficientNet-B5, achieving AUCs of 0.952&#x2013;0.954. However, these models require larger datasets and computational resources, unlike the efficient SVC-RBF model used here. Notably, Viswanath et al.
                <sup>
                    <xref ref-type="bibr" rid="ref23">23</xref>
                </sup>&#x2019;s Random Forest model showed balanced performance (accuracy 84.84%, precision 90%, specificity 89%), yet it underperformed compared to the SVC-RBF model in this study, emphasizing the latter&#x2019;s ability to capture non-linear relationships in high-dimensional datasets.</p>
            <p>Hussain et al. (2024)
                <sup>
                    <xref ref-type="bibr" rid="ref24">24</xref>
                </sup> provide a comprehensive review of machine learning models for breast cancer risk prediction, analyzing key algorithms such as deep learning, decision trees, support vector machines, and ensemble learning. Their study highlights the significance of dataset selection, feature engineering, and model interpretability in improving predictive accuracy. While their work offers a broad overview of machine learning in cancer diagnostics, our study focuses specifically on the Support Vector Classifier with an RBF kernel (SVC-RBF), evaluating its robustness and optimization for cancer classification. Additionally, while Hussain et al. discuss challenges such as dataset bias and feature selection, we extend this discussion by assessing kernel-based optimization and hyperparameter tuning, which play a crucial role in improving predictive performance in imaging-based diagnostics. Similarly, Uthamacumaran et al. (2023)
                <sup>
                    <xref ref-type="bibr" rid="ref25">25</xref>
                </sup> introduce a novel machine intelligence-driven classification approach for extracellular vesicles derived from cancer patients using fluorescence correlation spectroscopy (FCS). Their study emphasizes the potential of machine learning in non-invasive cancer diagnostics by combining FCS data with deep learning models and advanced feature extraction techniques. While their work focuses on biomarker-based classification, our study applies SVC-RBF to imaging datasets, exploring its efficiency in structured imaging data rather than fluorescence-based biomarker detection. Additionally, while their research explores deep learning techniques, our work investigates the interpretability and efficacy of kernel-based supervised learning approaches in cancer classification.</p>
            <p>The SVC-RBF model offers significant advantages. Its transparency, facilitated by interpretability techniques and visual tools, ensures trust among clinicians, enhancing its potential as a decision-support tool. Moreover, the model&#x2019;s efficiency in prioritizing high-risk cases and reducing diagnostic workloads aligns with the goal of improving patient outcomes. This study demonstrates the efficacy of machine learning techniques in the early detection and differential diagnosis of benign and malignant breast lesions, with the Support Vector Classifier using a Radial Basis Function (SVC-RBF) kernel emerging as the most accurate model.</p>
            <p>This study has several limitations that should be acknowledged. One key limitation is the reliance on a limited and non-diverse dataset, specifically the Wisconsin Breast Cancer Dataset (WBCD), which may affect the generalizability of the findings. While WBCD is widely used in breast cancer classification research, its applicability to real-world clinical settings remains uncertain. Future studies should incorporate larger, more diverse datasets from different demographics and imaging modalities, such as The Cancer Genome Atlas (TCGA) or multi-center datasets, to enhance external validation. Additionally, integrating multimodal imaging data, including mammography, MRI, and histopathology, could provide a more comprehensive diagnostic framework. Another limitation relates to the complexity of the Support Vector Classification with a Radial Basis Function (SVC-RBF) kernel. While the RBF kernel provides high accuracy by capturing non-linear relationships in the data, it requires significant computational resources and lacks interpretability. Alternative approaches, such as simpler models like logistic regression or explainability-enhanced deep learning models, should be explored to balance accuracy with interpretability. Moreover, dimensionality reduction techniques such as Principal Component Analysis (PCA) or t-SNE could improve computational efficiency and provide clearer insights into key features. The feature selection process in this study relies on correlation analysis and visualization, which, while effective, does not fully account for complex feature interactions. Future work should incorporate advanced feature selection techniques, including SHAP (Shapley Additive Explanations) for feature importance analysis and ensemble learning approaches to optimize model performance. 
Additionally, this study primarily focuses on feature distribution and classification performance but does not include statistical significance testing, such as p-values and confidence intervals. Incorporating these quantitative measures in future studies would strengthen the robustness of feature separability analysis and improve the reliability of the findings.</p>
        </sec>
        <sec id="sec24" sec-type="conclusion">
            <title>5. Conclusion</title>
            <p>Breast cancer diagnosis and treatment may be revolutionized by machine learning approaches that provide early detection, leading to more efficient therapeutic interventions. In a multi-centre study, larger datasets from different institutions can be accessed by applying different machine learning approaches. Early detection of breast cancer is key to slowing down the progression of the disease and reducing mortality rates. By leveraging the data from multiple institutions, machine learning can help identify breast cancer more quickly and accurately, leading to earlier intervention and better patient outcomes. With earlier intervention, the risk of mortality can be significantly reduced, leading to better patient outcomes and an overall improvement in public health.</p>
        </sec>
        <sec id="sec25">
            <title>Ethics and consent</title>
            <p>The dataset was extracted from the online open-source Wisconsin (Diagnostics) dataset. Study approval was obtained from the Institutional Research Committee of Manipal College of Health Professions, Manipal on the 20
                <sup>th</sup> of January 2022 (MCHP/Mpl/IRC/PG/2022/04). All procedures adhered to established ethical guidelines for secondary data analysis and data use policies. Consent is not applicable since the data was extracted from the online open-source Wisconsin (Diagnostics) dataset.</p>
        </sec>
    </body>
    <back>
        <sec id="sec28" sec-type="data-availability">
            <title>Data availability</title>
            <p>Kaggle: Wisconsin Breast Cancer Dataset, 
                <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data">https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data</ext-link>
            </p>
            <p>The data sets of mammography with benign and malignant breast lesions.</p>
            <p>Data are available under the terms of the 
                <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International license</ext-link> (CC BY-NC-SA 4.0).</p>
        </sec>
        <ref-list>
            <title>References</title>
            <ref id="ref1">
                <label>1</label>
                <mixed-citation publication-type="other">
                    <article-title>Cancer site ranking.</article-title>
                    <year>n.d</year>.</mixed-citation>
            </ref>
            <ref id="ref2">
                <label>2</label>
                <mixed-citation publication-type="other">
                    <article-title>Report of National Cancer Registry Programme, 2020 A scientific way to understand about Cancer.</article-title>
                    <year>n.d</year>.</mixed-citation>
            </ref>
            <ref id="ref3">
                <label>3</label>
                <mixed-citation publication-type="other">
                    <article-title>Global Breast Cancer Initiative Implementation Framework Assessing, strengthening and scaling up services for the early detection and management of breast cancer.</article-title>
                    <year>n.d</year>.</mixed-citation>
            </ref>
            <ref id="ref4">
                <label>4</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Harbeck</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Penault-Llorca</surname>
                            <given-names>F</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cortes</surname>
                            <given-names>J</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Breast cancer.</article-title>
                    <source>

                        <italic toggle="yes">Nat. Rev. Dis. Primers.</italic>
</source>
                    <year>2019</year>;<volume>5</volume>.
                    <pub-id pub-id-type="doi">10.1038/s41572-019-0111-2</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref5">
                <label>5</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Chen</surname>
                            <given-names>ZH</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lin</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Wu</surname>
                            <given-names>CF</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Artificial intelligence for assisting cancer diagnosis and treatment in the era of precision medicine.</article-title>
                    <source>

                        <italic toggle="yes">Cancer Commun.</italic>
</source>
                    <year>2021</year>;<volume>41</volume>:<fpage>1100</fpage>&#x2013;<lpage>1115</lpage>.
                    <pub-id pub-id-type="pmid">34613667</pub-id>
                    <pub-id pub-id-type="doi">10.1002/cac2.12215</pub-id>
                    <pub-id pub-id-type="pmcid">PMC8626610</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref6">
                <label>6</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Bhise</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Bepari</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gadekar</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Breast Cancer Detection using Machine Learning Techniques.</article-title>
                    <year>n.d</year>.</mixed-citation>
            </ref>
            <ref id="ref7">
                <label>7</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Luchini</surname>
                            <given-names>C</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Pea</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Scarpa</surname>
                            <given-names>A</given-names>
                        </name>
</person-group>:
                    <article-title>Artificial intelligence in oncology: current applications and future perspectives.</article-title>
                    <source>

                        <italic toggle="yes">Br. J. Cancer.</italic>
</source>
                    <year>2022</year>;<volume>126</volume>:<fpage>4</fpage>&#x2013;<lpage>9</lpage>.
                    <pub-id pub-id-type="pmid">34837074</pub-id>
                    <pub-id pub-id-type="doi">10.1038/s41416-021-01633-1</pub-id>
                    <pub-id pub-id-type="pmcid">PMC8727615</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref8">
                <label>8</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Liu</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Lei</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ou</surname>
                            <given-names>Y</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Mammography diagnosis of breast cancer screening through machine learning: a systematic review and meta-analysis.</article-title>
                    <source>

                        <italic toggle="yes">Clin. Exp. Med.</italic>
</source>
                    <year>2023</year>;<volume>23</volume>:<fpage>2341</fpage>&#x2013;<lpage>2356</lpage>.
                    <pub-id pub-id-type="pmid">36242643</pub-id>
                    <pub-id pub-id-type="doi">10.1007/s10238-022-00895-0</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref9">
                <label>9</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Vaka</surname>
                            <given-names>AR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Soni</surname>
                            <given-names>B</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sudheer Reddy</surname>
                            <given-names>K</given-names>
                        </name>
</person-group>:
                    <article-title>Breast cancer detection by leveraging Machine Learning.</article-title>
                    <source>

                        <italic toggle="yes">ICT Express.</italic>
</source>
                    <year>2020</year>;<volume>6</volume>:<fpage>320</fpage>&#x2013;<lpage>324</lpage>.
                    <pub-id pub-id-type="doi">10.1016/j.icte.2020.04.009</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref10">
                <label>10</label>
                <mixed-citation publication-type="book">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Gupta</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sharma</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Choudhary</surname>
                            <given-names>S</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <chapter-title>Performance Analysis of Machine Learning Classification Algorithms for Breast Cancer Diagnosis.</chapter-title>
                    <source>

                        <italic toggle="yes">2021 9th International Conference on Reliability, Infocom Technologies and Optimization (Trends and Future Directions), ICRITO 2021.</italic>
</source>
                    <publisher-name>Institute of Electrical and Electronics Engineers Inc.</publisher-name>;<year>2021</year>.
                    <pub-id pub-id-type="doi">10.1109/ICRITO51393.2021.9596230</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref11">
                <label>11</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Wolberg</surname>
                            <given-names>W</given-names>
                        </name>

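                        <name name-style="western">
                            <surname>Mangasarian</surname>
                            <given-names>O</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Street</surname>
                            <given-names>N</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Street</surname>
                            <given-names>W</given-names>
                        </name>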
</person-group>:
                    <data-title>Breast Cancer Wisconsin (Diagnostic).</data-title> [Dataset].
                    <source>

                        <italic toggle="yes">UCI Machine Learning Repository.</italic>
</source>
                    <year>1993</year>.</mixed-citation>
            </ref>
            <ref id="ref12">
                <label>12</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Van Rossum</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Drake</surname>
                            <given-names>FL</given-names>
                            <suffix>Jr</suffix>
                        </name>
</person-group>:
                    <article-title>Python reference manual.</article-title>
                    <publisher-loc>Amsterdam</publisher-loc>:<publisher-name>Centrum voor Wiskunde en Informatica</publisher-name>;<year>1995</year>.</mixed-citation>
            </ref>
            <ref id="ref13">
                <label>13</label>
                <mixed-citation publication-type="other">
                    <collab>The pandas development team</collab>:
                    <article-title>pandas-dev/pandas: Pandas.</article-title>
                    <source>

                        <italic toggle="yes">Zenodo.</italic>
</source>
                    <year>2020</year>.</mixed-citation>
            </ref>
            <ref id="ref14">
                <label>14</label>
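                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Harris</surname>
                            <given-names>CR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Millman</surname>
                            <given-names>KJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>van der Walt</surname>
                            <given-names>SJ</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Array programming with NumPy.</article-title>
                    <source>

                        <italic toggle="yes">Nature.</italic>
</source>
                    <year>2020</year>;<volume>585</volume>:<fpage>357</fpage>&#x2013;<lpage>362</lpage>.
                    <pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id>
                </mixed-citation>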
            </ref>
            <ref id="ref15">
                <label>15</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hunter</surname>
                            <given-names>JD</given-names>
                        </name>
</person-group>:
                    <article-title>Matplotlib: A 2D graphics environment.</article-title>
                    <source>

                        <italic toggle="yes">Comput. Sci. Eng.</italic>
</source>
                    <year>2007</year>;<volume>9</volume>:<fpage>90</fpage>&#x2013;<lpage>95</lpage>.
                    <pub-id pub-id-type="doi">10.1109/MCSE.2007.55</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref16">
                <label>16</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Waskom</surname>
                            <given-names>ML</given-names>
                        </name>
</person-group>:
                    <article-title>seaborn: statistical data visualization.</article-title>
                    <source>

                        <italic toggle="yes">J. Open Source Softw.</italic>
</source>
                    <year>2021</year>;<volume>6</volume>:<fpage>3021</fpage>.
                    <pub-id pub-id-type="doi">10.21105/joss.03021</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref17">
                <label>17</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Pedregosa</surname>
                            <given-names>F</given-names>
                        </name>

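                        <name name-style="western">
                            <surname>Varoquaux</surname>
                            <given-names>G</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gramfort</surname>
                            <given-names>A</given-names>
                        </name>

                        <etal/>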
</person-group>:
                    <article-title>Scikit-learn: Machine learning in Python.</article-title>
                    <source>

                        <italic toggle="yes">J. Mach. Learn. Res.</italic>
</source>
                    <year>2011</year>;<volume>12</volume>:<fpage>2825</fpage>&#x2013;<lpage>2830</lpage>.</mixed-citation>
            </ref>
            <ref id="ref18">
                <label>18</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Tahmooresi</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Afshar</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rad</surname>
                            <given-names>BB</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Early Detection of Breast Cancer Using Machine Learning Techniques.</article-title>
                    <year>n.d</year>.</mixed-citation>
            </ref>
            <ref id="ref19">
                <label>19</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Shen</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Margolies</surname>
                            <given-names>LR</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Rothstein</surname>
                            <given-names>JH</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Deep Learning to Improve Breast Cancer Detection on Screening Mammography.</article-title>
                    <source>

                        <italic toggle="yes">Sci. Rep.</italic>
</source>
                    <year>2019</year>;<volume>9</volume>:<fpage>12495</fpage>.
                    <pub-id pub-id-type="pmid">31467326</pub-id>
                    <pub-id pub-id-type="doi">10.1038/s41598-019-48995-4</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6715802</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref20">
                <label>20</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Kayode</surname>
                            <given-names>AA</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Akande</surname>
                            <given-names>NO</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Adegun</surname>
                            <given-names>AA</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>An automated mammogram classification system using modified support vector machine.</article-title>
                    <source>

                        <italic toggle="yes">Med. Devices (Auckl).</italic>
</source>
                    <year>2019</year>;<volume>12</volume>:<fpage>275</fpage>&#x2013;<lpage>284</lpage>.
                    <pub-id pub-id-type="pmid">31496841</pub-id>
                    <pub-id pub-id-type="doi">10.2147/MDER.S206973</pub-id>
                    <pub-id pub-id-type="pmcid">PMC6697673</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref21">
                <label>21</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Debelee</surname>
                            <given-names>TG</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Gebreselasie</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Schwenker</surname>
                            <given-names>F</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Classification of mammograms using texture and CNN based extracted features.</article-title>
                    <source>

                        <italic toggle="yes">J. Biomim. Biomater. Biomed. Eng.</italic>
</source>
                    <year>2019</year>;<volume>42</volume>:<fpage>79</fpage>&#x2013;<lpage>97</lpage>.
                    <pub-id pub-id-type="doi">10.4028/www.scientific.net/JBBBE.42.79</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref22">
                <label>22</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Suh</surname>
                            <given-names>YJ</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Jung</surname>
                            <given-names>J</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Cho</surname>
                            <given-names>BJ</given-names>
                        </name>
</person-group>:
                    <article-title>Automated breast cancer detection in digital mammograms of various densities via deep learning.</article-title>
                    <source>

                        <italic toggle="yes">J. Pers. Med.</italic>
</source>
                    <year>2020</year>;<volume>10</volume>:<fpage>1</fpage>&#x2013;<lpage>11</lpage>.
                    <pub-id pub-id-type="pmid">33172076</pub-id>
                    <pub-id pub-id-type="doi">10.3390/jpm10040211</pub-id>
                    <pub-id pub-id-type="pmcid">PMC7711783</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref23">
                <label>23</label>
                <mixed-citation publication-type="other">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Viswanath</surname>
                            <given-names>H</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Guachi-Guachi</surname>
                            <given-names>L</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Thirumuruganandham</surname>
                            <given-names>SP</given-names>
                        </name>
</person-group>:
                    <article-title>Breast Cancer Detection Using Image Processing Techniques and Classification Algorithms.</article-title>
                    <source>

                        <italic toggle="yes">EasyChair Preprint.</italic>
</source>
                    <year>2019</year>.</mixed-citation>
            </ref>
            <ref id="ref24">
                <label>24</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Hussain</surname>
                            <given-names>S</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Ali</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Naseem</surname>
                            <given-names>U</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Breast cancer risk prediction using machine learning: a systematic review.</article-title>
                    <source>

                        <italic toggle="yes">Front. Oncol.</italic>
</source>
                    <year>2024</year>;<volume>14</volume>:<elocation-id>1343627</elocation-id>.
                    <pub-id pub-id-type="pmid">38571502</pub-id>
                    <pub-id pub-id-type="doi">10.3389/fonc.2024.1343627</pub-id>
                    <pub-id pub-id-type="pmcid">PMC10987819</pub-id>
                </mixed-citation>
            </ref>
            <ref id="ref25">
                <label>25</label>
                <mixed-citation publication-type="journal">
                    <person-group person-group-type="author">

                        <name name-style="western">
                            <surname>Uthamacumaran</surname>
                            <given-names>A</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Abdouh</surname>
                            <given-names>M</given-names>
                        </name>

                        <name name-style="western">
                            <surname>Sengupta</surname>
                            <given-names>K</given-names>
                        </name>

                        <etal/>
</person-group>:
                    <article-title>Machine intelligence-driven classification of cancer patients-derived extracellular vesicles using fluorescence correlation spectroscopy: results from a pilot study.</article-title>
                    <source>

                        <italic toggle="yes">Neural Comput. Appl.</italic>
</source>
                    <year>2023</year>;<volume>35</volume>(<issue>11</issue>):<fpage>8407</fpage>&#x2013;<lpage>8422</lpage>.
                    <pub-id pub-id-type="doi">10.1007/s00521-022-08113-4</pub-id>
                </mixed-citation>
            </ref>
        </ref-list>
    </back>
    <sub-article article-type="reviewer-report" id="report377120">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.180235.r377120</article-id>
            <title-group>
                <article-title>Reviewer response for version 2</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Uthamacumaran</surname>
                        <given-names>Abicumaran</given-names>
                    </name>
                    <xref ref-type="aff" rid="r377120a1">1</xref>
                    <role>Referee</role>
                </contrib>
                <aff id="r377120a1">
                    <label>1</label>McGill University (Ringgold ID: 5620), Montr&#x00e9;al, Qu&#x00e9;bec, Canada</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>12</day>
                <month>5</month>
                <year>2025</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2025 Uthamacumaran A</copyright-statement>
                <copyright-year>2025</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport377120" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.161073.2"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The study has incorporated the previous concerns of the data. Now with the five-fold cross-validation, the overfitting has been addressed as clearly shown by the 1.0 accuracy of the Decision Tree in training. While the general issue with ML studies without a validation dataset is this overfitting, i.e., the model might memorize the data rather than learning distinguishable patterns, the improvements are in good standards. This has been addressed in the limitations and future directions.</p>
            <p> </p>
            <p> This study relies primarily on visual methods, such as violin plots, for feature selection. Although proven effective with the ML performance, there might be nonlinear patterns that this method misses but an algorithm like SVC-RBF is 'capturing'.&#x00a0;</p>
            <p> </p>
            <p> I approve this paper after a few clarifications or corrections:</p>
            <p> </p>
            <p> 1) The Discussion states "Its transparency, facilitated by interpretability techniques and visual tools, ensures trust among clinicians, enhancing its potential as a decision-support tool." This is misleading. SVC-RBF in itself is a 'black box' ML approach. It is not easily 'explainable' or interpretable. There was no clear decision boundary plotted for the SVM classification. Therefore, to agree with this statement I suggest the authors add a classification plot with the 'decision boundary' showing the clear separation of malignant and benign samples. Or some other 'interpretability technique' should be offered (e.g., Gini entropy, feature importance, salience maps, etc.).</p>
            <p> </p>
            <p> 2) The results over-emphasize accuracy as the primary metric. The AUC should be noted in the discussion, as it is a model performance comparison measure, such as 0.96 for the RBF-SVC. Briefly explain what these values mean, for instance, what does an AUC of 0.96 indicate about that algorithm on this dataset.&#x00a0;</p>
            <p> </p>
            <p> 3) The Methods can be made more transparent. For instance, what are the hyperparameters of the algorithms? How was the cross-validation performed? etc. Sections 2.3-2.5 can use some more details to ensure replicability. Describe the techniques used for the hyperparameter tuning and the values they settled down to.&#x00a0;</p>
            <p> </p>
            <p> 4) On a final note, I&#x00a0;recommend plotting ROC curves or reporting AUC values and other appropriate metrics for the top individual features reported (e.g., texture mean) to assess their standalone discriminative power and pave the way for better feature selection or translatability in future research.</p>
            <p> </p>
            <p> Overall, great work. Looking forward to the final form.&#x00a0;</p>
            <p> </p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Partly</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Partly</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>No</p>
            <p>Reviewer Expertise:</p>
            <p>AI, Machine learning, Bioinformatics, and Systems Oncology</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
        <sub-article article-type="response" id="comment13909-377120">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Saha</surname>
                            <given-names>Dola</given-names>
                        </name>
                        <aff>Health Information Management, Manipal Academy of Higher Education, Manipal, Karnataka, India</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>Nil</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>15</day>
                    <month>5</month>
                    <year>2025</year>
                </pub-date>
            </front-stub>
            <body>
                <p>1) The Discussion states "Its transparency, facilitated by interpretability techniques and visual tools, ensures trust among clinicians, enhancing its potential as a decision-support tool." This is misleading. SVC-RBF in itself is a 'black box' ML approach. It is not easily 'explainable' or interpretable. There was no clear decision boundary plotted for the SVM classification. Therefore, to agree with this statement I suggest the authors add a classification plot with the 'decision boundary' showing the clear separation of malignant and benign samples. Or some other 'interpretability technique' should be offered (e.g., Gini entropy, feature importance, salience maps, etc.).</p>
                <p> 
                    <bold>Response)</bold> The SVC-RBF model offers significant advantages in terms of classification performance, demonstrating high accuracy, sensitivity, and specificity in distinguishing between benign and malignant lesions. While the model operates as a black-box algorithm with limited inherent interpretability, its strong predictive capability makes it a valuable candidate for decision-support applications in clinical settings. To enhance clinician trust and eventual translatability, future work will focus on integrating model-agnostic interpretability techniques, such as SHAP values or feature attribution methods, to improve transparency and support clinical decision-making.</p>
                <p> </p>
                <p> 2) The results over-emphasize accuracy as the primary metric. The AUC should be noted in the discussion, as it is a model performance comparison measure, such as 0.96 for the RBF-SVC. Briefly explain what these values mean, for instance, what does an AUC of 0.96 indicate about that algorithm on this dataset.&#x00a0;</p>
                <p> 
                    <bold>Response)</bold> We agree that relying solely on accuracy can be misleading, especially in imbalanced datasets. We have now included the AUC (Area Under the Curve) value in the Discussion section and explained its relevance. Specifically, for the SVC-RBF model, an AUC of 0.96 indicates excellent discriminatory ability &#x2014; that is, there is a 96% probability that the model assigns a higher malignancy score to a randomly chosen malignant case than to a randomly chosen benign case, a measure that summarizes performance across all possible classification thresholds. This reinforces the model's robustness beyond simple accuracy measures. The revised discussion reflects this clarification.</p>
                <p> </p>
                <p> 3)&#x00a0;The Methods can be made more transparent. For instance, what are the hyperparameters of the algorithms? How was the cross-validation performed? etc. Sections 2.3-2.5 can use some more details to ensure replicability. Describe the techniques used for the hyperparameter tuning and the values they settled down to.</p>
                <p> 
                    <bold>Response)&#x00a0;</bold>To improve transparency and replicability, we have expanded the Methods section (Sections 2.3&#x2013;2.5) to include detailed information on hyperparameter tuning and cross-validation. Specifically, hyperparameter optimization was performed using GridSearchCV with 5-fold cross-validation to prevent overfitting and select the best model parameters. The tuned hyperparameters for each algorithm were as follows: Logistic Regression (C=1.0), Support Vector Classifiers (C=1.0 for linear kernel, and C=1.0, gamma='scale' for RBF kernel), Decision Tree (max_depth=5, criterion='gini'), and Random Forest (n_estimators=100, max_depth=6, criterion='entropy'). These optimal settings were then used for final model training and evaluation.</p>
                <p> </p>
                <p> 4)&#x00a0;On a final note, I&#x00a0;recommend plotting ROC curves or reporting AUC values and other appropriate metrics for the top individual features reported (e.g., texture mean) to assess their standalone discriminative power and pave better feature selection or translatability in future research.</p>
                <p> 
                    <bold>Response)&#x00a0;</bold>We appreciate the reviewer&#x2019;s suggestion to evaluate the standalone discriminative power of individual features through ROC curves and AUC metrics. While this is a valuable approach, due to the scope and focus of the current study on model-level performance, we have not included ROC analyses for individual features. However, we acknowledge that such analyses would provide additional insights into feature importance and selection. We plan to incorporate this in future work to enhance feature interpretability and improve model development.</p>
            </body>
        </sub-article>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report377119">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.180235.r377119</article-id>
            <title-group>
                <article-title>Reviewer response for version 2</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Gonzales Martinez</surname>
                        <given-names>Rolando</given-names>
                    </name>
                    <xref ref-type="aff" rid="r377119a1">1</xref>
                    <role>Referee</role>
                </contrib>
                <aff id="r377119a1">
                    <label>1</label>University of Groningen, Groningen, The Netherlands</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>7</day>
                <month>5</month>
                <year>2025</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2025 Gonzales Martinez R</copyright-statement>
                <copyright-year>2025</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport377119" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.161073.2"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>reject</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>As in my previous review, I would like to highlight that, due to the data imbalance, accuracy is not the best measure to compare models, and while F1-score is ok, in the case of the early detection of breast cancer, I again advise the authors to include the false negative rate (FNR) and the false omission rate (FOR) as performance metrics to compare algorithms, as in Gonzales-Martinez and van Dongen (2023) [Ref 1]. The reasons are described below:</p>
            <p> </p>
            <p> 1) FNR measures the proportion of actual positive cases of breast cancer that are incorrectly classified as negative cases; it quantifies the rate of missed positives (Type II errors), and hence a high FNR implies late detection of anomalies.</p>
            <p> </p>
            <p> 2) FOR measures the proportion of cases classified as negative that are in fact positive (false negatives among all negative predictions), i.e., the incorrect omissions in a decision-making process. As FOR captures failures to detect breast cancer, it is also relevant in the comparison of machine learning and deep learning algorithms, because missing the detection of a positive condition of breast cancer can have significant health consequences for cancer patients.</p>
            <p> </p>
            <p> The validity of the findings and the conclusions linked to the findings should be evaluated on the basis of the lowest FNR and FOR, and not only on the accuracy of the ML and DL algorithms, since the high level of accuracy found in the paper may be indicative of imbalance problems.</p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Partly</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Partly</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Machine learning and deep learning applied to health</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to state that I do not consider it to be of an acceptable scientific standard, for reasons outlined above.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-377119-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author">
                            <name>
                                <surname>Gonzales Martinez</surname>
                                <given-names>R</given-names>
                            </name>,
                            <name>
                                <surname>van Dongen</surname>
                            </name>
                        </person-group>:
                        <article-title>Deep learning algorithms for the early detection of breast cancer: A comparative study with traditional machine learning</article-title>.
                        <source>
                            <italic>Informatics in Medicine Unlocked</italic>
                        </source>.<year>2023</year>;<volume>41</volume>:
                        <elocation-id>101317</elocation-id>
                        <pub-id pub-id-type="doi">10.1016/j.imu.2023.101317</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report364943">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.177056.r364943</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Gonzales Martinez</surname>
                        <given-names>Rolando</given-names>
                    </name>
                    <xref ref-type="aff" rid="r364943a1">1</xref>
                    <role>Referee</role>
                </contrib>
                <aff id="r364943a1">
                    <label>1</label>University of Groningen, Groningen, The Netherlands</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>11</day>
                <month>3</month>
                <year>2025</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2025 Gonzales Martinez R</copyright-statement>
                <copyright-year>2025</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport364943" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.161073.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>reject</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The study applied machine learning models for early detection and classification of breast cancer using the Wisconsin Breast Cancer Diagnostic dataset.&#x00a0;Five supervised ML algorithms (Logistic Regression, Support Vector Classification (SVC) with linear and radial basis function (RBF) kernels, Decision Tree, and Random Forest) were implemented and evaluated using performance metrics, including accuracy, precision, sensitivity, specificity, and F1 scores.</p>
            <p> </p>
            <p> Major comments:</p>
            <p> The study properly implemented ML algorithms; however, due to the data imbalance, accuracy is not the best measure to compare models, but the F1-score is ok. Also,&#x00a0;in the case of the early detection of breast cancer, I advise the authors to include the false negative rate (FNR) and the false omission rate (FOR) as performance metrics to compare algorithms. The reasons are described below:</p>
            <p> </p>
            <p> 1) FNR measures the proportion of actual positive cases of breast cancer that are incorrectly classified as negative cases; it quantifies the rate of missed positives (Type II errors), and hence a high FNR implies late detection of anomalies.</p>
            <p> </p>
            <p> 2) FOR measures the proportion of cases classified as negative that are in fact positive (false negatives among all negative predictions), i.e., the incorrect omissions in a decision-making process. As FOR captures failures to detect breast cancer, it is also relevant in the comparison of machine learning and deep learning algorithms, because missing the detection of a positive condition of breast cancer can have significant health consequences for cancer patients.</p>
            <p> </p>
            <p> Thus, I suggest that the authors include these metrics in the core evaluation of their proposed models, as in Gonzales-Martinez and van Dongen (2023) [Ref 1].</p>
            <p> </p>
            <p> The validity of the findings and the conclusions linked to the findings should be evaluated on the basis of the lowest FNR and FOR, and not only on the accuracy of the ML and DL algorithms, since the high level of accuracy found in the paper may be indicative of imbalance problems.</p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Partly</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Yes</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Partly</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>Yes</p>
            <p>Reviewer Expertise:</p>
            <p>Machine learning and deep learning applied to health</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to state that I do not consider it to be of an acceptable scientific standard, for reasons outlined above.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-364943-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
                        <person-group person-group-type="author">
                            <name>
                                <surname>Gonzales Martinez</surname>
                                <given-names>R</given-names>
                            </name>,
                            <name>
                                <surname>van Dongen</surname>
                            </name>
                        </person-group>:
                        <article-title>Deep learning algorithms for the early detection of breast cancer: A comparative study with traditional machine learning</article-title>.
                        <source>
                            <italic>Informatics in Medicine Unlocked</italic>
                        </source>.<year>2023</year>;<volume>41</volume>:
                        <elocation-id>101317</elocation-id>
                        <pub-id pub-id-type="doi">10.1016/j.imu.2023.101317</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
    </sub-article>
    <sub-article article-type="reviewer-report" id="report367118">
        <front-stub>
            <article-id pub-id-type="doi">10.5256/f1000research.177056.r367118</article-id>
            <title-group>
                <article-title>Reviewer response for version 1</article-title>
            </title-group>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Uthamacumaran</surname>
                        <given-names>Abicumaran</given-names>
                    </name>
                    <xref ref-type="aff" rid="r367118a1">1</xref>
                    <role>Referee</role>
                </contrib>
                <aff id="r367118a1">
                    <label>1</label>McGill University (Ringgold ID: 5620), Montr&#x00e9;al, Qu&#x00e9;bec, Canada</aff>
            </contrib-group>
            <author-notes>
                <fn fn-type="conflict">
                    <p>
                        <bold>Competing interests: </bold>No competing interests were disclosed.</p>
                </fn>
            </author-notes>
            <pub-date pub-type="epub">
                <day>10</day>
                <month>3</month>
                <year>2025</year>
            </pub-date>
            <permissions>
                <copyright-statement>Copyright: &#x00a9; 2025 Uthamacumaran A</copyright-statement>
                <copyright-year>2025</copyright-year>
                <license xlink:href="https://creativecommons.org/licenses/by/4.0/">
                    <license-p>This is an open access peer review report distributed under the terms of the Creative Commons Attribution Licence, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
                </license>
            </permissions>
            <related-article ext-link-type="doi" id="relatedArticleReport367118" related-article-type="peer-reviewed-article" xlink:href="10.12688/f1000research.161073.1"/>
            <custom-meta-group>
                <custom-meta>
                    <meta-name>recommendation</meta-name>
                    <meta-value>approve-with-reservations</meta-value>
                </custom-meta>
            </custom-meta-group>
        </front-stub>
        <body>
            <p>The paper presents a clear and well-grounded objective, with adequate feature selection and methodology. The use of multiple ML metrics is appreciated and adds value to support the thesis. However, some central issues need to be revised:</p>
            <p> </p>
            <p> 1) Literature Gaps: A few reviews on machine learning approaches in cancer diagnostics as an emerging paradigm can greatly benefit the readership and the rationale for your approach. Some examples are:</p>
            <p> </p>
            <p> Hussain, S., et al., (2024). [Ref-1]</p>
            <p> </p>
            <p> Uthamacumaran, A., et al.,&#x00a0;(2023). [Ref-2]</p>
            <p> </p>
            <p> </p>
            <p> 2) Limitations section is needed and should address the following:</p>
            <p> </p>
            <p> - The study relies solely on the Wisconsin dataset. Does this generalize to other datasets? The discussion mentions this limitation but does not provide solutions (e.g., external validation with larger datasets, multimodal imaging data). Either present a validation or argue for why this validation holds.</p>
            <p> </p>
            <p> -&#x00a0;The feature selection process relies on correlation analysis and visualization but does not explain why this is robust. For instance, what would a PCA analysis do? See Uthamacumaran et al. above for such techniques in dimensionality reduction.</p>
            <p> </p>
            <p> -Interpretability of the SVC kernel choice. You said RBF is accurate but why? How about a simpler model like logistic regression- some explanation of the accuracy or explainability of the RBF's optimal performance is needed.</p>
            <p> </p>
            <p> 3) In extension to point #2, the results are lacking validation ROC curves. There are no confusion matrices or ROC curves present. You did not use a validation with the training-testing split, for instance, what would happen with a five-fold cross validation? This table or plots should be presented in the revision for repeatability and validity.</p>
            <p> </p>
            <p> 4)&#x00a0;Some graphs (e.g., violin plots) are useful but lack quantitative annotations (e.g., exact p-values, confidence intervals for feature separability). Same for the first bar plot, there are no confidence intervals.</p>
            <p> </p>
            <p> 5) The codes are not clear. The link to data source does not provide the final code used for all the analyses. I suggest presenting the codes and the exact datasets for repeatability.&#x00a0;</p>
            <p> </p>
            <p> Overall, this paper presents great value and could be indexed upon addressing these ML practices concerns. Best of luck.</p>
            <p>Is the work clearly and accurately presented and does it cite the current literature?</p>
            <p>Partly</p>
            <p>If applicable, is the statistical analysis and its interpretation appropriate?</p>
            <p>Partly</p>
            <p>Are all the source data underlying the results available to ensure full reproducibility?</p>
            <p>Partly</p>
            <p>Is the study design appropriate and is the work technically sound?</p>
            <p>Yes</p>
            <p>Are the conclusions drawn adequately supported by the results?</p>
            <p>Yes</p>
            <p>Are sufficient details of methods and analysis provided to allow replication by others?</p>
            <p>No</p>
            <p>Reviewer Expertise:</p>
            <p>AI, machine learning, bioinformatics, and precision oncology</p>
            <p>I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.</p>
        </body>
        <back>
            <ref-list>
                <title>References</title>
                <ref id="rep-ref-367118-1">
                    <label>1</label>
                    <mixed-citation publication-type="journal">
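                        <person-group person-group-type="author">
                            <name>
                                <surname>Hussain</surname>
                                <given-names>S</given-names>
                            </name>,
                            <name>
                                <surname>Ali</surname>
                                <given-names>M</given-names>
                            </name>,
                            <name>
                                <surname>Naseem</surname>
                                <given-names>U</given-names>
                            </name>,
                            <name>
                                <surname>Nezhadmoghadam</surname>
                                <given-names>F</given-names>
                            </name>,
                            <etal/>
                        </person-group>:
                        <article-title>Breast cancer risk prediction using machine learning: a systematic review</article-title>.
                        <source>
                            <italic>Front Oncol</italic>
                        </source>.<year>2024</year>;<volume>14</volume>:
                        <elocation-id>1343627</elocation-id>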
                        <pub-id pub-id-type="pmid">38571502</pub-id>
                        <pub-id pub-id-type="doi">10.3389/fonc.2024.1343627</pub-id>
                    </mixed-citation>
                </ref>
                <ref id="rep-ref-367118-2">
                    <label>2</label>
                    <mixed-citation publication-type="journal">
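                        <person-group person-group-type="author">
                            <name>
                                <surname>Uthamacumaran</surname>
                                <given-names>A</given-names>
                            </name>,
                            <name>
                                <surname>Abdouh</surname>
                                <given-names>M</given-names>
                            </name>,
                            <name>
                                <surname>Sengupta</surname>
                                <given-names>K</given-names>
                            </name>,
                            <name>
                                <surname>Gao</surname>
                                <given-names>Z</given-names>
                            </name>,
                            <etal/>
                        </person-group>:
                        <article-title>Machine intelligence-driven classification of cancer patients-derived extracellular vesicles using fluorescence correlation spectroscopy: results from a pilot study</article-title>.
                        <source>
                            <italic>Neural Computing and Applications</italic>
                        </source>.<year>2023</year>;<volume>35</volume>(<issue>11</issue>):
                        <fpage>8407</fpage>-<lpage>8422</lpage>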
                        <pub-id pub-id-type="doi">10.1007/s00521-022-08113-4</pub-id>
                    </mixed-citation>
                </ref>
            </ref-list>
        </back>
        <sub-article article-type="response" id="comment13849-367118">
            <front-stub>
                <contrib-group>
                    <contrib contrib-type="author">
                        <name>
                            <surname>Saha</surname>
                            <given-names>Dola</given-names>
                        </name>
                        <aff>Health Information Management, Manipal Academy of Higher Education, Manipal, Karnataka, India</aff>
                    </contrib>
                </contrib-group>
                <author-notes>
                    <fn fn-type="conflict">
                        <p>
                            <bold>Competing interests: </bold>Nil</p>
                    </fn>
                </author-notes>
                <pub-date pub-type="epub">
                    <day>5</day>
                    <month>5</month>
                    <year>2025</year>
                </pub-date>
            </front-stub>
            <body>
                <p>
                    <bold>Sl. No. </bold>
                </p>
                <p> 
                    <bold>Reviewers' Comments </bold>
                </p>
                <p> 
                    <bold>Authors' Response </bold>
                </p>
                <p> </p>
                <p> 1.&#x00a0;Literature Gaps: A few reviews on machine learning approaches in cancer diagnostics as an emerging paradigm can greatly benefit the readership and the rationale for your approach. Some examples are:</p>
                <p> Hussain, S., et al., (2024). [Ref-1]</p>
                <p> Uthamacumaran, A., et al.,&#x00a0;(2023). [Ref-2]</p>
                <p> 1. Hussain S, Ali M, Naseem U, Nezhadmoghadam F, et al.: Breast cancer risk prediction using machine learning: a systematic review.
                    <italic>Front Oncol</italic>. 2024;&#x00a0;
                    <bold>14</bold>: 1343627&#x00a0;
                    <ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/pubmed/38571502">PubMed Abstract</ext-link>&#x00a0;|&#x00a0;
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fonc.2024.1343627">Publisher Full Text</ext-link>
                </p>
                <p> 2. Uthamacumaran A, Abdouh M, Sengupta K, Gao Z, et al.: Machine intelligence-driven classification of cancer patients-derived extracellular vesicles using fluorescence correlation spectroscopy: results from a pilot study.&#x00a0;
                    <italic>Neural Computing and Applications</italic>. 2023;&#x00a0;
                    <bold>35</bold>&#x00a0;(11): 8407-8422&#x00a0;
                    <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s00521-022-08113-4">Publisher Full Text</ext-link>
                </p>
                <p> 
                    <bold>Thank you for your valuable comments. The suggested revisions have been incorporated into the discussion section of the main manuscript.</bold>
                </p>
                <p> </p>
                <p> 2. Limitations section is needed and should address the following:</p>
                <p> The study relies solely on the Wisconsin dataset. Does this generalize to other datasets? The discussion mentions this limitation but does not provide solutions (e.g., external validation with larger datasets, multimodal imaging data). Either present a validation or argue for why this validation holds.</p>
                <p> The feature selection process relies on correlation analysis and visualization but does not explain why this is robust. For instance, what would a PCA analysis do? See Uthamacumaran et al. above for such techniques in dimensionality reduction.</p>
                <p> Interpretability of the SVC kernel choice. You said RBF is accurate but why? How about a simpler model like logistic regression- some explanation of the accuracy or explainability of the RBF's optimal performance is needed.</p>
                <p> 
                    <bold>Thank you for your valuable comments. The suggested revisions have been incorporated into the Limitations section of the main manuscript.</bold>
                </p>
                <p> </p>
                <p> 3. In extension to point #2, the results are lacking validation ROC curves. There are no confusion matrices or ROC curves present. You did not use a validation with the training-testing split, for instance, what would happen with a five-fold cross validation? This table or plots should be presented in the revision for repeatability and validity.</p>
                <p> 
                    <bold>Thank you for your valuable comments. The suggested revisions have been incorporated into the Results section of the main manuscript.</bold>
                </p>
                <p> </p>
                <p> 4. Some graphs (e.g., violin plots) are useful but lack quantitative annotations (e.g., exact p-values, confidence intervals for feature separability). Same for the first bar plot, there are no confidence intervals.</p>
                <p> 
                    <bold>Thank you for your valuable comments. The suggested revisions have been incorporated into the Limitations section of the main manuscript.</bold>
                </p>
                <p> </p>
                <p> 5. The codes are not clear. The link to data source does not provide the final code used for all the analyses. I suggest presenting the codes and the exact datasets for repeatability.&#x00a0;</p>
                <p> 
                    <bold>Thank you for your valuable comments. The codes are given below in this document.</bold>
                </p>
                <p> </p>
                <p> 
                    <bold>The codes used in this study are presented below</bold>
                </p>
                <p>
                    <bold> # Importing necessary libraries</bold>
                </p>
                <p>
                    <bold> import pandas as pd</bold>
                </p>
                <p>
                    <bold> import numpy as np</bold>
                </p>
                <p>
                    <bold> from sklearn.model_selection import train_test_split</bold>
                </p>
                <p>
                    <bold> from sklearn.preprocessing import RobustScaler, LabelEncoder</bold>
                </p>
                <p>
                    <bold> from sklearn.metrics import classification_report, confusion_matrix</bold>
                </p>
                <p>
                    <bold> from sklearn.linear_model import LogisticRegression</bold>
                </p>
                <p>
                    <bold> from sklearn.svm import SVC</bold>
                </p>
                <p>
                    <bold> from sklearn.tree import DecisionTreeClassifier</bold>
                </p>
                <p>
                    <bold> from sklearn.ensemble import RandomForestClassifier</bold>
                </p>
                <p>
                    <bold> import seaborn as sns</bold>
                </p>
                <p>
                    <bold> import matplotlib.pyplot as plt</bold>
                </p>
                <p>
                    <bold> # Loading the dataset</bold>
                </p>
                <p>
                    <bold> data = pd.read_csv('cancer_data.csv')</bold>
                </p>
                <p>
                    <bold> print(data.info())</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Preprocessing the dataset</bold>
                </p>
                <p>
                    <bold> # Dropping redundant columns</bold>
                </p>
                <p>
                    <bold> data = data.drop(['id', 'Unnamed: 32'], axis=1)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Label encoding the target variable (diagnosis)</bold>
                </p>
                <p>
                    <bold> le = LabelEncoder()</bold>
                </p>
                <p>
                    <bold> data['diagnosis'] = le.fit_transform(data['diagnosis'])&#x00a0; # Malignant (M): 1, Benign (B): 0</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Splitting features (X) and target variable (y)</bold>
                </p>
                <p>
                    <bold> X = data.drop('diagnosis', axis=1)</bold>
                </p>
                <p>
                    <bold> y = data['diagnosis']</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Train-test split (60:40 ratio)</bold>
                </p>
                <p>
                    <bold> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)</bold>
                </p>
                <p>
                    <bold> # Feature scaling using RobustScaler</bold>
                </p>
                <p>
                    <bold> scaler = RobustScaler()</bold>
                </p>
                <p>
                    <bold> X_train_scaled = scaler.fit_transform(X_train)</bold>
                </p>
                <p>
                    <bold> X_test_scaled = scaler.transform(X_test)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Visualizing key features (Violin plot for texture mean)</bold>
                </p>
                <p>
                    <bold> sns.violinplot(x='diagnosis', y='texture_mean', data=data)</bold>
                </p>
                <p>
                    <bold> plt.title('Violin Plot for Texture Mean vs Diagnosis')</bold>
                </p>
                <p>
                    <bold> plt.show()</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Visualizing correlation (Joint plot for concavity worst vs concave points worst)</bold>
                </p>
                <p>
                    <bold> sns.jointplot(x='concavity_worst', y='concave_points_worst', data=data, kind='scatter')</bold>
                </p>
                <p>
                    <bold> plt.title('Joint Plot for Concavity Worst vs Concave Points Worst')</bold>
                </p>
                <p>
                    <bold> plt.show()</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Model development and training</bold>
                </p>
                <p>
                    <bold> # Logistic Regression</bold>
                </p>
                <p>
                    <bold> log_reg = LogisticRegression()</bold>
                </p>
                <p>
                    <bold> log_reg.fit(X_train_scaled, y_train)</bold>
                </p>
                <p>
                    <bold> log_reg_pred = log_reg.predict(X_test_scaled)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # SVC - Linear Kernel</bold>
                </p>
                <p>
                    <bold> svc_linear = SVC(kernel='linear')</bold>
                </p>
                <p>
                    <bold> svc_linear.fit(X_train_scaled, y_train)</bold>
                </p>
                <p>
                    <bold> svc_linear_pred = svc_linear.predict(X_test_scaled)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # SVC - RBF Kernel</bold>
                </p>
                <p>
                    <bold> svc_rbf = SVC(kernel='rbf')</bold>
                </p>
                <p>
                    <bold> svc_rbf.fit(X_train_scaled, y_train)</bold>
                </p>
                <p>
                    <bold> svc_rbf_pred = svc_rbf.predict(X_test_scaled)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Decision Tree Classifier</bold>
                </p>
                <p>
                    <bold> decision_tree = DecisionTreeClassifier()</bold>
                </p>
                <p>
                    <bold> decision_tree.fit(X_train_scaled, y_train)</bold>
                </p>
                <p>
                    <bold> decision_tree_pred = decision_tree.predict(X_test_scaled)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Random Forest Classifier</bold>
                </p>
                <p>
                    <bold> random_forest = RandomForestClassifier()</bold>
                </p>
                <p>
                    <bold> random_forest.fit(X_train_scaled, y_train)</bold>
                </p>
                <p>
                    <bold> random_forest_pred = random_forest.predict(X_test_scaled)</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> # Evaluating models</bold>
                </p>
                <p>
                    <bold> models = {</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; "Logistic Regression": log_reg_pred,</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; "SVC Linear": svc_linear_pred,</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; "SVC RBF": svc_rbf_pred,</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; "Decision Tree": decision_tree_pred,</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; "Random Forest": random_forest_pred</bold>
                </p>
                <p>
                    <bold> }</bold>
                </p>
                <p>
                    <bold> </bold>
                </p>
                <p>
                    <bold> for name, predictions in models.items():</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; print(f"{name} Performance Metrics:")</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; print(classification_report(y_test, predictions))</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; print("Confusion Matrix:")</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; print(confusion_matrix(y_test, predictions))</bold>
                </p>
                <p>
                    <bold> &#x00a0;&#x00a0;&#x00a0; print("-" * 50)</bold>
                </p>
            </body>
        </sub-article>
    </sub-article>
</article>
