Breast Cancer Machine Learning Prediction. Scikit-learn is used for training, evaluation, and prediction; Seaborn and Matplotlib are used for visualization.
Run this in a Jupyter Notebook.
# LIBRARY IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# DATASET IMPORT
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
#VIEW DATA
cancer
cancer.keys()
print(cancer['DESCR'])
print(cancer['target_names'])
print(cancer['target'])
print(cancer['feature_names'])
print(cancer['data'])
cancer['data'].shape
df_cancer = pd.DataFrame(np.c_[cancer['data'], cancer['target']], columns = np.append(cancer['feature_names'], ['target']))
df_cancer.head()
df_cancer.tail()
# VISUALIZE DATA
# SEABORN PAIRPLOT
sns.pairplot(df_cancer, hue = 'target', vars = ['mean radius', 'mean texture', 'mean area', 'mean perimeter', 'mean smoothness'] )
# SEABORN COUNTPLOT
sns.countplot(x = 'target', data = df_cancer, label = "Count")
# SEABORN SCATTERPLOT
sns.scatterplot(x = 'mean area', y = 'mean smoothness', hue = 'target', data = df_cancer)
# SEABORN LMPLOT
sns.lmplot(x = 'mean area', y = 'mean smoothness', hue = 'target', data = df_cancer, fit_reg=False)
# SEABORN HEATMAP
plt.figure(figsize=(20,10))
sns.heatmap(df_cancer.corr(), annot=True)
# MODEL TRAINING
# DEFINING X and y
X = df_cancer.drop(['target'],axis=1)
X.head()
y = df_cancer['target']
y.head()
# TRAIN TEST SPLIT (80% TRAIN, 20% TEST)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=5)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
# IMPORT MODELS
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# TRAIN ON SVC MODEL
svc_model = SVC()
svc_model.fit(X_train, y_train)
# EVALUATING
# PREDICT
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
sns.heatmap(cm, annot=True)
print(classification_report(y_test, y_predict))
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        44
         1.0       0.61      1.00      0.76        70

 avg / total       0.38      0.61      0.47       114
# If the precision turns out to be terribly low (in this case the average precision is only about 38%), we need to normalize the data.
# IMPROVING MODEL
X_train.head()
# NORMALIZATION
# TRAIN DATA
min_train = X_train.min()
print(min_train)
max_train = X_train.max()
print(max_train)
range_train = max_train - min_train
print(range_train)
X_train_scaled = (X_train - min_train)/range_train
print(X_train_scaled)
# COMPARING EARLIER TRAIN DATA AND NORMALIZED TRAIN DATA
sns.scatterplot(x = X_train['mean area'], y = X_train['mean smoothness'], hue = y_train)
sns.scatterplot(x = X_train_scaled['mean area'], y = X_train_scaled['mean smoothness'], hue = y_train)
# TEST DATA
min_test = X_test.min()
max_test = X_test.max()
range_test = max_test - min_test
X_test_scaled = (X_test - min_test)/range_test
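# SIDE NOTE (not part of the original walkthrough): the same min-max normalization can also be done with
# scikit-learn's MinMaxScaler. This is only a sketch of an equivalent alternative; it fits the scaler on the
# training data and reuses the training min/range for the test data, which is the commonly recommended practice.
# The *_alt names are used here just to avoid overwriting the manually scaled data above.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()  # rescales each feature to the [0, 1] range
X_train_scaled_alt = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled_alt = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)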
# TRAIN AND PREDICT
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
svc_model = SVC()
svc_model.fit(X_train_scaled, y_train)
y_predict = svc_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_predict)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test,y_predict))
              precision    recall  f1-score   support

         0.0       0.76      0.97      0.85        39
         1.0       0.98      0.84      0.91        75

 avg / total       0.91      0.89      0.89       114
# If the result has improved, the normalization was successful. In this case, precision has improved drastically with normalization (average precision went from 0.38 to 0.91).
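# OPTIONAL: a quick sketch to put a single number on the improvement using test accuracy.
# accuracy_score is from sklearn.metrics; y_predict here is the prediction from the model trained on scaled data above.
from sklearn.metrics import accuracy_score
print("Accuracy on scaled test data:", accuracy_score(y_test, y_predict))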