ML | Diagnosing Breast Cancer in Wisconsin Using Logistic Regression

Logistic regression is used to predict whether a given patient's tumor is malignant or benign, based on the attributes in the Breast Cancer Wisconsin dataset.
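The model computes a weighted sum of the input features and squashes it through the sigmoid function to obtain a probability of malignancy, which is then thresholded at 0.5. A minimal sketch of this idea, using made-up weights and features rather than the real dataset:

# illustration only: hypothetical weights, bias and feature vector
import numpy as np

w = np.array([0.4, -0.2, 0.1])   # hypothetical weights
b = -0.05                        # hypothetical bias
x = np.array([1.2, 0.7, 2.3])    # hypothetical feature vector

probability = 1 / (1 + np.exp(-(np.dot(w, x) + b)))   # sigmoid(w.x + b)
prediction = 1 if probability > 0.5 else 0            # 1 = malignant, 0 = benign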

Code: Load Libraries

# linear algebra
import numpy as np

# data processing
import pandas as pd

# data visualization
import matplotlib.pyplot as plt

Code: Load dataset

data = pd.read_csv("../breast-cancer-wisconsin-data/data.csv")

 

print(data.head())

Output:

Code: Dataset information

data.info()

Output:

RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 non-null float64
fractal_dimension_se       569 non-null float64
radius_worst               569 non-null float64
texture_worst              569 non-null float64
perimeter_worst            569 non-null float64
area_worst                 569 non-null float64
smoothness_worst           569 non-null float64
compactness_worst          569 non-null float64
concavity_worst            569 non-null float64
concave points_worst       569 non-null float64
symmetry_worst             569 non-null float64
fractal_dimension_worst    569 non-null float64
Unnamed: 32                0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB

Code: We drop the columns "id" and "Unnamed: 32", as they play no role in prediction, and encode the diagnosis as 1 for malignant (M) and 0 for benign (B).

data.drop(["Unnamed: 32", "id"], axis=1, inplace=True)

data.diagnosis = [1 if each == "M" else 0 for each in data.diagnosis]
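A quick optional check of the encoding; the counts below assume the standard 569-sample version of the dataset (357 benign, 212 malignant).

# optional sanity check: should show 357 zeros (benign) and 212 ones (malignant)
print(data.diagnosis.value_counts())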

Code: Separate the target (y) from the input features (x_data).

y = data.diagnosis.values

x_data = data.drop(["diagnosis"], axis=1)

Code: Normalization (min-max scaling)

x = ((x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data))).values
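After min-max scaling, every feature lies in the range [0, 1]; an optional check:

# optional check: all scaled values should lie between 0 and 1
print(x.min(), x.max())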

Code: Separate data for training and testing.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=42)

  

# transpose the feature matrices so that each column holds one sample
x_train = x_train.T
x_test = x_test.T
y_train = y_train.T
y_test = y_test.T

 

print("x train:", x_train.shape)
print("x test:", x_test.shape)
print("y train:", y_train.shape)
print("y test:", y_test.shape)
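Assuming the standard 569-row dataset and the 15% test split above, the prints should report shapes along these lines:

x train: (30, 483)
x test: (30, 86)
y train: (483,)
y test: (86,)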

Code: Initialize the weights and the bias

def initialize_weights_and_bias(dimension):
    # start every weight at a small constant value and the bias at zero
    w = np.full((dimension, 1), 0.01)
    b = 0.0
    return w, b

Code: Sigmoid function, which maps the linear score z to a probability y_head.

# z = np.dot(w.T, x_train) + b
def sigmoid(z):
    y_head = 1 / (1 + np.exp(-z))
    return y_head
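In this model the probability of malignancy is sigmoid(np.dot(w.T, x) + b). A quick check of the function: sigmoid(0) is exactly 0.5, and large positive or negative inputs saturate towards 1 and 0.

print(sigmoid(0))                  # 0.5
print(sigmoid(10), sigmoid(-10))   # approximately 0.99995 and 0.00005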

Code: Forward and backward propagation

def forward_backward_propagation(w, b, x_train, y_train):
    # forward propagation
    z = np.dot(w.T, x_train) + b
    y_head = sigmoid(z)
    loss = -y_train * np.log(y_head) - (1 - y_train) * np.log(1 - y_head)
    # divide by x_train.shape[1] (the number of samples) to scale
    cost = np.sum(loss) / x_train.shape[1]

    # backward propagation
    derivative_weight = np.dot(x_train, (y_head - y_train).T) / x_train.shape[1]
    derivative_bias = np.sum(y_head - y_train) / x_train.shape[1]
    gradients = {"derivative_weight": derivative_weight,
                 "derivative_bias": derivative_bias}
    return cost, gradients
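A small check with made-up data can confirm the shapes: for 3 features and 4 samples, the returned cost is a scalar and the weight gradient has shape (3, 1), matching w.

# toy check with made-up data: 3 features, 4 samples
w_toy, b_toy = initialize_weights_and_bias(3)
x_toy = np.random.rand(3, 4)
y_toy = np.array([0, 1, 1, 0])
cost_toy, gradients_toy = forward_backward_propagation(w_toy, b_toy, x_toy, y_toy)
print(cost_toy)                                   # a single scalar cost
print(gradients_toy["derivative_weight"].shape)   # (3, 1)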

Code: Update (learn) the parameters

def update(w, b, x_train, y_train, learning_rate, number_of_iteration):
    cost_list = []
    cost_list2 = []
    index = []

    # update (learn) the parameters number_of_iteration times
    for i in range(number_of_iteration):
        # do forward and backward propagation to find the cost and gradients
        cost, gradients = forward_backward_propagation(w, b, x_train, y_train)
        cost_list.append(cost)

        # let's update
        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]
        if i % 10 == 0:
            cost_list2.append(cost)
            index.append(i)
            print("Cost after iteration %i: %f" % (i, cost))

    # the learned weights and bias
    parameters = {"weight": w, "bias": b}
    plt.plot(index, cost_list2)
    plt.xticks(index, rotation="vertical")
    plt.xlabel("Number of Iteration")
    plt.ylabel("Cost")
    plt.show()
    return parameters, gradients, cost_list

Code: Predictions

def predict(w, b, x_test):
    # x_test is the input for forward propagation
    z = sigmoid(np.dot(w.T, x_test) + b)
    Y_prediction = np.zeros((1, x_test.shape[1]))

    # if the probability z is greater than 0.5, predict class one (y_head = 1),
    # otherwise predict class zero (y_head = 0)
    for i in range(z.shape[1]):
        if z[0, i] <= 0.5:
            Y_prediction[0, i] = 0
        else:
            Y_prediction[0, i] = 1

    return Y_prediction
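A minimal usage sketch with made-up weights and inputs (not the real dataset); the predictions come back as a (1, n_samples) array of 0/1 labels.

# toy usage with made-up weights and features
w_toy = np.full((3, 1), 0.01)
b_toy = 0.0
x_toy = np.random.rand(3, 5)
print(predict(w_toy, b_toy, x_toy))   # e.g. [[1. 1. 1. 1. 1.]]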

Code: logistic regression

def logistic_regression(x_train, y_train, x_test, y_test,
                        learning_rate, num_iterations):

    dimension = x_train.shape[0]
    w, b = initialize_weights_and_bias(dimension)

    parameters, gradients, cost_list = update(
        w, b, x_train, y_train, learning_rate, num_iterations)

    y_prediction_test = predict(
        parameters["weight"], parameters["bias"], x_test)
    y_prediction_train = predict(
        parameters["weight"], parameters["bias"], x_train)

    # train / test errors
    print("train accuracy: {}%".format(
        100 - np.mean(np.abs(y_prediction_train - y_train)) * 100))
    print("test accuracy: {}%".format(
        100 - np.mean(np.abs(y_prediction_test - y_test)) * 100))

  
logistic_regression(x_train, y_train, x_test,
                    y_test, learning_rate=1, num_iterations=150)

Output:

Cost after iteration 0: 0.692836
Cost after iteration 10: 0.498576
Cost after iteration 20: 0.404996
Cost after iteration 30: 0.350059
Cost after iteration 40: 0.313747
Cost after iteration 50: 0.287767
Cost after iteration 60: 0.268114
Cost after iteration 70: 0.252627
Cost after iteration 80: 0.240036
Cost after iteration 90: 0.229543
Cost after iteration 100: 0.220624
Cost after iteration 110: 0.212920
Cost after iteration 120: 0.206175
Cost after iteration 130: 0.200201
Cost after iteration 140: 0.194860

Output:

train accuracy: 95.23809523809524%
test accuracy: 94.18604651162791%

Code: Check the results with sklearn's linear_model.LogisticRegression

from sklearn import linear_model

logreg = linear_model.LogisticRegression(random_state=42, max_iter=150)

print("test accuracy: {}".format(
    logreg.fit(x_train.T, y_train.T).score(x_test.T, y_test.T)))
print("train accuracy: {}".format(
    logreg.fit(x_train.T, y_train.T).score(x_train.T, y_train.T)))

Output:

test accuracy: 0.9651162790697675
train accuracy: 0.9668737060041408