Implementation of the DBSCAN algorithm using Sklearn

Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a clustering algorithm that was proposed in 1996. In 2014, the algorithm received the “Test of Time” award at KDD, the leading data mining conference.

Dataset — Credit Card (CC_GENERAL.csv).

Step 1: Import Required Libraries

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

 

from sklearn.cluster import DBSCAN

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import normalize

from sklearn.decomposition import PCA

Step 2: Load data

# Load the credit card dataset.
# NOTE: '..input_path' is a placeholder; replace it with the directory that
# actually contains CC_GENERAL.csv.
X = pd.read_csv("..input_path/CC_GENERAL.csv")

# Remove the CUST_ID column from the data
X = X.drop("CUST_ID", axis=1)

# Handle missing values by propagating the last valid observation forward
# (DataFrame.ffill replaces the deprecated fillna(method="ffill"))
X.ffill(inplace=True)

print(X.head())

Step 3: Data preprocessing

# Standardize the attributes so that they are all on a comparable scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rescale each sample (row) to unit norm, then wrap the resulting
# numpy array in a pandas DataFrame
X_normalized = pd.DataFrame(normalize(X_scaled))

Step 4: Reducing the dimensionality of the data so that it can be visualized

# Project the normalized data onto two principal components so that the
# clusters can be drawn on a 2-D scatter plot
pca = PCA(n_components=2)
X_principal = pca.fit_transform(X_normalized)

# Wrap the projection in a DataFrame; the column names must be exactly
# "P1" and "P2" because the plotting code looks them up by name
X_principal = pd.DataFrame(X_principal)
X_principal.columns = ["P1", "P2"]

print(X_principal.head())

Step 5: Building the clustering model

# Fit DBSCAN on the two principal components
db_default = DBSCAN(eps=0.0375, min_samples=3)
db_default.fit(X_principal)

# Numpy array holding the cluster label assigned to each data point
labels = db_default.labels_

Step 6: Visualize clustering

# Map each cluster label to a matplotlib color code
# (label -1 is drawn in black)
colors = {0: "r", 1: "g", 2: "b", -1: "k"}

# Build a color vector with one entry per data point.
# NOTE: this raises KeyError if the model produced a label that has no
# entry in `colors`; extend the mapping if more clusters are found.
cvec = [colors[label] for label in labels]

# Create the figure first so that the data points and the legend handles
# are all drawn on the same axes
plt.figure(figsize=(9, 9))

# Plot P1 on the X axis and P2 on the Y axis, colored by cluster label
plt.scatter(X_principal["P1"], X_principal["P2"], c=cvec)

# Empty scatter plots serve as legend handles: they carry the color of
# each label without drawing any additional points over the data
r = plt.scatter([], [], color="r")
g = plt.scatter([], [], color="g")
b = plt.scatter([], [], color="b")
k = plt.scatter([], [], color="k")

# Build the legend
plt.legend((r, g, b, k), ("Label 0", "Label 1", "Label 2", "Label -1"))

plt.show()

Step 7: Tuning the model parameters

# Refit DBSCAN with the same eps but a much larger min_samples (50 instead
# of 3) to see how the clustering changes
db = DBSCAN(eps=0.0375, min_samples=50).fit(X_principal)

# Cluster label assigned to each data point by the re-tuned model
labels1 = db.labels_

Step 8: Visualize the changes

# Map each cluster label of the re-tuned model to a matplotlib color code
# (label -1 is drawn in black)
colors1 = {0: "r", 1: "g", 2: "b", 3: "c", 4: "y", 5: "m", -1: "k"}

# Color every point by its label from the re-tuned model; this must use
# labels1, not the labels of the first model.
# NOTE: this raises KeyError if the model produced a label that has no
# entry in `colors1`; extend the mapping if more clusters are found.
cvec = [colors1[label] for label in labels1]

# Colors of the legend entries, in the same order as the legend labels
colors = ["r", "g", "b", "c", "y", "m", "k"]

# Create the figure first so that the data points and the legend handles
# are all drawn on the same axes
plt.figure(figsize=(9, 9))

# Plot P1 on the X axis and P2 on the Y axis, colored by cluster label
plt.scatter(X_principal["P1"], X_principal["P2"], c=cvec)

# Empty scatter plots serve as legend handles: they carry the color of
# each label without drawing any additional points over the data
r = plt.scatter([], [], marker="o", color=colors[0])
g = plt.scatter([], [], marker="o", color=colors[1])
b = plt.scatter([], [], marker="o", color=colors[2])
c = plt.scatter([], [], marker="o", color=colors[3])
y = plt.scatter([], [], marker="o", color=colors[4])
m = plt.scatter([], [], marker="o", color=colors[5])
k = plt.scatter([], [], marker="o", color=colors[6])

# Build the legend
plt.legend(
    (r, g, b, c, y, m, k),
    (
        "Label 0",
        "Label 1",
        "Label 2",
        "Label 3",
        "Label 4",
        "Label 5",
        "Label -1",
    ),
    scatterpoints=1,
    loc="upper left",
    ncol=3,
    fontsize=8,
)

plt.show()