ML | OPTICS Clustering Implementation Using Sklearn



This article demonstrates how to implement the OPTICS clustering method using Sklearn in Python. The dataset used for the demonstration is the Mall Customer Segmentation Data, which can be downloaded from Kaggle.

Step 1: Import required libraries

Step 2: Load data

# Third-party libraries used throughout the walkthrough
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn.preprocessing import normalize, StandardScaler

# Location of the Mall customer segmentation CSV downloaded from Kaggle.
# Change this path to match where the file is stored on your machine.
DATA_PATH = r"C:\Users\Dev\Desktop\Kaggle\Customer Segmentation\Mall_Customers.csv"

X = pd.read_csv(DATA_PATH)

# Removing irrelevant columns
drop_features = ["CustomerID", "Gender"]
X = X.drop(drop_features, axis=1)

# Processing missing values, if any: forward-fill, so each gap takes the
# value from the previous row. (`fillna(method="ffill")` is deprecated in
# recent pandas releases; `ffill()` is the direct replacement.)
X = X.ffill()

# Show the first rows as a quick sanity check
print(X.head())

Step 3: Data preprocessing

# Scale data to bring all attributes to a comparable level
# (StandardScaler centres each column and scales it to unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rescale every sample (row) to unit L2 norm.
# Note: this does NOT make the data Gaussian; it only changes the length
# of each sample vector, leaving its direction untouched.
X_normalized = normalize(X_scaled)

# Convert the numpy array back to a pandas DataFrame, restoring the
# original column names in the same step
X_normalized = pd.DataFrame(X_normalized, columns=X.columns)

# Show the first rows as a quick sanity check
print(X_normalized.head())

Step 4: Building the clustering model

# Build OPTICS clustering model
#   min_samples      - neighbourhood size for a point to count as a core point
#   xi               - minimum steepness on the reachability plot that marks
#                      a cluster boundary
#   min_cluster_size - minimum cluster size, given here as a fraction of
#                      the number of samples
optics_model = OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05)

# Model training
optics_model.fit(X_normalized)

Step 5: Storing the training results

# DBSCAN-style labels extracted from the fitted OPTICS model with eps = 0.5
labels1 = cluster_optics_dbscan(
    reachability=optics_model.reachability_,
    core_distances=optics_model.core_distances_,
    ordering=optics_model.ordering_,
    eps=0.5,
)

# The same extraction with eps = 2.0
labels2 = cluster_optics_dbscan(
    reachability=optics_model.reachability_,
    core_distances=optics_model.core_distances_,
    ordering=optics_model.ordering_,
    eps=2,
)

# Evenly spaced integer positions 0 .. n-1, one per sample
space = np.arange(len(X_normalized))

# Reachability distance of each point, arranged in OPTICS cluster order
reachability = optics_model.reachability_[optics_model.ordering_]

# Cluster label of each point, arranged in the same order
labels = optics_model.labels_[optics_model.ordering_]

print(labels)

Step 6: Visualizing the Results

# Lay out the figure on a 2 x 3 grid: one wide panel spanning the top row
# and three equally sized panels along the bottom row
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2, ax3, ax4 = (plt.subplot(G[1, column]) for column in range(3))

 
# Reachability plot: position in the cluster ordering on the x-axis,
# reachability distance on the y-axis, coloured by OPTICS cluster label.
# Only clusters 0-4 receive a colour; higher labels are not drawn.
colors = ["c.", "b.", "r.", "y.", "g."]
for cluster_id, color in enumerate(colors):
    in_cluster = labels == cluster_id
    ax1.plot(space[in_cluster], reachability[in_cluster], color, alpha=0.3)

# Noise points (label -1) are drawn in black
is_noise = labels == -1
ax1.plot(space[is_noise], reachability[is_noise], "k.", alpha=0.3)

# Horizontal reference lines at the two eps values used for the
# DBSCAN-style extractions (2.0 solid, 0.5 dash-dotted)
ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5)

ax1.set_ylabel("Reachability Distance")
ax1.set_title("Reachability Plot")

  
# OPTICS clusters: scatter of the first two feature columns, coloured by
# the labels OPTICS assigned. Only clusters 0-4 receive a colour.
colors = ["c.", "b.", "r.", "y.", "g."]
for cluster_id, color in enumerate(colors):
    Xk = X_normalized[optics_model.labels_ == cluster_id]
    ax2.plot(Xk.iloc[:, 0], Xk.iloc[:, 1], color, alpha=0.3)

# Noise points (label -1) are drawn as faint black crosses
is_noise = optics_model.labels_ == -1
ax2.plot(
    X_normalized.iloc[is_noise, 0],
    X_normalized.iloc[is_noise, 1],
    "k+",
    alpha=0.1,
)
ax2.set_title("OPTICS Clustering")

  
# DBSCAN-style clustering extracted with eps = 0.5: scatter of the first
# two feature columns, coloured by label. Only clusters 0-5 receive a colour.
colors = ["c", "b", "r", "y", "g", "greenyellow"]
for cluster_id, color in enumerate(colors):
    Xk = X_normalized[labels1 == cluster_id]
    # These entries are plain colour names, not "colour + marker" format
    # strings, so the line style must be disabled explicitly; otherwise
    # matplotlib joins consecutive points with line segments.
    ax3.plot(
        Xk.iloc[:, 0],
        Xk.iloc[:, 1],
        color=color,
        marker=".",
        linestyle="none",
        alpha=0.3,
    )

# Noise points (label -1) are drawn as faint black crosses
is_noise = labels1 == -1
ax3.plot(
    X_normalized.iloc[is_noise, 0],
    X_normalized.iloc[is_noise, 1],
    "k+",
    alpha=0.1,
)
ax3.set_title("DBSCAN clustering with eps = 0.5")

 
# DBSCAN-style clustering extracted with eps = 2.0: scatter of the first
# two feature columns, coloured by label. Only clusters 0-3 receive a colour.
colors = ["c.", "y.", "m.", "g."]
for cluster_id, color in enumerate(colors):
    Xk = X_normalized.iloc[labels2 == cluster_id]
    ax4.plot(Xk.iloc[:, 0], Xk.iloc[:, 1], color, alpha=0.3)

# Noise points (label -1) are drawn as faint black crosses
is_noise = labels2 == -1
ax4.plot(
    X_normalized.iloc[is_noise, 0],
    X_normalized.iloc[is_noise, 1],
    "k+",
    alpha=0.1,
)
ax4.set_title("DBSCAN Clustering with eps = 2.0")

# Tidy the spacing between panels and display the finished figure
plt.tight_layout()
plt.show()