In [ ]:
# First import all dependencies
In [ ]:
 
In [1]:
# Imports: scikit-learn's bundled datasets, pandas, numpy, matplotlib
from sklearn import datasets

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Use the ggplot style for every matplotlib figure in this notebook
plt.style.use('ggplot')

# Load the built-in Iris dataset; returns a sklearn Bunch (dict-like object)
iris = datasets.load_iris()
In [20]:
# Show which attributes the iris Bunch object exposes
for key_name in iris.keys():
    print(key_name)
data
target
target_names
DESCR
feature_names
filename
In [60]:
# Peek at the first five rows of the feature matrix (one row per sample)
iris.data[:5]
Out[60]:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])
In [13]:
# Feature vector of the first sample
iris.data[0]
Out[13]:
array([5.1, 3.5, 1.4, 0.2])
In [22]:
# Names of the four measured features (all in centimetres)
iris.feature_names
Out[22]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
In [14]:
# Class label for every sample (0, 1, 2); samples are stored grouped by class
iris.target
Out[14]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [15]:
# Label of the first sample: 0, which maps to 'setosa' in target_names
iris.target[0]
Out[15]:
0
In [24]:
# The three species that the integer labels 0, 1, 2 stand for
for species in iris.target_names:
    print(species)
setosa
versicolor
virginica
In [21]:
# 150 samples x 4 features
iris.data.shape
Out[21]:
(150, 4)
In [ ]:
 
In [ ]:
 

Exploratory Data Analysis (EDA)

In [ ]:
 
In [25]:
# Feature matrix and label vector
# NOTE(review): sklearn convention names the feature matrix uppercase X;
# kept lowercase here because later cells reference `x`.
x = iris.data
y = iris.target
In [29]:
# Wrap the feature matrix in a DataFrame, labelling columns with the feature names
df = pd.DataFrame(data=x, columns=iris.feature_names)
In [30]:
# First five rows of the DataFrame (rich display, no print needed)
df.head()
Out[30]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [35]:
# Pairwise scatter plots of all four features, coloured by class label (c=y),
# diamond markers ('D'); assigning to sp suppresses the axes-array repr output
sp = pd.plotting.scatter_matrix(df, c=y, figsize= [10,10], s=150, marker = 'D')
In [ ]:
 
In [ ]:
 

Now use the KNeighborsClassifier estimator to classify the data (note: k-nearest neighbours is a supervised classification algorithm, not clustering)

In [ ]:
 
In [ ]:
 
In [36]:
from sklearn.neighbors import KNeighborsClassifier
In [37]:
# Fit a 5-nearest-neighbour classifier on the entire dataset
# (no train/test split yet — that comes later in the notebook)
knnModel = KNeighborsClassifier(n_neighbors=5)
knnModel.fit(x, y)
Out[37]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
In [44]:
# Four hand-made, unseen samples to classify; columns are
# (sepal length, sepal width, petal length, petal width) in cm
test_data = [
    [4.8, 3.0, 1.5, 0.2],
    [4.7, 2.9, 1.4, 0.2],
    [4.9, 3.1, 1.5, 0.2],
    [4.5, 3.0, 1.6, 0.2],
]
In [47]:
# Classify the four hand-made samples; all four come back as class 0 (setosa),
# consistent with their values being close to the setosa rows shown earlier
prediction = knnModel.predict(test_data)
prediction
Out[47]:
array([0, 0, 0, 0])
In [ ]:
 
In [ ]:
 
In [ ]:
 

Now we will split the whole dataset into 2 parts for training and testing

In [ ]:
 
In [ ]:
 
In [49]:
# NOTE(review): import is mid-notebook; ideally it belongs in the import cell at the top
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; random_state fixes the shuffle for
# reproducibility, stratify=y keeps class proportions equal in both splits
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=21, stratify=y)
In [58]:
# NOTE(review): the earlier comment claimed n_neighbors=3 gave the maximum
# score, yet the model below uses n_neighbors=7 — confirm which k is intended
# (e.g. via a small loop over k, or GridSearchCV) before relying on the score.
knnSplitTestModel = KNeighborsClassifier(n_neighbors=7)
knnSplitTestModel.fit(x_train, y_train)

# Predict classes for the held-out 30% test split; bare last expression
# displays the prediction array
splitPrediction = knnSplitTestModel.predict(x_test)
splitPrediction
Out[58]:
array([2, 1, 2, 2, 1, 0, 1, 0, 0, 1, 0, 2, 0, 2, 2, 0, 0, 0, 1, 0, 2, 2,
       2, 0, 1, 1, 1, 0, 0, 1, 2, 2, 0, 0, 1, 2, 2, 1, 1, 2, 1, 1, 0, 2,
       1])
In [59]:
# Mean accuracy of the k=7 model on the held-out test set
knnSplitTestModel.score(x_test, y_test)
Out[59]:
0.9555555555555556