In [ ]:
# First import all dependencies
In [ ]:
 
In [1]:
# Imports: scikit-learn's bundled datasets, pandas, numpy, matplotlib
from sklearn import datasets

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Use the ggplot style for every matplotlib figure in this notebook
plt.style.use('ggplot')

# Load the built-in Iris dataset; returns a sklearn Bunch (dict-like object)
iris = datasets.load_iris()
In [20]:
# Show which attributes the iris Bunch object exposes
for key_name in iris.keys():
    print(key_name)
data
target
target_names
DESCR
feature_names
filename
In [60]:
# Peek at the first five rows of the feature matrix (one row per sample)
iris.data[:5]
Out[60]:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])
In [13]:
# Feature vector of the first sample
iris.data[0]
Out[13]:
array([5.1, 3.5, 1.4, 0.2])
In [22]:
# Names of the four measured features (all in centimetres)
iris.feature_names
Out[22]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
In [14]:
# Class label for every sample (0, 1, 2); samples are stored grouped by class
iris.target
Out[14]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [15]:
# Label of the first sample: 0, which maps to 'setosa' in target_names
iris.target[0]
Out[15]:
0
In [24]:
# The three species that the integer labels 0, 1, 2 stand for
for species in iris.target_names:
    print(species)
setosa
versicolor
virginica
In [21]:
# 150 samples x 4 features
iris.data.shape
Out[21]:
(150, 4)
In [ ]:
 
In [ ]:
 

Exploratory Data Analysis (EDA)

In [ ]:
 
In [25]:
# Feature matrix and label vector
# NOTE(review): sklearn convention names the feature matrix uppercase X;
# kept lowercase here because later cells reference `x`.
x = iris.data
y = iris.target
In [29]:
# Wrap the feature matrix in a DataFrame, labelling columns with the feature names
df = pd.DataFrame(data=x, columns=iris.feature_names)
In [30]:
# First five rows of the DataFrame (rich display, no print needed)
df.head()
Out[30]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [35]:
# Pairwise scatter plots of all four features, coloured by class label (c=y),
# diamond markers ('D'); assigning to sp suppresses the axes-array repr output
sp = pd.plotting.scatter_matrix(df, c=y, figsize= [10,10], s=150, marker = 'D')
In [ ]:
 
In [ ]:
 

Now use the KNeighborsClassifier estimator to classify the data (note: k-nearest neighbours is a supervised classification algorithm, not clustering)

In [ ]:
 
In [ ]:
 
In [36]:
from sklearn.neighbors import KNeighborsClassifier
In [37]:
# Fit a 5-nearest-neighbour classifier on the entire dataset
# (no train/test split yet — that comes later in the notebook)
knnModel = KNeighborsClassifier(n_neighbors=5)
knnModel.fit(x, y)
Out[37]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
In [44]:
# Four hand-made, unseen samples to classify; columns are
# (sepal length, sepal width, petal length, petal width) in cm
test_data = [
    [4.8, 3.0, 1.5, 0.2],
    [4.7, 2.9, 1.4, 0.2],
    [4.9, 3.1, 1.5, 0.2],
    [4.5, 3.0, 1.6, 0.2],
]
In [47]:
# Classify the four hand-made samples; all four come back as class 0 (setosa),
# consistent with their values being close to the setosa rows shown earlier
prediction = knnModel.predict(test_data)
prediction
Out[47]:
array([0, 0, 0, 0])
In [ ]:
 
In [ ]:
 
In [ ]:
 

Now we will split the whole dataset into 2 parts for training and testing

In [ ]:
 
In [ ]:
 
In [49]:
# NOTE(review): import is mid-notebook; ideally it belongs in the import cell at the top
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; random_state fixes the shuffle for
# reproducibility, stratify=y keeps class proportions equal in both splits
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=21, stratify=y)
In [58]:
# NOTE(review): the earlier comment claimed n_neighbors=3 gave the maximum
# score, yet the model below uses n_neighbors=7 — confirm which k is intended
# (e.g. via a small loop over k, or GridSearchCV) before relying on the score.
knnSplitTestModel = KNeighborsClassifier(n_neighbors=7)
knnSplitTestModel.fit(x_train, y_train)

# Predict classes for the held-out 30% test split; bare last expression
# displays the prediction array
splitPrediction = knnSplitTestModel.predict(x_test)
splitPrediction
Out[58]:
array([2, 1, 2, 2, 1, 0, 1, 0, 0, 1, 0, 2, 0, 2, 2, 0, 0, 0, 1, 0, 2, 2,
       2, 0, 1, 1, 1, 0, 0, 1, 2, 2, 0, 0, 1, 2, 2, 1, 1, 2, 1, 1, 0, 2,
       1])
In [59]:
# Mean accuracy of the k=7 model on the held-out test set
knnSplitTestModel.score(x_test, y_test)
Out[59]:
0.9555555555555556