Real Data Analysis

FASHION MNIST with Python (DAY 5) - knn

딥스탯 2018. 8. 19. 19:55
FASHION_MNIST_DAY5_with_Python

FASHION MNIST with Python (DAY 5)

DATA SOURCE : https://www.kaggle.com/zalando-research/fashionmnist (Kaggle, Fashion MNIST)

FASHION MNIST with Python (DAY 1) : http://deepstat.tistory.com/35

FASHION MNIST with Python (DAY 2) : http://deepstat.tistory.com/36

FASHION MNIST with Python (DAY 3) : http://deepstat.tistory.com/37

FASHION MNIST with Python (DAY 4) : http://deepstat.tistory.com/38

Datasets

Importing numpy, pandas, pyplot

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Loading datasets

In [2]:
# Read the Kaggle Fashion-MNIST CSV files from the sibling "datasets" folder.
# Forward slashes are used so the relative paths resolve on Windows, Linux and
# macOS alike (the previous "..\\datasets\\..." form only worked on Windows).
data_train = pd.read_csv("../datasets/fashion-mnist_train.csv")
data_test = pd.read_csv("../datasets/fashion-mnist_test.csv")
In [3]:
# Pull the target column ("label") out of each frame.
data_train_y = data_train["label"]
y_test = data_test["label"]
In [4]:
# Feature matrices: every column except "label", divided by 256.
# NOTE(review): presumably the pixel columns are 0-255, in which case the
# scaled maximum is just under 1 rather than exactly 1 - confirm this is intended.
data_train_x = data_train.drop(columns="label") / 256
x_test = data_test.drop(columns="label") / 256

Splitting into training and validation sets

In [5]:
# Partition the row positions 0..59999 into two validation sets of 10000 rows
# each and a training set made of the remaining positions.
# NOTE(review): 60000 and 10000 are hard-coded; presumably 60000 == len(data_train)
# - confirm, or derive it from the frame.
np.random.seed(0)  # fix the global NumPy RNG so the split is repeatable
# Second validation set: 10000 distinct positions drawn from range(60000).
valid2_idx = np.random.choice(60000,10000,replace = False)
# First validation set: 10000 distinct positions drawn from those not in valid2_idx.
valid1_idx = np.random.choice(list(set(range(60000)) - set(valid2_idx)),10000,replace=False)
# Training positions: everything not in either validation set.
# The list order follows set iteration order, not an explicit sort.
train_idx = list(set(range(60000))-set(valid1_idx)-set(valid2_idx))

# Positional (iloc) selection of features and labels for each subset.
x_train = data_train_x.iloc[train_idx,:]
y_train = data_train_y.iloc[train_idx]

x_valid1 = data_train_x.iloc[valid1_idx,:]
y_valid1 = data_train_y.iloc[valid1_idx]

# The second validation set is created here but not used in the cells shown below.
x_valid2 = data_train_x.iloc[valid2_idx,:]
y_valid2 = data_train_y.iloc[valid2_idx]

K-Nearest Neighbors

Importing KNeighborsClassifier

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

Fitting KNN with k=1

In [7]:
# Fit a 1-nearest-neighbour classifier on the training split.
KNN_model_type_1 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [8]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_1.predict(x_train)).T
Out[8]:
array([[3994,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, 3990,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0, 4056,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 3929,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 4016,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0, 3932,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 4005,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 4103,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0, 3946,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 4029]],
      dtype=int64)
In [9]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_1_train_acc = y_train.eq(KNN_model_type_1.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_1_train_acc)
TRAINING ACCURACY = 1.0

Validation Accuracy

In [10]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_1.predict(x_valid1)).T
Out[10]:
array([[ 829,    6,   10,   40,    7,    0,  144,    0,    9,    0],
       [   3, 1007,    0,    8,    1,    0,    1,    0,    0,    0],
       [  18,    1,  730,   12,  140,    0,  117,    0,   11,    0],
       [  22,    7,   11,  863,   40,    0,   18,    0,    6,    0],
       [   4,    0,   98,   40,  710,    0,   77,    0,    6,    0],
       [   0,    0,    0,    0,    0,  896,    0,    3,    0,    7],
       [ 134,    4,   96,   45,   93,    1,  622,    0,    8,    1],
       [   0,    0,    0,    0,    0,   80,    0,  892,    4,   39],
       [   5,    0,    0,    4,    4,    3,    8,    2,  987,    1],
       [   0,    1,    0,    0,    0,   80,    0,   51,    3,  930]],
      dtype=int64)
In [11]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_1_valid1_acc = y_valid1.eq(KNN_model_type_1.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_1_valid1_acc)
VALIDATION ACCURACY = 0.8466
In [12]:
# Training vs. validation accuracy for k=1, shown side by side.
dict(TRAIN_ACC=KNN_model_type_1_train_acc, VALID_ACC=KNN_model_type_1_valid1_acc)
Out[12]:
{'TRAIN_ACC': 1.0, 'VALID_ACC': 0.8466}

Fitting KNN with k=2

In [13]:
# Fit a 2-nearest-neighbour classifier on the training split.
KNN_model_type_2 = (
    KNeighborsClassifier(n_neighbors=2)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [14]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
# NOTE(review): the recorded matrix is upper-triangular (predicted label <= true
# label) - presumably each training point is its own nearest neighbour and a
# 1-1 vote tie resolves to the smaller label; confirm against scikit-learn's
# tie-breaking rule.
confusion_matrix(y_train, KNN_model_type_2.predict(x_train)).T
Out[14]:
array([[3994,   10,   71,  199,   17,    3,  640,    0,   25,    1],
       [   0, 3980,    3,   43,    5,    0,    4,    0,    2,    0],
       [   0,    0, 3982,   52,  501,    0,  470,    0,   38,    0],
       [   0,    0,    0, 3635,  134,    2,   99,    0,   26,    0],
       [   0,    0,    0,    0, 3359,    0,  301,    0,   19,    0],
       [   0,    0,    0,    0,    0, 3927,    0,   18,    3,   11],
       [   0,    0,    0,    0,    0,    0, 2491,    1,   49,    1],
       [   0,    0,    0,    0,    0,    0,    0, 4084,   13,  155],
       [   0,    0,    0,    0,    0,    0,    0,    0, 3771,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 3861]],
      dtype=int64)
In [15]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_2_train_acc = y_train.eq(KNN_model_type_2.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_2_train_acc)
TRAINING ACCURACY = 0.9271

Validation Accuracy

In [16]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_2.predict(x_valid1)).T
Out[16]:
array([[ 922,   11,   28,   65,   11,    2,  223,    0,   15,    0],
       [   4, 1006,    0,   17,    2,    0,    5,    0,    0,    0],
       [  22,    2,  803,   19,  224,    3,  184,    0,   23,    0],
       [  17,    5,   13,  865,   60,    0,   28,    1,    6,    1],
       [   2,    0,   71,   22,  656,    0,   94,    0,    7,    0],
       [   0,    0,    0,    0,    0,  951,    0,    8,    1,   11],
       [  44,    1,   30,   21,   40,    1,  448,    0,   20,    1],
       [   0,    0,    0,    0,    0,   54,    0,  910,    8,   58],
       [   4,    0,    0,    3,    2,    2,    5,    0,  952,    1],
       [   0,    1,    0,    0,    0,   47,    0,   29,    2,  906]],
      dtype=int64)
In [17]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_2_valid1_acc = y_valid1.eq(KNN_model_type_2.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_2_valid1_acc)
VALIDATION ACCURACY = 0.8419
In [18]:
# Training vs. validation accuracy for k=2, shown side by side.
dict(TRAIN_ACC=KNN_model_type_2_train_acc, VALID_ACC=KNN_model_type_2_valid1_acc)
Out[18]:
{'TRAIN_ACC': 0.9271, 'VALID_ACC': 0.8419}

Fitting KNN with k=3

In [19]:
# Fit a 3-nearest-neighbour classifier on the training split.
KNN_model_type_3 = (
    KNeighborsClassifier(n_neighbors=3)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [20]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_3.predict(x_train)).T
Out[20]:
array([[3726,   18,   72,  158,   23,    4,  444,    0,   28,    1],
       [   2, 3918,    2,   15,    5,    0,    4,    0,    3,    0],
       [  28,    6, 3619,   39,  344,    0,  376,    0,   44,    0],
       [  38,   36,    9, 3585,   74,    2,   59,    0,   13,    0],
       [   8,    3,  195,   86, 3391,    1,  139,    0,   16,    0],
       [   0,    0,    0,    0,    0, 3587,    0,   10,    2,    8],
       [ 171,    8,  154,   45,  176,    6, 2969,    0,   13,    0],
       [   1,    0,    1,    0,    0,  183,    0, 3992,   11,   65],
       [  20,    1,    4,    1,    3,    7,   13,    1, 3813,    0],
       [   0,    0,    0,    0,    0,  142,    1,  100,    3, 3955]],
      dtype=int64)
In [21]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_3_train_acc = y_train.eq(KNN_model_type_3.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_3_train_acc)
TRAINING ACCURACY = 0.913875

Validation Accuracy

In [22]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_3.predict(x_valid1)).T
Out[22]:
array([[ 873,   11,   26,   56,   12,    3,  176,    0,   13,    0],
       [   3, 1003,    1,    8,    3,    0,    4,    0,    2,    0],
       [  18,    1,  754,   12,  154,    2,  148,    0,   21,    0],
       [  17,    8,   11,  861,   40,    0,   15,    1,    7,    1],
       [   2,    0,   82,   43,  701,    0,   56,    0,    6,    0],
       [   0,    0,    0,    0,    0,  882,    0,    1,    0,    4],
       [  98,    2,   71,   29,   81,    2,  580,    0,    9,    1],
       [   0,    0,    0,    0,    0,   90,    0,  901,    6,   32],
       [   4,    0,    0,    3,    4,    3,    8,    0,  967,    1],
       [   0,    1,    0,    0,    0,   78,    0,   45,    3,  939]],
      dtype=int64)
In [23]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_3_valid1_acc = y_valid1.eq(KNN_model_type_3.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_3_valid1_acc)
VALIDATION ACCURACY = 0.8461
In [24]:
# Training vs. validation accuracy for k=3, shown side by side.
dict(TRAIN_ACC=KNN_model_type_3_train_acc, VALID_ACC=KNN_model_type_3_valid1_acc)
Out[24]:
{'TRAIN_ACC': 0.913875, 'VALID_ACC': 0.8461}

Fitting KNN with k=5

In [25]:
# Fit a 5-nearest-neighbour classifier on the training split.
KNN_model_type_4 = (
    KNeighborsClassifier(n_neighbors=5)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [26]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_4.predict(x_train)).T
Out[26]:
array([[3650,   14,   54,  145,   15,    1,  562,    0,   19,    1],
       [   2, 3890,    2,   17,    3,    0,    2,    0,    0,    0],
       [  46,   17, 3504,   35,  326,    2,  436,    0,   51,    1],
       [  62,   48,   24, 3554,   87,    2,   60,    0,   20,    0],
       [  16,    5,  272,  109, 3336,    0,  243,    0,   21,    0],
       [   0,    0,    0,    0,    0, 3435,    0,    9,    2,    3],
       [ 193,   15,  194,   64,  245,   15, 2685,    1,   36,    2],
       [   1,    0,    1,    0,    0,  276,    0, 3953,   12,   97],
       [  24,    1,    5,    5,    4,    8,   16,    0, 3782,    0],
       [   0,    0,    0,    0,    0,  193,    1,  140,    3, 3925]],
      dtype=int64)
In [27]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_4_train_acc = y_train.eq(KNN_model_type_4.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_4_train_acc)
TRAINING ACCURACY = 0.89285

Validation Accuracy

In [28]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_4.predict(x_valid1)).T
Out[28]:
array([[ 886,    7,   15,   58,    7,    2,  181,    0,    9,    0],
       [   3, 1000,    1,    4,    3,    0,    3,    0,    0,    0],
       [  17,    4,  772,   12,  124,    1,  147,    0,   22,    0],
       [  18,   11,   13,  872,   36,    0,   18,    0,    8,    1],
       [   3,    0,   92,   41,  732,    0,   67,    0,   10,    0],
       [   0,    0,    0,    0,    0,  868,    0,    1,    1,    1],
       [  82,    3,   52,   22,   88,    5,  562,    0,    7,    1],
       [   0,    0,    0,    0,    0,  102,    0,  891,    5,   32],
       [   6,    0,    0,    3,    5,    4,    9,    1,  971,    1],
       [   0,    1,    0,    0,    0,   78,    0,   55,    1,  942]],
      dtype=int64)
In [29]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_4_valid1_acc = y_valid1.eq(KNN_model_type_4.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_4_valid1_acc)
VALIDATION ACCURACY = 0.8496
In [30]:
# Training vs. validation accuracy for k=5, shown side by side.
dict(TRAIN_ACC=KNN_model_type_4_train_acc, VALID_ACC=KNN_model_type_4_valid1_acc)
Out[30]:
{'TRAIN_ACC': 0.89285, 'VALID_ACC': 0.8496}

Fitting KNN with k=8

In [31]:
# Fit an 8-nearest-neighbour classifier on the training split.
KNN_model_type_5 = (
    KNeighborsClassifier(n_neighbors=8)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [32]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_5.predict(x_train)).T
Out[32]:
array([[3628,   14,   55,  149,   22,    2,  713,    0,   15,    0],
       [   2, 3870,    3,   22,    4,    0,    3,    0,    0,    0],
       [  62,   19, 3455,   38,  383,    2,  458,    0,   64,    1],
       [  64,   60,   22, 3527,   99,    1,   70,    0,   17,    1],
       [  16,    6,  299,  108, 3217,    0,  273,    0,   28,    0],
       [   0,    0,    0,    0,    0, 3353,    0,   10,    2,    3],
       [ 191,   19,  215,   81,  285,   23, 2466,    1,   44,    4],
       [   1,    0,    1,    0,    0,  321,    1, 3969,   20,  128],
       [  30,    2,    6,    3,    6,   12,   20,    0, 3752,    0],
       [   0,    0,    0,    1,    0,  218,    1,  123,    4, 3892]],
      dtype=int64)
In [33]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_5_train_acc = y_train.eq(KNN_model_type_5.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_5_train_acc)
TRAINING ACCURACY = 0.878225

Validation Accuracy

In [34]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_5.predict(x_valid1)).T
Out[34]:
array([[907,  10,  13,  52,   8,   1, 195,   0,   6,   0],
       [  1, 996,   0,   7,   3,   0,   3,   0,   0,   0],
       [ 13,   5, 771,  10, 120,   0, 149,   0,  22,   0],
       [ 21,   9,  12, 890,  37,   1,  17,   0,   7,   1],
       [  5,   0,  88,  28, 729,   0,  64,   0,  12,   0],
       [  0,   0,   0,   0,   0, 865,   0,   0,   1,   0],
       [ 62,   5,  57,  22,  93,   6, 547,   1,   8,   1],
       [  0,   0,   0,   0,   0, 106,   0, 905,   6,  37],
       [  6,   0,   4,   3,   5,   4,  12,   0, 971,   1],
       [  0,   1,   0,   0,   0,  77,   0,  42,   1, 938]], dtype=int64)
In [35]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_5_valid1_acc = y_valid1.eq(KNN_model_type_5.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_5_valid1_acc)
VALIDATION ACCURACY = 0.8519
In [36]:
# Training vs. validation accuracy for k=8, shown side by side.
dict(TRAIN_ACC=KNN_model_type_5_train_acc, VALID_ACC=KNN_model_type_5_valid1_acc)
Out[36]:
{'TRAIN_ACC': 0.878225, 'VALID_ACC': 0.8519}

Fitting KNN with k=12

In [37]:
# Fit a 12-nearest-neighbour classifier on the training split.
KNN_model_type_6 = (
    KNeighborsClassifier(n_neighbors=12)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [38]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_6.predict(x_train)).T
Out[38]:
array([[3601,   20,   57,  150,   15,    1,  736,    0,   14,    1],
       [   1, 3841,    3,   24,    5,    0,    3,    0,    0,    0],
       [  62,   28, 3361,   37,  369,    1,  475,    0,   65,    1],
       [  76,   70,   27, 3484,  101,    3,   78,    0,   22,    1],
       [  23,    9,  322,  139, 3191,    0,  298,    0,   33,    0],
       [   0,    0,    0,    0,    0, 3244,    0,   11,    1,    4],
       [ 196,   20,  277,   91,  325,   27, 2386,    1,   34,    2],
       [   1,    0,    1,    0,    0,  389,    1, 3927,   24,  129],
       [  34,    2,    8,    4,   10,   14,   27,    0, 3749,    0],
       [   0,    0,    0,    0,    0,  253,    1,  164,    4, 3891]],
      dtype=int64)
In [39]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_6_train_acc = y_train.eq(KNN_model_type_6.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_6_train_acc)
TRAINING ACCURACY = 0.866875

Validation Accuracy

In [40]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_6.predict(x_valid1)).T
Out[40]:
array([[891,   8,   9,  42,   6,   0, 197,   0,   4,   0],
       [  1, 992,   0,   8,   2,   0,   2,   0,   0,   0],
       [ 18,   7, 754,   9, 120,   0, 148,   0,  16,   0],
       [ 25,  12,  14, 893,  38,   1,  19,   0,   9,   0],
       [  5,   1,  96,  30, 717,   0,  62,   0,  10,   0],
       [  0,   0,   0,   0,   0, 851,   0,   1,   1,   1],
       [ 69,   5,  68,  27, 107,  10, 547,   1,  17,   1],
       [  0,   0,   0,   0,   0, 109,   0, 898,   8,  34],
       [  6,   0,   4,   3,   5,   4,  12,   1, 968,   1],
       [  0,   1,   0,   0,   0,  85,   0,  47,   1, 941]], dtype=int64)
In [41]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_6_valid1_acc = y_valid1.eq(KNN_model_type_6.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_6_valid1_acc)
VALIDATION ACCURACY = 0.8452
In [42]:
# Training vs. validation accuracy for k=12, shown side by side.
dict(TRAIN_ACC=KNN_model_type_6_train_acc, VALID_ACC=KNN_model_type_6_valid1_acc)
Out[42]:
{'TRAIN_ACC': 0.866875, 'VALID_ACC': 0.8452}

Fitting KNN with k=17

In [43]:
# Fit a 17-nearest-neighbour classifier on the training split.
KNN_model_type_7 = (
    KNeighborsClassifier(n_neighbors=17)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [44]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_7.predict(x_train)).T
Out[44]:
array([[3562,   12,   55,  160,   20,    1,  753,    0,   16,    1],
       [   1, 3815,    2,   24,    5,    0,    2,    0,    0,    0],
       [  71,   29, 3292,   32,  351,    2,  480,    0,   64,    1],
       [  81,   98,   26, 3444,   94,    4,   74,    0,   22,    1],
       [  28,    9,  362,  159, 3171,    0,  304,    0,   36,    0],
       [   0,    0,    0,    0,    0, 3129,    0,   12,    2,    4],
       [ 215,   26,  309,  104,  366,   32, 2357,    1,   46,    4],
       [   1,    0,    1,    0,    0,  454,    1, 3885,   28,  114],
       [  35,    1,    9,    6,    9,   16,   33,    0, 3727,    0],
       [   0,    0,    0,    0,    0,  294,    1,  205,    5, 3904]],
      dtype=int64)
In [45]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_7_train_acc = y_train.eq(KNN_model_type_7.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_7_train_acc)
TRAINING ACCURACY = 0.85715

Validation Accuracy

In [46]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_7.predict(x_valid1)).T
Out[46]:
array([[893,   6,  12,  48,   8,   0, 198,   0,   4,   0],
       [  1, 987,   0,  10,   3,   0,   2,   0,   0,   0],
       [ 12,  12, 745,   8, 108,   0, 154,   0,  17,   0],
       [ 27,  14,  12, 884,  35,   1,  17,   0,   9,   1],
       [  4,   1,  98,  32, 725,   0,  61,   0,  11,   0],
       [  0,   0,   0,   0,   0, 828,   0,   1,   1,   0],
       [ 72,   5,  74,  27, 111,  10, 543,   1,  17,   2],
       [  0,   0,   0,   0,   0, 128,   0, 888,   7,  30],
       [  6,   0,   4,   2,   5,   4,  12,   0, 967,   1],
       [  0,   1,   0,   1,   0,  89,   0,  58,   1, 944]], dtype=int64)
In [47]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_7_valid1_acc = y_valid1.eq(KNN_model_type_7.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_7_valid1_acc)
VALIDATION ACCURACY = 0.8404
In [48]:
# Training vs. validation accuracy for k=17, shown side by side.
dict(TRAIN_ACC=KNN_model_type_7_train_acc, VALID_ACC=KNN_model_type_7_valid1_acc)
Out[48]:
{'TRAIN_ACC': 0.85715, 'VALID_ACC': 0.8404}

Fitting KNN with k=23

In [49]:
# Fit a 23-nearest-neighbour classifier on the training split.
KNN_model_type_8 = (
    KNeighborsClassifier(n_neighbors=23)
    .fit(X=x_train, y=y_train)
)

Training Accuracy

In [50]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_train, KNN_model_type_8.predict(x_train)).T
Out[50]:
array([[3556,   15,   60,  165,   16,    1,  775,    0,   13,    1],
       [   1, 3800,    2,   24,    4,    0,    3,    0,    0,    0],
       [  76,   29, 3215,   34,  367,    1,  499,    0,   72,    1],
       [  88,  107,   27, 3431,   93,    4,   71,    0,   26,    1],
       [  27,   11,  388,  162, 3116,    0,  311,    0,   39,    0],
       [   0,    0,    0,    0,    0, 3065,    0,   13,    1,    6],
       [ 206,   27,  354,  107,  411,   35, 2307,    1,   53,    5],
       [   1,    0,    1,    0,    0,  506,    1, 3864,   36,  116],
       [  39,    1,    9,    6,    9,   17,   37,    0, 3701,    0],
       [   0,    0,    0,    0,    0,  303,    1,  225,    5, 3899]],
      dtype=int64)
In [51]:
# Share of training rows whose predicted label equals the true label.
KNN_model_type_8_train_acc = y_train.eq(KNN_model_type_8.predict(x_train)).mean()
print("TRAINING ACCURACY =", KNN_model_type_8_train_acc)
TRAINING ACCURACY = 0.84885

Validation Accuracy

In [52]:
# confusion_matrix takes (y_true, y_pred); the transpose keeps predicted labels
# on the rows and true labels on the columns, i.e. the layout of the recorded output.
confusion_matrix(y_valid1, KNN_model_type_8.predict(x_valid1)).T
Out[52]:
array([[888,   6,   9,  47,   4,   0, 205,   0,   4,   0],
       [  1, 982,   0,  10,   3,   0,   2,   0,   0,   0],
       [ 14,  13, 729,   6, 107,   0, 154,   0,  16,   0],
       [ 23,  17,   9, 890,  39,   1,  17,   0,   9,   1],
       [  6,   0, 107,  34, 720,   0,  67,   0,  12,   0],
       [  0,   0,   0,   0,   0, 809,   0,   1,   1,   1],
       [ 77,   7,  87,  22, 116,  11, 530,   1,  19,   2],
       [  0,   0,   0,   0,   0, 142,   0, 884,   9,  30],
       [  6,   0,   4,   2,   6,   4,  11,   2, 963,   1],
       [  0,   1,   0,   1,   0,  93,   1,  60,   1, 943]], dtype=int64)
In [53]:
# Share of first-validation-set rows whose predicted label equals the true label.
KNN_model_type_8_valid1_acc = y_valid1.eq(KNN_model_type_8.predict(x_valid1)).mean()
print("VALIDATION ACCURACY =", KNN_model_type_8_valid1_acc)
VALIDATION ACCURACY = 0.8338
In [54]:
# Training vs. validation accuracy for k=23, shown side by side.
dict(TRAIN_ACC=KNN_model_type_8_train_acc, VALID_ACC=KNN_model_type_8_valid1_acc)
Out[54]:
{'TRAIN_ACC': 0.84885, 'VALID_ACC': 0.8338}