티스토리 뷰

FASHION_MNIST_DAY1_with_Python

FASHION MNIST with Python (DAY 1)

DATA SOURCE : https://www.kaggle.com/zalando-research/fashionmnist (Kaggle, Fashion MNIST)

Datasets

Importing numpy, pandas, pyplot

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Loading datasets

In [2]:
# Load the Fashion-MNIST train/test CSV files from the sibling "datasets" folder.
# The location is built with pathlib instead of hard-coded backslashes so the
# cell also runs on systems where "\" is not a path separator.
DATA_DIR = Path("..") / "datasets"
data_train = pd.read_csv(DATA_DIR / "fashion-mnist_train.csv")
data_test = pd.read_csv(DATA_DIR / "fashion-mnist_test.csv")
In [3]:
# Preview the first five rows: a "label" column followed by the pixel columns.
data_train.head(n=5)
Out[3]:
label pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783 pixel784
0 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 6 0 0 0 0 0 0 0 5 0 ... 0 0 0 30 43 0 0 0 0 0
3 0 0 0 0 1 2 0 0 0 0 ... 3 0 0 0 0 1 0 0 0 0
4 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

In [4]:
# (rows, columns) of the training table; the columns include the label column.
data_train.shape
Out[4]:
(60000, 785)
In [5]:
# (rows, columns) of the test table; the columns include the label column.
data_test.shape
Out[5]:
(10000, 785)
In [6]:
# Pull the class labels out of both tables; the target column is named "label".
data_train_y = data_train["label"]
y_test = data_test["label"]
In [7]:
# Every column except "label" is a pixel feature. Dividing by 256 rescales the
# intensities to just below 1 (assuming 8-bit values in 0..255 — TODO confirm).
# NOTE(review): 255 would be the divisor for an exact [0, 1] range; 256 is kept
# here so the results shown below stay valid.
data_train_x = data_train.drop(columns="label") / 256
x_test = data_test.drop(columns="label") / 256
In [8]:
# Render the first training sample: its flat pixel row is reshaped to 28x28.
# The trailing expression makes the cell display that sample's label.
first_image = data_train_x.iloc[0, :].values.reshape([28, 28])
plt.imshow(first_image)
data_train_y.iloc[0]
Out[8]:
2
In [9]:
# Render the second training sample: its flat pixel row is reshaped to 28x28.
# The trailing expression makes the cell display that sample's label.
second_image = data_train_x.iloc[1, :].values.reshape([28, 28])
plt.imshow(second_image)
data_train_y.iloc[1]
Out[9]:
9

Splitting into validation and training sets

In [10]:
# Draw two disjoint hold-out index sets (valid1, valid2) from the training
# rows; whatever is left becomes the actual training set.
np.random.seed(0)  # fixed seed so the split is reproducible
n_samples = len(data_train_x)  # derived from the data instead of hard-coding 60000
n_holdout = 10000  # size of each validation set

valid2_idx = np.random.choice(n_samples, n_holdout, replace=False)
# Rows still available after removing valid2. The pool is sorted before sampling
# so the draw depends only on the seed, not on set iteration order.
remaining_idx = set(range(n_samples)) - set(valid2_idx)
valid1_idx = np.random.choice(sorted(remaining_idx), n_holdout, replace=False)
train_idx = sorted(remaining_idx - set(valid1_idx))

# Materialise the three subsets by positional row index: features and labels
# are selected with the same index list so they stay aligned.
x_train, y_train = data_train_x.iloc[train_idx], data_train_y.iloc[train_idx]

# First validation set (used for the validation accuracies below).
x_valid1, y_valid1 = data_train_x.iloc[valid1_idx], data_train_y.iloc[valid1_idx]

# Second validation set.
x_valid2, y_valid2 = data_train_x.iloc[valid2_idx], data_train_y.iloc[valid2_idx]

Multinomial Logistic Regression

Importing LogisticRegression and confusion_matrix

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

Fitting Logistic Regression

In [12]:
# Fit a logistic-regression classifier with default hyperparameters on the
# training split.
# NOTE(review): the section heading says "multinomial"; whether the default
# configuration fits a multinomial or a one-vs-rest model depends on the
# installed scikit-learn version — confirm.
lr_estimator = LogisticRegression()
LR_model = lr_estimator.fit(x_train, y_train)

Training Accuracy

In [13]:
# Confusion matrix on the training split. The call uses scikit-learn's
# (y_true, y_pred) argument order and is then transposed, so that rows are
# predicted labels and columns are true labels.
confusion_matrix(y_train, LR_model.predict(x_train)).T
Out[13]:
array([[3390,   14,   69,   99,    8,    3,  546,    0,   14,    0],
       [   8, 3887,    5,   37,    7,    0,    8,    0,    2,    0],
       [  72,    4, 3197,   42,  321,    2,  420,    0,   16,    0],
       [ 177,   68,   37, 3533,  126,    0,  134,    0,   35,    1],
       [  19,    7,  459,  112, 3254,    0,  323,    0,   13,    0],
       [   5,    0,    1,    0,    0, 3752,    0,   94,   12,   38],
       [ 279,    5,  259,   90,  285,    0, 2507,    0,   33,    0],
       [   0,    1,    1,    0,    0,  122,    1, 3905,   13,  106],
       [  44,    4,   28,   16,   15,   20,   65,    9, 3807,    2],
       [   0,    0,    0,    0,    0,   33,    1,   95,    1, 3882]],
      dtype=int64)
In [14]:
# Accuracy = share of training samples whose predicted label equals the true one.
lr_train_hits = LR_model.predict(x_train) == y_train
LR_model_train_acc = lr_train_hits.mean()
print("TRAINING ACCURACY =", LR_model_train_acc)
TRAINING ACCURACY = 0.87785

Validation Accuracy

In [15]:
# Confusion matrix on the first validation split. Called in scikit-learn's
# (y_true, y_pred) order and transposed, so rows are predicted labels and
# columns are true labels.
confusion_matrix(y_valid1, LR_model.predict(x_valid1)).T
Out[15]:
array([[841,   5,  17,  45,   5,   0, 142,   0,   5,   1],
       [  6, 987,   1,   9,   2,   0,   3,   0,   0,   0],
       [ 11,   4, 720,   7,  97,   1, 138,   0,   8,   0],
       [ 47,  23,  10, 891,  45,   0,  28,   0,   8,   0],
       [  1,   1, 109,  34, 743,   0,  93,   0,   5,   0],
       [  0,   0,   1,   0,   1, 964,   0,  28,   6,  12],
       [ 95,   5,  82,  23,  94,   1, 561,   1,  21,   0],
       [  0,   0,   0,   0,   0,  48,   0, 889,   5,  32],
       [ 14,   1,   5,   3,   8,  15,  20,   2, 975,   2],
       [  0,   0,   0,   0,   0,  31,   2,  28,   1, 931]], dtype=int64)
In [16]:
# Accuracy on the first validation split: mean of the per-sample hit/miss flags.
lr_valid_hits = LR_model.predict(x_valid1) == y_valid1
LR_model_valid1_acc = lr_valid_hits.mean()
print("VALIDATION ACCURACY =", LR_model_valid1_acc)
VALIDATION ACCURACY = 0.8502
In [17]:
# Side-by-side summary of the logistic-regression accuracies.
dict(TRAIN_ACC=LR_model_train_acc, VALID_ACC=LR_model_valid1_acc)
Out[17]:
{'TRAIN_ACC': 0.87785, 'VALID_ACC': 0.8502}

Decision Tree

Importing DecisionTreeClassifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

Huge Tree

In [19]:
# Fit a decision tree with default hyperparameters: no depth or leaf-size
# limits are passed, so nothing here restricts how large the tree grows.
unpruned_tree = DecisionTreeClassifier()
TR_model1 = unpruned_tree.fit(x_train, y_train)

Training Accuracy1

In [20]:
# Training-split confusion matrix for the unrestricted tree. Called in
# scikit-learn's (y_true, y_pred) order and transposed, so rows are predicted
# labels and columns are true labels.
confusion_matrix(y_train, TR_model1.predict(x_train)).T
Out[20]:
array([[3994,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, 3990,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0, 4056,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 3929,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0, 4016,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0, 3932,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 4005,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 4103,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0, 3946,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 4029]],
      dtype=int64)
In [21]:
# Training accuracy of the unrestricted tree (fraction of exact label matches).
tree1_train_hits = TR_model1.predict(x_train) == y_train
TR_model1_train_acc = tree1_train_hits.mean()
print("TRAINING ACCURACY =", TR_model1_train_acc)
TRAINING ACCURACY = 1.0

Validation Accuracy1

In [22]:
# Validation-split confusion matrix for the unrestricted tree. Called in
# scikit-learn's (y_true, y_pred) order and transposed, so rows are predicted
# labels and columns are true labels.
confusion_matrix(y_valid1, TR_model1.predict(x_valid1)).T
Out[22]:
array([[747,  10,  18,  41,  15,   1, 152,   0,  12,   0],
       [  8, 981,   4,  31,   7,   1,   5,   0,   4,   0],
       [ 19,   1, 612,  15, 131,   3, 115,   0,  17,   2],
       [ 46,  25,  22, 820,  71,   3,  46,   0,   8,   1],
       [  5,   2, 138,  44, 641,   0, 115,   0,  15,   1],
       [  0,   0,   3,   3,   1, 928,   1,  52,  20,  24],
       [179,   7, 134,  43, 120,   1, 535,   0,  22,   1],
       [  1,   0,   0,   2,   0,  79,   0, 817,   6,  55],
       [  9,   0,  12,  13,   6,  12,  17,   6, 927,   6],
       [  1,   0,   2,   0,   3,  32,   1,  73,   3, 888]], dtype=int64)
In [23]:
# Validation accuracy of the unrestricted tree on the first validation split.
tree1_valid_hits = TR_model1.predict(x_valid1) == y_valid1
TR_model1_valid1_acc = tree1_valid_hits.mean()
print("VALIDATION ACCURACY =", TR_model1_valid1_acc)
VALIDATION ACCURACY = 0.7896
In [24]:
# Side-by-side summary of the unrestricted tree's accuracies.
dict(TRAIN_ACC=TR_model1_train_acc, VALID_ACC=TR_model1_valid1_acc)
Out[24]:
{'TRAIN_ACC': 1.0, 'VALID_ACC': 0.7896}

Smaller Tree (to avoid overfitting)

In [25]:
# Fit a constrained tree: at least 5 samples per leaf and depth capped at 12,
# intended (per the section heading) to reduce overfitting.
small_tree = DecisionTreeClassifier(
    min_samples_leaf=5,
    max_depth=12,
)
TR_model2 = small_tree.fit(x_train, y_train)

Training Accuracy2

In [26]:
# Training-split confusion matrix for the constrained tree. Called in
# scikit-learn's (y_true, y_pred) order and transposed, so rows are predicted
# labels and columns are true labels.
confusion_matrix(y_train, TR_model2.predict(x_train)).T
Out[26]:
array([[3491,   23,   49,  101,    9,    5,  341,    3,   22,    0],
       [   6, 3837,    4,   18,    5,    3,    8,    0,    0,    0],
       [  44,   14, 3125,   53,  392,    2,  357,    1,   19,    7],
       [  86,   72,   30, 3522,  181,    7,   98,    3,   18,    0],
       [  39,   12,  516,  120, 3218,    0,  479,    0,   34,    2],
       [   6,    1,    6,    4,    2, 3741,    3,   73,   17,   46],
       [ 298,   25,  299,   98,  198,    4, 2683,    0,   33,    4],
       [   0,    0,    0,    0,    0,  115,    1, 3858,   12,  140],
       [  24,    6,   26,   13,   10,   17,   34,   13, 3790,   13],
       [   0,    0,    1,    0,    1,   38,    1,  152,    1, 3817]],
      dtype=int64)
In [27]:
# Training accuracy of the constrained tree (fraction of exact label matches).
tree2_train_hits = TR_model2.predict(x_train) == y_train
TR_model2_train_acc = tree2_train_hits.mean()
print("TRAINING ACCURACY =", TR_model2_train_acc)
TRAINING ACCURACY = 0.87705

Validation Accuracy2

In [28]:
# Validation-split confusion matrix for the constrained tree. Called in
# scikit-learn's (y_true, y_pred) order and transposed, so rows are predicted
# labels and columns are true labels.
confusion_matrix(y_valid1, TR_model2.predict(x_valid1)).T
Out[28]:
array([[813,   9,  15,  35,   7,   0, 151,   1,  10,   0],
       [  6, 967,   1,  12,   1,   2,   3,   0,   1,   0],
       [  7,   5, 660,  13, 110,   1, 104,   0,  18,   1],
       [ 42,  35,  16, 870,  63,   8,  37,   0,   7,   1],
       [  3,   3, 143,  39, 735,   0, 155,   0,  14,   0],
       [  0,   0,   4,   2,   3, 938,   2,  45,  13,  31],
       [134,   7,  92,  32,  66,   0, 519,   0,  22,   2],
       [  1,   0,   0,   0,   0,  61,   0, 839,   6,  51],
       [  9,   0,  14,   8,   7,  10,  16,   8, 941,   6],
       [  0,   0,   0,   1,   3,  40,   0,  55,   2, 886]], dtype=int64)
In [29]:
# Validation accuracy of the constrained tree on the first validation split.
tree2_valid_hits = TR_model2.predict(x_valid1) == y_valid1
TR_model2_valid1_acc = tree2_valid_hits.mean()
print("VALIDATION ACCURACY =", TR_model2_valid1_acc)
VALIDATION ACCURACY = 0.8168
In [30]:
# Side-by-side summary of the constrained tree's accuracies.
dict(TRAIN_ACC=TR_model2_train_acc, VALID_ACC=TR_model2_valid1_acc)
Out[30]:
{'TRAIN_ACC': 0.87705, 'VALID_ACC': 0.8168}
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
TAG
more
«   2025/05   »
1 2 3
4 5 6 7 8 9 10
11 12 13 14 15 16 17
18 19 20 21 22 23 24
25 26 27 28 29 30 31
글 보관함