Real Data Analysis

FASHION MNIST with Python (DAY 2) - 1. bagging, 2. random forest

딥스탯 2018. 8. 13. 17:31
FASHION_MNIST_DAY2_with_Python

FASHION MNIST with Python (DAY 2)

DATA SOURCE : https://www.kaggle.com/zalando-research/fashionmnist (Kaggle, Fashion MNIST)

FASHION MNIST with Python (DAY 1) : http://deepstat.tistory.com/35

Datasets

Importing numpy, pandas, pyplot

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Loading datasets

In [2]:
# Load the Kaggle Fashion-MNIST CSV files from the sibling "datasets" directory.
# Forward slashes are used because Python resolves them on Windows, Linux and
# macOS alike, whereas backslash-separated paths only work on Windows.
data_train = pd.read_csv("../datasets/fashion-mnist_train.csv")
data_test = pd.read_csv("../datasets/fashion-mnist_test.csv")
In [3]:
# Pull the "label" column out of each frame as its own Series (the targets).
data_train_y = data_train["label"]
y_test = data_test["label"]
In [4]:
# Feature matrices: every column except "label", with each value divided by 256.
# NOTE(review): presumably the pixels are 8-bit (0-255), so this maps them into
# [0, 1) rather than [0, 1] — confirm that 256 (not 255) is intended.
data_train_x = data_train.drop(columns="label") / 256
x_test = data_test.drop(columns="label") / 256

Splitting into training and validation sets

In [5]:
# Seed NumPy's global RNG so the random split below is repeatable.
np.random.seed(0)

# Derive the number of rows from the data instead of hard-coding it.
n_rows = len(data_train_x)
all_idx = np.arange(n_rows)

# Draw two disjoint hold-out sets of 10,000 rows each, without replacement;
# every remaining row goes to the training set. np.setdiff1d returns the
# leftover indices in sorted order, so the candidate pool handed to the second
# draw no longer depends on the iteration order of a Python set.
valid2_idx = np.random.choice(n_rows, 10000, replace=False)
valid1_idx = np.random.choice(np.setdiff1d(all_idx, valid2_idx), 10000, replace=False)
train_idx = np.setdiff1d(all_idx, np.concatenate([valid1_idx, valid2_idx]))

# Sanity check: the three index sets must partition the rows exactly.
assert len(train_idx) + len(valid1_idx) + len(valid2_idx) == n_rows

x_train = data_train_x.iloc[train_idx, :]
y_train = data_train_y.iloc[train_idx]

x_valid1 = data_train_x.iloc[valid1_idx, :]
y_valid1 = data_train_y.iloc[valid1_idx]

x_valid2 = data_train_x.iloc[valid2_idx, :]
y_valid2 = data_train_y.iloc[valid2_idx]

Bagging

Importing BaggingClassifier

In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix
c:\users\stat413server1\appdata\local\programs\python\python36\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

Fitting Bagging

In [7]:
# Bagging ensemble with every hyper-parameter left at its library default,
# trained on the training split.
BG_model = BaggingClassifier()
BG_model = BG_model.fit(x_train, y_train)

Training Accuracy

In [8]:
# confusion_matrix takes (y_true, y_pred) and puts true classes on the rows;
# transposing keeps this notebook's layout: rows = predicted class,
# columns = true class.
confusion_matrix(y_train, BG_model.predict(x_train)).T
Out[8]:
array([[3974,    0,    0,    4,    1,    0,   50,    0,    0,    0],
       [   0, 3988,    0,    3,    1,    0,    0,    0,    0,    0],
       [   2,    0, 4046,    1,   15,    0,   33,    0,    1,    0],
       [   7,    2,    1, 3914,    9,    0,    6,    0,    2,    0],
       [   0,    0,    6,    4, 3984,    0,   27,    0,    0,    0],
       [   0,    0,    0,    0,    0, 3930,    0,    7,    0,    4],
       [   9,    0,    2,    3,    6,    0, 3886,    0,    1,    0],
       [   0,    0,    0,    0,    0,    2,    0, 4093,    0,   16],
       [   2,    0,    1,    0,    0,    0,    3,    0, 3942,    0],
       [   0,    0,    0,    0,    0,    0,    0,    3,    0, 4009]],
      dtype=int64)
In [9]:
# Training accuracy: fraction of training rows whose predicted label equals the true label.
BG_train_hits = BG_model.predict(x_train) == y_train
BG_model_train_acc = BG_train_hits.mean()
print("TRAINING ACCURACY =", BG_model_train_acc)
TRAINING ACCURACY = 0.99415

Validation Accuracy

In [10]:
# confusion_matrix takes (y_true, y_pred) and puts true classes on the rows;
# transposing keeps this notebook's layout: rows = predicted class,
# columns = true class.
confusion_matrix(y_valid1, BG_model.predict(x_valid1)).T
Out[10]:
array([[ 852,    7,   13,   38,    5,    0,  182,    0,    7,    0],
       [   5,  985,    1,   13,    2,    0,    2,    0,    1,    0],
       [  10,    6,  761,    9,  113,    0,  127,    0,    4,    0],
       [  40,   22,   10,  911,   50,    1,   24,    0,    3,    0],
       [   4,    1,   99,   24,  771,    0,  105,    0,    9,    0],
       [   1,    0,    0,    0,    0, 1004,    1,   31,    5,   20],
       [  97,    5,   54,   16,   49,    0,  533,    0,   12,    0],
       [   0,    0,    0,    0,    0,   34,    0,  879,    3,   48],
       [   6,    0,    7,    1,    5,    3,   13,    3,  990,    3],
       [   0,    0,    0,    0,    0,   18,    0,   35,    0,  907]],
      dtype=int64)
In [11]:
# Validation accuracy: fraction of first-validation-set rows predicted correctly.
BG_valid1_hits = BG_model.predict(x_valid1) == y_valid1
BG_model_valid1_acc = BG_valid1_hits.mean()
print("VALIDATION ACCURACY =", BG_model_valid1_acc)
VALIDATION ACCURACY = 0.8593
In [12]:
# Show the bagging model's training and validation accuracy together.
dict(TRAIN_ACC=BG_model_train_acc, VALID_ACC=BG_model_valid1_acc)
Out[12]:
{'TRAIN_ACC': 0.99415, 'VALID_ACC': 0.8593}

Random Forest

Importing RandomForestClassifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

Fitting Random Forest

In [14]:
# Random forest trained on the training split.
# n_estimators is pinned to 10 explicitly: scikit-learn changed the default
# from 10 to 100 in version 0.22, so relying on the default would make the
# forest size (and the accuracies reported below) depend on the installed version.
RF_model = RandomForestClassifier(n_estimators=10).fit(x_train, y_train)

Training Accuracy

In [15]:
# confusion_matrix takes (y_true, y_pred) and puts true classes on the rows;
# transposing keeps this notebook's layout: rows = predicted class,
# columns = true class.
confusion_matrix(y_train, RF_model.predict(x_train)).T
Out[15]:
array([[3985,    1,    2,    0,    4,    0,   45,    0,    1,    0],
       [   0, 3983,    0,    1,    0,    0,    0,    0,    0,    0],
       [   0,    0, 4037,    2,   20,    0,   25,    0,    1,    0],
       [   3,    6,    3, 3921,    8,    0,   10,    0,    0,    0],
       [   0,    0,   11,    3, 3980,    0,   24,    0,    1,    0],
       [   0,    0,    0,    0,    0, 3929,    0,   10,    0,    2],
       [   4,    0,    3,    2,    3,    0, 3900,    0,    1,    0],
       [   0,    0,    0,    0,    0,    3,    0, 4088,    0,   11],
       [   2,    0,    0,    0,    1,    0,    1,    0, 3942,    0],
       [   0,    0,    0,    0,    0,    0,    0,    5,    0, 4016]],
      dtype=int64)
In [16]:
# Training accuracy: fraction of training rows whose predicted label equals the true label.
RF_train_hits = RF_model.predict(x_train) == y_train
RF_model_train_acc = RF_train_hits.mean()
print("TRAINING ACCURACY =", RF_model_train_acc)
TRAINING ACCURACY = 0.994525

Validation Accuracy

In [17]:
# confusion_matrix takes (y_true, y_pred) and puts true classes on the rows;
# transposing keeps this notebook's layout: rows = predicted class,
# columns = true class.
confusion_matrix(y_valid1, RF_model.predict(x_valid1)).T
Out[17]:
array([[ 856,    8,   15,   31,   10,    0,  182,    0,    6,    0],
       [   1,  988,    3,    7,    3,    0,    1,    0,    1,    0],
       [  14,    5,  747,    6,  133,    0,  160,    0,    9,    0],
       [  40,   18,   11,  912,   58,    0,   28,    0,    6,    0],
       [   6,    1,  109,   31,  739,    0,   95,    0,    2,    0],
       [   0,    0,    0,    0,    0, 1002,    0,   31,    7,   23],
       [  93,    6,   54,   24,   46,    0,  502,    0,   14,    0],
       [   0,    0,    0,    0,    0,   35,    0,  874,    1,   49],
       [   5,    0,    6,    0,    6,    3,   19,    2,  988,    3],
       [   0,    0,    0,    1,    0,   20,    0,   41,    0,  903]],
      dtype=int64)
In [18]:
# Validation accuracy: fraction of first-validation-set rows predicted correctly.
RF_valid1_hits = RF_model.predict(x_valid1) == y_valid1
RF_model_valid1_acc = RF_valid1_hits.mean()
print("VALIDATION ACCURACY =", RF_model_valid1_acc)
VALIDATION ACCURACY = 0.8511
In [19]:
# Show the random forest's training and validation accuracy together.
dict(TRAIN_ACC=RF_model_train_acc, VALID_ACC=RF_model_valid1_acc)
Out[19]:
{'TRAIN_ACC': 0.994525, 'VALID_ACC': 0.8511}