Data Augmentation


-  Full Dogs vs. Cats dataset: 25,000 images (20,000 used for training) → 80–85% accuracy
→ Training on a 4,000-image subset (2,000 used for training) → ~77% accuracy

→ With less data the model overfits, and accuracy ultimately drops.
How can we avoid overfitting?
1. L2 Regularization: mitigate overfitting by modifying the model.
   It is essentially a workaround that forcibly restrains the model to curb overfitting (a minimal sketch follows this list).
2. Data Augmentation: generate additional images by slightly shifting and transforming the originals.
   More data means less overfitting, but it is not a cure-all that solves every problem.
   → Using ImageDataGenerator, overfitting drops considerably and accuracy goes up!
   Since the issue is too little training data, only the train data needs to be augmented; the validation data is left as is.
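
As a reference for item 1, here is a minimal sketch (not from the original notes) of how L2 regularization can be attached to a Keras layer; the penalty factor 0.001 is an arbitrary example value.

from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2

# L2-regularized fully connected layer; 0.001 is an arbitrary example penalty
dense_with_l2 = Dense(units=256,
                      activation='relu',
                      kernel_regularizer=l2(0.001))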

An example of overfitting:

Even when the loss starts to rise, training does not suddenly collapse, because the model retains what it has already learned.

In other words, accuracy and loss do not move in lockstep.

 

 


############ data augmentation sample #################

 

from tensorflow.keras.preprocessing import image

from tensorflow.keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt

 

datagen = ImageDataGenerator(rotation_range=20,        # randomly rotate the original image within the given degree range
                             width_shift_range=0.1,
                             height_shift_range=0.1,   # randomly shift the image horizontally/vertically within the given fraction (10%) of its size
                             zoom_range=0.1,           # randomly zoom in/out within the given fraction
                             vertical_flip=True,
                             horizontal_flip=True,     # vertical/horizontal flips (the flipped result counts as a different image, not a near-duplicate)
                             fill_mode='nearest')      # fill the empty space created by a transform with the nearest pixels

 

# Let's augment a single image!
# Load the image

img = image.load_img('/content/drive/MyDrive/융복합 프로젝트형 AI 서비스 개발(2021.06)/09월/30일(목요일)/cat_dog_small/train/cats/cat.3.jpg',

                     target_size=(150,150))

# print(type(img))  # <class 'PIL.Image.Image'>

# Convert the image to a NumPy array

img_arr = image.img_to_array(img)

print(type(img_arr), img_arr.shape)  # <class 'numpy.ndarray'> (150, 150, 3)

 

img_arr = img_arr.reshape((1,) + img_arr.shape)

print(type(img_arr), img_arr.shape)  # <class 'numpy.ndarray'> (1, 150, 150, 3)

 

# Set up the figure (canvas)

fig = plt.figure(figsize=(10,10))

ax = list()

 

# Create the subplots and store them in a list

for i in range(20):

    ax.append(fig.add_subplot(4,5,i+1))

 

idx=0

for batch in datagen.flow(img_arr, batch_size=1):    # flow() keeps generating augmented images starting from img_arr
    ax[idx].imshow(image.array_to_img(batch[0]))
    idx += 1
    if idx % 20 == 0:   # stop after 20 images
        break

 

plt.tight_layout()

plt.show()  
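
# flow() can also write the generated images to disk via its save_to_dir argument.
# A minimal sketch (not in the original notes); the output folder name below is hypothetical.
import os

out_dir = '/content/augmented_preview'   # hypothetical scratch folder
os.makedirs(out_dir, exist_ok=True)

idx = 0
for batch in datagen.flow(img_arr, batch_size=1,
                          save_to_dir=out_dir,      # save each generated image as a file
                          save_prefix='cat_aug',
                          save_format='jpeg'):
    idx += 1
    if idx == 20:    # stop after 20 files
        break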

 


## Dogs vs. Cats CNN using ImageDataGenerator
## Including Data Augmentation

 

import numpy as np

 

import tensorflow as tf

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Flatten, Dense, Conv2D, MaxPooling2D, Dropout

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.preprocessing.image import ImageDataGenerator

 

import os

import matplotlib.pyplot as plt

 

# First, we need to create the ImageDataGenerators!

train_dir = '/content/drive/MyDrive/융복합 프로젝트형 AI 서비스 개발(2021.06)/09월/30일(목요일)/cat_dog_small/train'

valid_dir = '/content/drive/MyDrive/융복합 프로젝트형 AI 서비스 개발(2021.06)/09월/30일(목요일)/cat_dog_small/validation'

test_dir = '/content/drive/MyDrive/융복합 프로젝트형 AI 서비스 개발(2021.06)/09월/30일(목요일)/cat_dog_small/test'

 

# Let's create the ImageDataGenerator!
# It pulls images from a given folder,
# and can rescale them on the way in!

 

train_datagen = ImageDataGenerator(rescale=1/255,

                                   rotation_range=20,

                                   width_shift_range=0.1,

                                   height_shift_range=0.1,

                                   zoom_range=0.1,

                                   vertical_flip=True,

                                   horizontal_flip=True,

                                   fill_mode='nearest')  

 

# Of course, no augmentation is applied to the validation and test data!

valid_datagen = ImageDataGenerator(rescale=1/255)

 

train_generator = train_datagen.flow_from_directory(
    train_dir,                   # target directory
    classes=['cats','dogs'],     # assign labels (0, 1) in the order cats, dogs
                                 # if classes is omitted, labels follow the folder order (ascending)
    target_size=(150,150),       # image resize
    batch_size=20,               # pull 20 images at a time,
                                 # regardless of label
    class_mode='binary'          # 'binary' for binary classification,
                                 # 'categorical' for multi-class classification
)

 

valid_generator = valid_datagen.flow_from_directory(
    valid_dir,
    classes=['cats','dogs'],
    target_size=(150,150),
    batch_size=20,
    class_mode='binary')

  

# The data is ready!
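
# A quick sanity check (not in the original notes): pull one batch from the train
# generator and confirm the shapes and label mapping, assuming the folders above exist.
x_batch, y_batch = next(train_generator)
print(x_batch.shape)                  # expected: (20, 150, 150, 3)
print(y_batch.shape)                  # expected: (20,)
print(train_generator.class_indices)  # expected: {'cats': 0, 'dogs': 1}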

 

# Build the CNN model

 

model = Sequential()

 

model.add(Conv2D(filters=32,
                 kernel_size=(3,3),
                 activation='relu',
                 padding='SAME',
                 input_shape=(150,150,3)))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(filters=64,
                 kernel_size=(3,3),
                 activation='relu',
                 padding='SAME'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(filters=128,
                 kernel_size=(3,3),
                 activation='relu',
                 padding='SAME'))
model.add(Conv2D(filters=128,
                 kernel_size=(3,3),
                 activation='relu',
                 padding='SAME'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(filters=64,
                 kernel_size=(3,3),
                 activation='relu',
                 padding='SAME'))
model.add(Conv2D(filters=32,
                 kernel_size=(3,3),
                 activation='relu',
                 padding='SAME'))
model.add(MaxPooling2D(pool_size=(2,2)))

 

# FC Layer

 

# acts as the input layer of the FC part
model.add(Flatten())   # flatten each sample's feature maps into a 1D vector

# Dropout layer (used to reduce overfitting)
model.add(Dropout(rate=0.5))

# Hidden layer
model.add(Dense(units=256,
                activation='relu'))

# Output layer
model.add(Dense(units=1,
                activation='sigmoid'))

 

# model summary

# print(model.summary())

 

# Configure the optimizer

model.compile(optimizer=Adam(learning_rate=1e-4),

              loss='binary_crossentropy',

              metrics=['accuracy'])

 

# Learning

 

history = model.fit(train_generator,       # yields 20 images at a time; 2,000 images make up 1 epoch
                    steps_per_epoch=100,   # 20 * 100 = 2000
                    epochs=100,
                    verbose=1,
                    validation_data=valid_generator,
                    validation_steps=50)   # 20 * 50 = 1000
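
# The hard-coded steps above correspond to 2,000 training / 1,000 validation images.
# A small sketch (not in the original notes) showing how the same numbers can be derived
# from the generators, since the iterators returned by flow_from_directory expose
# samples and batch_size:
train_steps = train_generator.samples // train_generator.batch_size   # 2000 // 20 = 100
valid_steps = valid_generator.samples // valid_generator.batch_size   # 1000 // 20 = 50
print(train_steps, valid_steps)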


test_datagen = ImageDataGenerator(rescale=1/255)

 

test_generator = test_datagen.flow_from_directory(
    test_dir,
    classes=['cats','dogs'],
    target_size=(150,150),
    batch_size=20,
    class_mode='binary')

 

result = model.evaluate(test_generator, steps=50)
print('Dogs vs. Cats on the small dataset, [loss, accuracy] : {}'.format(result))


# Save the model
model.save('/content/drive/MyDrive/융복합 프로젝트형 AI 서비스 개발(2021.06)/10월/01일(금요일)/cat_dog_small_cnn_augmentation_model.h5')
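
# A minimal sketch (not part of the original notes) of reloading the saved model
# and classifying a single image; the sample image path below is a placeholder.
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image

loaded_model = load_model('/content/drive/MyDrive/융복합 프로젝트형 AI 서비스 개발(2021.06)/10월/01일(금요일)/cat_dog_small_cnn_augmentation_model.h5')

sample = image.load_img('/content/sample_image.jpg', target_size=(150,150))   # placeholder path
sample_arr = image.img_to_array(sample) / 255.0             # same rescaling as during training
sample_arr = sample_arr.reshape((1,) + sample_arr.shape)    # (1, 150, 150, 3)
prob = loaded_model.predict(sample_arr)[0][0]
print('dog' if prob > 0.5 else 'cat', prob)                 # sigmoid output: near 0 → cat, near 1 → dog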

 


# Compare accuracy and loss

train_acc = history.history['accuracy']
valid_acc = history.history['val_accuracy']

train_loss = history.history['loss']
valid_loss = history.history['val_loss']

fig = plt.figure()
fig_1 = fig.add_subplot(1,2,1)
fig_2 = fig.add_subplot(1,2,2)

fig_1.plot(train_acc, color='b', label='training_accuracy')
fig_1.plot(valid_acc, color='r', label='validation_accuracy')
fig_1.legend()

fig_2.plot(train_loss, color='b', label='training_loss')
fig_2.plot(valid_loss, color='r', label='validation_loss')
fig_2.legend()

plt.tight_layout()
plt.show()