titanic code


# Logistic Regression (binary classification)

 

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Outlier handling and normalization are left for you to try later!

 

# Let's start with domain analysis!

# Raw Data Loading
df = pd.read_csv('./data/titanic/train.csv', sep=',')

# display(df)

train_df = df.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=False)

# display(train_df)

# Convert Sex to numeric values
sex_mapping = { 'male' : 0, 'female' : 1 }

# the replacement rules are defined as a dict
train_df['Sex'] = train_df['Sex'].map(sex_mapping)
# .map() applies this dict to each element of the Series
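
A quick illustration of .map() behavior (a tiny sketch with made-up values, not from the post): any value missing from the dict becomes NaN, so data typos would surface here.

import pandas as pd
s = pd.Series(['male', 'female', 'Male'])     # 'Male' is a deliberate typo
print(s.map({'male': 0, 'female': 1}))        # -> 0.0, 1.0, NaN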

# Combine the two columns into a single Family feature
train_df['Family'] = train_df['SibSp'] + train_df['Parch']

train_df.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# inplace=True : modify train_df in place instead of returning a new DataFrame

# Embarked contains NaN values, which must be replaced before the column can be used!
# A common choice is the most frequent value; this post simply fills with 'Q'.
train_df['Embarked'] = train_df['Embarked'].fillna('Q')
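
To fill with the actual most frequent value rather than a hard-coded one, a minimal sketch (same DataFrame):

train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
# train_df['Embarked'].value_counts() shows the per-port counts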

 

# Convert the embarkation port to numbers
embarked_mapping = { 'S' : 0, 'C' : 1, 'Q' : 2}
train_df['Embarked'] = train_df['Embarked'].map(embarked_mapping)

# Age also contains NaN!
# Here we replace the NaN with the overall mean!
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
 
# display(train_df)

# How should Age be handled? => Binning (Numerical value -> Categorical value,
# i.e. range-based categories such as age groups, like grade level or sex)
train_df.loc[train_df['Age'] < 8,'Age'] = 0
train_df.loc[(train_df['Age'] >= 8) & (train_df['Age'] < 20),'Age'] = 1
train_df.loc[(train_df['Age'] >= 20) & (train_df['Age'] < 60),'Age'] = 2
train_df.loc[(train_df['Age'] >= 60),'Age'] = 3
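
The same binning can also be written with pd.cut; a minimal sketch (bin edges taken from the code above; the upper cap of 200 is an arbitrary assumption):

train_df['Age'] = pd.cut(train_df['Age'],
                         bins=[0, 8, 20, 60, 200],
                         labels=[0, 1, 2, 3],
                         right=False).astype(float)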

display(train_df)



# Now that the data is ready, train the model and run validation to measure accuracy.
# Around 75% is a reasonable result!

# Once the model is done, use test.csv to generate prediction data, write it to a
# file, and submit it to kaggle!

# Check the final accuracy on the kaggle screen, capture it, and submit it later!

# This is a graded assignment, so write it on your own!

 

x_data = train_df.drop('Survived',axis=1,inplace=False).values
t_data = train_df['Survived'].values

# Prepare the Training Data and Validation Data!
train_x_data, valid_x_data, train_t_data, valid_t_data = \
train_test_split(x_data, t_data,
                 test_size=0.2,
                 stratify=t_data,     # keep the Survived ratio equal in both splits
                 random_state=2)
display(train_x_data)

 


 

# Placeholders (note: this block uses the TF 1.x graph API; under TF 2.x it needs
# tf.compat.v1 with v2 behavior disabled)
x = tf.placeholder(shape=[None,5], dtype=tf.float32)
t = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias
w = tf.Variable(tf.random.normal([5,1]))
b = tf.Variable(tf.random.normal([1]))

# Hypothesis (Logistic Model)
logit = tf.matmul(x,w) + b
H = tf.sigmoid(logit)

# loss function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit,
                                                              labels=t))
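# For reference: sigmoid_cross_entropy_with_logits computes, per sample,
#   -( t * log(sigmoid(logit)) + (1 - t) * log(1 - sigmoid(logit)) )
# in a numerically stable form; reduce_mean then averages over the batch.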

# train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-5).minimize(loss)

# session & variable initialization
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training loop
for step in range(300000):
    
    tmp, loss_val = sess.run([train, loss], 
                             feed_dict={x:train_x_data, 
                                        t:train_t_data.reshape(-1,1)})
    
    if step % 30000 == 0:
        print('loss : {}'.format(loss_val))

 


predict = tf.cast(H > 0.5, dtype=tf.float32)   
correct = tf.equal(predict, t)
acc = tf.reduce_mean(tf.cast(correct, tf.float32))
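# H > 0.5 thresholds the sigmoid output into a 0/1 prediction; equal() compares it
# with the labels, and the mean of the casted booleans is exactly the accuracy.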

acc_val = sess.run(acc, feed_dict={x:valid_x_data, 
                                   t:valid_t_data.reshape(-1,1)})

print('accuracy : {}'.format(acc_val))

 


df = pd.read_csv('./data/titanic/test.csv')
submission = pd.read_csv('./data/titanic/gender_submission.csv')

test_df = df.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=False)

# display(test_df)

sex_mapping = { 'male' : 0, 'female' : 1 }
test_df['Sex'] = test_df['Sex'].map(sex_mapping)



test_df['Family'] = test_df['SibSp'] + test_df['Parch']

test_df.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Embarked contains NaN values, which must be replaced before use.
# Fill with 'Q', the same value used for the training data.
test_df['Embarked'] = test_df['Embarked'].fillna('Q')

embarked_mapping = { 'S' : 0, 'C' : 1, 'Q' : 2}
test_df['Embarked'] = test_df['Embarked'].map(embarked_mapping)

# Age also contains NaN!
# Replace it with the overall mean, as before.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())

# display(test_df)

# How should Age be handled? => Binning (Numerical value -> Categorical value)
test_df.loc[test_df['Age'] < 8,'Age'] = 0
test_df.loc[(test_df['Age'] >= 8) & (test_df['Age'] < 20),'Age'] = 1
test_df.loc[(test_df['Age'] >= 20) & (test_df['Age'] < 60),'Age'] = 2
test_df.loc[(test_df['Age'] >= 60),'Age'] = 3

display(test_df)

test_x_data = test_df.values

predict = sess.run(tf.cast(H > 0.5, dtype=tf.float32), feed_dict={x:test_x_data})
submission['Survived'] = predict.ravel()   # flatten the (N,1) output into a 1D column
submission['Survived'] = submission['Survived'].astype('int')

submission.to_csv('./submission.csv', index=False)
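
A quick sanity check on the file before uploading (a small sketch, not in the original post):

check = pd.read_csv('./submission.csv')
print(check.shape)    # kaggle expects 418 rows: (418, 2)
print(check.head())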

 


# sklearn version

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
# Dense : a Fully Connected Layer; in deep learning, models stack several layers,
# and every node in one layer exchanges data with every node in the next layer,
# so no node can skip passing data on (see the small shape demo below)
from tensorflow.keras.optimizers import SGD

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
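
To make the fully-connected idea concrete, a small shape demo (a sketch with made-up dimensions, not part of the original post):

demo = Sequential([Dense(1, activation='sigmoid', input_shape=(5,))])
demo.summary()   # 6 trainable parameters: a (5,1) weight matrix plus 1 bias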

 

# Logistic Regression implemented with sklearn

train_x_data, valid_x_data, train_t_data, valid_t_data = \
train_test_split(train_df.drop('Survived', axis=1, inplace=False),
                 train_df['Survived'],
                 test_size=0.3,
                 random_state=0,
                 stratify=train_df['Survived'])

 

# Normalization
scaler = MinMaxScaler()
scaler.fit(train_x_data)   # learn min/max from the training data only
train_x_data_norm = scaler.transform(train_x_data)
valid_x_data_norm = scaler.transform(valid_x_data)
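
MinMaxScaler rescales each feature with x' = (x - min) / (max - min), so every training column lands in [0, 1]; the validation data reuses the training min/max, which is why fit() sees only train_x_data. A quick check (a sketch):

print(train_x_data_norm.min(axis=0))   # ~0 per column
print(train_x_data_norm.max(axis=0))   # ~1 per column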

 

# Logistic Regression
model = LogisticRegression()
model.fit(train_x_data_norm, train_t_data)

score = model.score(valid_x_data_norm, valid_t_data)
# score() reports Accuracy for classifiers
print('sklearn score : {}'.format(score))  # 0.8059701492537313
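
The fitted sklearn model can score the kaggle test set as well; a minimal sketch (assumes test_x_data was prepared as in the TF section above, and reuses the fitted scaler):

test_x_norm = scaler.transform(test_x_data)
print(model.predict(test_x_norm)[:10])   # first ten 0/1 survival predictions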

 

 


# TensorFlow 2.0 version

# tensorflow (Keras) implementation

keras_model = Sequential()
# keras_model.add(Flatten(input_shape=(train_x_data_norm.shape[1],)))
# keras_model.add(Dense(1, activation='sigmoid'))
keras_model.add(Dense(1, activation='sigmoid', input_shape=(train_x_data_norm.shape[1],)))



keras_model.compile(optimizer=SGD(learning_rate=1e-2),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

keras_model.fit(train_x_data_norm,
                train_t_data,
                epochs=1000,
                verbose=0)

keras_result = keras_model.evaluate(valid_x_data_norm, valid_t_data)

# evaluate() returns [loss, accuracy]
print('keras score : {}'.format(keras_result))  # [0.4378598928451538, 0.8171641826629639]
