import numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
%config Completer.use_jedi = False


# Info
# 2021.07.26 ChanHyukLee
# Reference : https://kaggle-kr.tistory.com/17?category=868316


# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the legacy names, so use the old name only while
# it is still registered and fall back to the renamed sheet otherwise.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Apply seaborn's theme with fonts scaled by 2.5.
sns.set(font_scale=2.5)


import missingno as msno
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')


%matplotlib inline


# Read the train and test CSV files from the local Dataset directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./Dataset/train.csv', './Dataset/test.csv')
)


# Preview the first five rows of the training frame.
df_train.head(n=5)


# Descriptive statistics of the training frame.
df_train.describe()


# Print, for every training column, its name and the percentage of missing values.
for col in df_train.columns:
    print(col)
    # '{:>10}' right-aligns the column name in a field at least 10 characters wide.
    nan_percent = 100 * (df_train[col].isna().sum() / len(df_train))
    msg = f'column : {col:>10}\t percent of NaN value: {nan_percent:.2f}%'
    print(msg)

PassengerId
column : PassengerId	 percent of NaN value: 0.00%
Survived
column :   Survived	 percent of NaN value: 0.00%
Pclass
column :     Pclass	 percent of NaN value: 0.00%
Name
column :       Name	 percent of NaN value: 0.00%
Sex
column :        Sex	 percent of NaN value: 0.00%
Age
column :        Age	 percent of NaN value: 19.87%
SibSp
column :      SibSp	 percent of NaN value: 0.00%
Parch
column :      Parch	 percent of NaN value: 0.00%
Ticket
column :     Ticket	 percent of NaN value: 0.00%
Fare
column :       Fare	 percent of NaN value: 0.00%
Cabin
column :      Cabin	 percent of NaN value: 77.10%
Embarked
column :   Embarked	 percent of NaN value: 0.22%


# Print, for every test column, its name and the percentage of missing values.
for col in df_test.columns:
    # Print the current column name only; printing df_test.columns here would
    # dump the entire column Index once per iteration.
    print(col)
    # '{:>10}' right-aligns the column name in a field at least 10 characters wide.
    msg = 'column : {:>10}\t percent of NaN value: {:.2f}%'.format(col, 100 * (df_test[col].isnull().sum() / df_test[col].shape[0]))
    print(msg)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column : PassengerId	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :     Pclass	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :       Name	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :        Sex	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :        Age	 percent of NaN value: 20.57%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :      SibSp	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :      Parch	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :     Ticket	 percent of NaN value: 0.00%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :       Fare	 percent of NaN value: 0.24%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :      Cabin	 percent of NaN value: 78.23%
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
column :   Embarked	 percent of NaN value: 0.00%


# Draw missingno's matrix view for every row and column of the training frame.
msno.matrix(
    df=df_train.iloc[:, :],
    figsize=(8, 8),
    color=(0.8, 0.5, 0.2),
)

<AxesSubplot:>


# Distribution of the target label: pie chart on the left axis, count plot on
# the right axis.
f, ax = plt.subplots(1, 2, figsize=(18, 8))

df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
# NOTE(review): 'a' looks like a placeholder y-label — confirm it is intended.
ax[0].set_ylabel('a')
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data``
# positionally, so the old positional form raises a TypeError there.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')

plt.show()


# Number of non-null Survived entries per passenger class.
df_train.groupby('Pclass')[['Survived']].count()


# Sum of the Survived column per passenger class.
df_train.groupby('Pclass')[['Survived']].sum()


# Pclass x Survived contingency table with totals, shaded with the 'summer_r' colormap.
pd.crosstab(
    index=df_train['Pclass'],
    columns=df_train['Survived'],
    margins=True,
).style.background_gradient(cmap='summer_r')


# Print the maximum, minimum and mean of the Age column
# (labels read: oldest passenger, youngest passenger, mean passenger age).
print(f"{'제일 나이 많은 탑승객':>10} : {df_train['Age'].max():.1f} Years")
print(f"{'제일 어린 탑승객':>10} : {df_train['Age'].min():.1f} Years")
print(f"{'탑승자 평균 나이':>10} : {df_train['Age'].mean():.1f} Years")

제일 나이 많은 탑승객 : 80.0 Years
 제일 어린 탑승객 : 0.4 Years
 탑승자 평균 나이 : 29.7 Years


# Seaborn KDE of Age for each survival outcome, both curves on one axis.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(9, 5))
# Survivors are drawn first so the curve order matches the legend entries.
for outcome in (1, 0):
    sns.kdeplot(df_train.loc[df_train['Survived'] == outcome, 'Age'], ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'],)
plt.show()


# Same Age-by-outcome comparison, drawn with pandas' own KDE plotting.
plt.figure(figsize=(8, 6))
# Survivors are drawn first so the curve order matches the legend entries.
for outcome in (1, 0):
    df_train.loc[df_train['Survived'] == outcome, 'Age'].plot.kde()
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()


# KDE of Age for each passenger class, overlaid on one figure.
plt.figure(figsize=(8, 6))
# Classes are drawn in 1, 2, 3 order so the curves line up with the legend entries.
for pclass in (1, 2, 3):
    df_train.loc[df_train['Pclass'] == pclass, 'Age'].plot.kde()
plt.title('Age distribution depends on Pclasses')
plt.legend(['Firstclass', 'Secondclass', 'Thirdclass'])
plt.show()


# Survival rate among passengers strictly younger than each age threshold.
age_thresholds = range(1, 80)
cummulate_survival_ratio = []
for i in age_thresholds:
    # Select the Survived values once per threshold instead of filtering the
    # frame twice (once for the sum, once for the length).
    survived_below = df_train.loc[df_train['Age'] < i, 'Survived']
    cummulate_survival_ratio.append(survived_below.sum() / len(survived_below))

plt.figure(figsize=(7, 7))
# Plot against the thresholds themselves: a bare y-list is drawn against its
# indices 0..78, which shifts every point one year to the left.
plt.plot(age_thresholds, cummulate_survival_ratio)
plt.title('Survival rate change depending on the age value')
plt.ylabel('Survival rate')
plt.xlabel('Age range')
plt.show()


# Split violin plots of Age by Pclass (left) and by Sex (right), with the two
# halves of each violin separated by Survived.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword: seaborn 0.12+ accepts only ``data`` positionally,
# so the old positional form raises a TypeError there.
# NOTE(review): newer seaborn renames ``scale`` to ``density_norm``; ``scale``
# is kept here for compatibility with older releases — confirm the target version.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x='Sex', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[1])
plt.show()


# Bar chart of the mean of Survived per Embarked value, highest first.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 7))
(
    df_train.groupby('Embarked')[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
    .plot.bar(ax=ax)
)

<AxesSubplot:xlabel='Embarked'>


# 2x2 grid of count plots for Embarked: overall, then split by Sex, Survived
# and Pclass.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# The column is passed as ``x=`` in every call: seaborn 0.12+ accepts only
# ``data`` positionally, so the old positional form raises a TypeError there.
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. Of Passengers Boarded')
sns.countplot(x='Embarked', hue='Sex', data=df_train, ax=ax[0, 1])
ax[0, 1].set_title('(2) Embarked graph per Sex')
sns.countplot(x='Embarked', hue='Survived', data=df_train, ax=ax[1, 0])
ax[1, 0].set_title('(3) Embarked graph per Survived')
sns.countplot(x='Embarked', hue='Pclass', data=df_train, ax=ax[1, 1])
ax[1, 1].set_title('(4) Embarked graph per Pclass')
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()


# Add FamilySize = SibSp + Parch + 1 to both frames
# (presumably the +1 counts the passenger themselves — confirm).
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1


# Range of the new feature in the training frame.
print("Maximum size of family", df_train['FamilySize'].max())
print("Minimum size of family", df_train['FamilySize'].min())

Maximum size of family 11
Minimum size of family 1


# Three FamilySize views side by side: passenger counts, counts split by
# Survived, and the mean of Survived per family size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# The column is passed as ``x=``: seaborn 0.12+ accepts only ``data``
# positionally, so the old positional form raises a TypeError there.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('No of passenger boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on Family Size', y=1.02)

# Mean of Survived per family size, sorted from highest to lowest.
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize', y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()


# Distribution of Fare, with its skewness shown as the legend label.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
g = sns.distplot(
    df_train['Fare'],
    color='b',
    label=f"Skewness : {df_train['Fare'].skew():.2f}",
    ax=ax,
)
# ``g`` is rebound from the value returned by distplot to the legend object.
g = g.legend(loc='best')


# Fill missing test fares with the mean fare of the test frame.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())

import numpy as np
# Replace each positive fare with its natural log; every other value
# (non-positive, or NaN since ``NaN > 0`` is False) becomes 0.
for frame in (df_train, df_test):
    frame['Fare'] = frame['Fare'].map(lambda fare: np.log(fare) if fare > 0 else 0)


# Distribution of the current Fare column, with its skewness as the legend label.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
g = sns.distplot(
    df_train['Fare'],
    color='b',
    label=f"Skewness : {df_train['Fare'].skew():.2f}",
    ax=ax,
)
# ``g`` is rebound from the value returned by distplot to the legend object.
g = g.legend(loc='best')


# Preview the first five rows of the training frame.
df_train.head(n=5)


# Count how many rows share each Ticket value.
df_train.Ticket.value_counts()

1601         7
CA. 2343     7
347082       7
3101295      6
347088       6
            ..
W/C 14208    1
C.A. 5547    1
370370       1
349228       1
349208       1
Name: Ticket, Length: 681, dtype: int64

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	Survived
Pclass
1	216
2	184
3	491

Survived	0	1	All
Pclass
1	80	136	216
2	97	87	184
3	372	119	491
All	549	342	891

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked	FamilySize
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	1.981001	NaN	S	2
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	4.266662	C85	C	2
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	2.070022	NaN	S	1
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	3.972177	C123	S	2
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	2.085672	NaN	S	1

Dataset Check¶

Target Label check¶

Data analysis¶