import numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
%config Completer.use_jedi = False
# Info
# 2021.07.26 ChanHyukLee
# Reference : https://kaggle-kr.tistory.com/17?category=868316
# Global plot appearance for the whole notebook.
# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later removed the old alias, so prefer the new name when it exists and
# fall back to the legacy one on older installations.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
# Enlarge every text element (titles, ticks, legends) of subsequent plots.
sns.set(font_scale=2.5)
import missingno as msno
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline
# Load the train and test CSV files from the local ./Dataset directory.
df_train= pd.read_csv('./Dataset/train.csv')
df_test = pd.read_csv('./Dataset/test.csv')
# Show the first rows of the training frame as a quick sanity check of the load.
df_train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
df_train.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Report, for every column of the training set, the percentage of missing values.
for col in df_train.columns:
    print(col)
    # Share of null entries in this column, expressed as a percentage.
    nan_percent = 100 * (df_train[col].isnull().sum() / df_train[col].shape[0])
    # '{:>10}' right-aligns the column name in a field at least 10 characters
    # wide; longer names are printed in full, not truncated.
    msg = 'column : {:>10}\t percent of NaN value: {:.2f}%'.format(col, nan_percent)
    print(msg)
PassengerId column : PassengerId percent of NaN value: 0.00% Survived column : Survived percent of NaN value: 0.00% Pclass column : Pclass percent of NaN value: 0.00% Name column : Name percent of NaN value: 0.00% Sex column : Sex percent of NaN value: 0.00% Age column : Age percent of NaN value: 19.87% SibSp column : SibSp percent of NaN value: 0.00% Parch column : Parch percent of NaN value: 0.00% Ticket column : Ticket percent of NaN value: 0.00% Fare column : Fare percent of NaN value: 0.00% Cabin column : Cabin percent of NaN value: 77.10% Embarked column : Embarked percent of NaN value: 0.22%
# Report, for every column of the test set, the percentage of missing values.
for col in df_test.columns:
    # Print only the current column name (the same layout as the training-set
    # report) instead of dumping the whole column Index on every iteration.
    print(col)
    # Share of null entries in this column, expressed as a percentage.
    nan_percent = 100 * (df_test[col].isnull().sum() / df_test[col].shape[0])
    # '{:>10}' right-aligns the column name in a field at least 10 characters wide.
    msg = 'column : {:>10}\t percent of NaN value: {:.2f}%'.format(col, nan_percent)
    print(msg)
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : PassengerId percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Pclass percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Name percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Sex percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Age percent of NaN value: 20.57% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : SibSp percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Parch percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Ticket percent of NaN value: 0.00% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Fare percent of NaN value: 0.24% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Cabin percent of NaN value: 78.23% Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') column : Embarked percent of NaN value: 0.00%
# Draw a missingno matrix for all rows and columns of the training frame.
msno.matrix(df = df_train.iloc[:,:], figsize=(8,8), color=(0.8, 0.5, 0.2))
<AxesSubplot:>
# Target distribution: pie chart (left) and count plot (right) of 'Survived'.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
# NOTE(review): 'a' looks like a placeholder y-label — confirm whether an empty label was intended.
ax[0].set_ylabel('a')
# Pass the column as x=...: since seaborn 0.12 the first positional parameter
# is `data`, so a positional column name collides with data=df_train.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()
# Number of non-null 'Survived' entries per passenger class.
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).count()
Survived | |
---|---|
Pclass | |
1 | 216 |
2 | 184 |
3 | 491 |
# Sum of 'Survived' per passenger class; with the 0/1 values shown above this is the survivor count.
df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=True).sum()
Survived | |
---|---|
Pclass | |
1 | 136 |
2 | 87 |
3 | 119 |
# Pclass x Survived contingency table with row/column totals, shaded with the 'summer_r' colormap.
pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True).style.background_gradient(cmap='summer_r')
Survived | 0 | 1 | All |
---|---|---|---|
Pclass | |||
1 | 80 | 136 | 216 |
2 | 97 | 87 | 184 |
3 | 372 | 119 | 491 |
All | 549 | 342 | 891 |
# Print the oldest, youngest and mean passenger age from the training set.
# The runtime labels are Korean for: "oldest passenger", "youngest passenger",
# and "average passenger age".
age_column = df_train['Age']
age_summary = (
    ('제일 나이 많은 탑승객', age_column.max()),
    ('제일 어린 탑승객', age_column.min()),
    ('탑승자 평균 나이', age_column.mean()),
)
for label, years in age_summary:
    print('{:>10} : {:.1f} Years'.format(label, years))
제일 나이 많은 탑승객 : 80.0 Years 제일 어린 탑승객 : 0.4 Years 탑승자 평균 나이 : 29.7 Years
# Age density of survivors versus non-survivors, drawn with seaborn on one axis.
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
# Draw survivors first, then non-survivors, so the curves match the legend order.
for outcome in (1, 0):
    ages_for_outcome = df_train[df_train['Survived'] == outcome]['Age']
    sns.kdeplot(ages_for_outcome, ax=ax)
plt.legend(['Survived == 1', 'Survived == 0'],)
plt.show()
# Same comparison as a pandas KDE plot: age density split by survival outcome.
plt.figure(figsize=(8, 6))
survivor_ages = df_train[df_train['Survived'] == 1]['Age']
victim_ages = df_train[df_train['Survived'] == 0]['Age']
# Plot order must match the legend entries below.
survivor_ages.plot(kind='kde')
victim_ages.plot(kind='kde')
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()
# Age density for each passenger class, overlaid on one figure.
plt.figure(figsize=(8, 6))
# Classes are drawn in ascending order so the curves line up with the legend.
for pclass in (1, 2, 3):
    df_train['Age'][df_train['Pclass'] == pclass].plot(kind='kde')
plt.title('Age distribution depends on Pclasses')
plt.legend(['Firstclass', 'Secondclass', 'Thirdclass'])
plt.show()
# Survival rate among passengers younger than each age threshold 1..79.
age_limits = range(1, 80)
cummulate_survival_ratio = []
for i in age_limits:
    # Select the passengers below the threshold once and reuse the selection.
    younger_survived = df_train[df_train['Age'] < i]['Survived']
    cummulate_survival_ratio.append(younger_survived.sum() / len(younger_survived))
plt.figure(figsize=(7, 7))
# Plot against the actual thresholds; plotting the list alone would place the
# first point (age < 1) at x=0 and shift the whole curve by one year.
plt.plot(age_limits, cummulate_survival_ratio)
plt.title('Survival rate change depending on the age value')
plt.ylabel('Survival rate')
plt.xlabel('Age range')
plt.show()
# Split violin plots of Age by Pclass (left) and by Sex (right), coloured by Survived.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword: since seaborn 0.12 the first positional parameter
# is `data`, so positional column names collide with data=df_train.
sns.violinplot(x="Pclass", y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x="Sex", y="Age", hue="Survived", data=df_train, scale='count', split=True, ax=ax[1])
plt.show()
# Bar chart of the mean of 'Survived' per embarkation port, highest first.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
survival_by_port = df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=True).mean()
ranked_ports = survival_by_port.sort_values(by='Survived', ascending=False)
ranked_ports.plot.bar(ax=ax)
<AxesSubplot:xlabel='Embarked'>
# 2x2 grid of passenger counts per embarkation port: overall, by sex,
# by survival outcome and by passenger class.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# The column is passed as x=...: since seaborn 0.12 the first positional
# parameter is `data`, so a positional column name collides with data=df_train.
sns.countplot(x='Embarked', data=df_train, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. Of Passengers Boarded')
sns.countplot(x='Embarked', hue='Sex', data=df_train, ax=ax[0, 1])
ax[0, 1].set_title('(2) Embarked graph per Sex')
sns.countplot(x='Embarked', hue='Survived', data=df_train, ax=ax[1, 0])
ax[1, 0].set_title('(3) Embarked graph per Survived')
sns.countplot(x='Embarked', hue='Pclass', data=df_train, ax=ax[1, 1])
ax[1, 1].set_title('(4) Embarked graph per Pclass')
# Leave room between the panels so the enlarged titles do not overlap.
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
# Add a 'FamilySize' feature to both frames: SibSp + Parch + 1.
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
# Report the range of the new feature on the training set.
train_family_size = df_train['FamilySize']
print("Maximum size of family", train_family_size.max())
print("Minimum size of family", train_family_size.min())
Maximum size of family 11 Minimum size of family 1
# Three panels on FamilySize: passenger counts, counts split by survival,
# and the mean of 'Survived' per family size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# The column is passed as x=...: since seaborn 0.12 the first positional
# parameter is `data`, so a positional column name collides with data=df_train.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('No of passenger boarded', y=1.02)
sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on Family Size', y=1.02)
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize', y=1.02)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
# Distribution of the raw 'Fare' column, with its skewness shown in the legend.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
train_fare = df_train['Fare']
skew_label = 'Skewness : {:.2f}'.format(train_fare.skew())
g = sns.distplot(train_fare, color='b', label=skew_label, ax=ax)
# g is rebound from the axes to the legend object, as in the original cell.
g = g.legend(loc='best')
import numpy as np


def _log_or_zero(fare):
    """Return the natural log of a strictly positive fare, otherwise 0."""
    if fare > 0:
        return np.log(fare)
    return 0


# Fill missing test-set fares with the mean of the test-set fares.
missing_fare = df_test.Fare.isnull()
df_test.loc[missing_fare, 'Fare'] = df_test['Fare'].mean()
# Log-transform 'Fare' in both frames; non-positive values map to 0.
df_train['Fare'] = df_train['Fare'].map(_log_or_zero)
df_test['Fare'] = df_test['Fare'].map(_log_or_zero)
# Re-plot the 'Fare' distribution and its skewness as the column stands at this point.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
fare_values = df_train['Fare']
legend_text = 'Skewness : {:.2f}'.format(fare_values.skew())
g = sns.distplot(fare_values, color='b', label=legend_text, ax=ax)
# g ends up holding the legend object rather than the axes.
g = g.legend(loc='best')
# Preview the training frame again to inspect the updated 'Fare' and new 'FamilySize' columns.
df_train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | FamilySize | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 1.981001 | NaN | S | 2 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 4.266662 | C85 | C | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 2.070022 | NaN | S | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 3.972177 | C123 | S | 2 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 2.085672 | NaN | S | 1 |
# Frequency of each distinct ticket value in the training set, most common first.
df_train['Ticket'].value_counts()
1601 7 CA. 2343 7 347082 7 3101295 6 347088 6 .. W/C 14208 1 C.A. 5547 1 370370 1 349228 1 349208 1 Name: Ticket, Length: 681, dtype: int64