[Python]Data Preparation Basic(데이터 전처리 기초) 4

Python_Intermediate/Pandas

[Python]Data Preparation Basic(데이터 전처리 기초) 4

AnKiWoong 2020. 2. 2. 09:47

Live Coding

1. Sample Data

# Sample score sheet: one list of five scores per subject.
# None marks a missing score.
_subjects = ('국어', '영어', '수학', '과학')
_scores = (
    [98, 88, 68, 64, 120],
    [None, 90, 60, 20, 50],
    [90, 70, None, 31, None],
    [120, 50, None, 60, 88],
)

# Pair each subject with its score list, keeping the subject order above.
grade_dic = dict(zip(_subjects, _scores))

2. 결측치 여부 확인

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Flag the missing cells. isnull() and isna() are two spellings of the
# same check, so both masks are expected to be identical.
null_data, null_data2 = df.isnull(), df.isna()

for mask in (null_data, null_data2):
    print_df(mask)

3. 결측치 수 파악

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Boolean mask: True wherever a cell is missing.
null_data = df.isnull()

# Summing the mask down each column counts its True cells, i.e. the
# number of missing values per column.
null_data_sum = null_data.sum(axis=0)

print_df(null_data_sum)

4. 결측치 포함된 행 삭제

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Drop every row that contains at least one missing value
# (axis=0 / how='any' are dropna's defaults, spelled out here).
null_data_del = df.dropna(axis=0, how='any')

print_df(null_data_del)

5. 결측치 개수 확인

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Drop every row that contains at least one missing value.
null_data_del = df.dropna(axis=0, how='any')

# Count the missing values left in each column; after the drop every
# count should be zero.
remaining_nulls = null_data_del.isnull().sum()
print_df(remaining_nulls)

6. 결측치 포함된 열 삭제

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Drop every column that contains at least one missing value.
null_data_del = df.dropna(axis='columns')

print_df(null_data_del)

7. 결측치 개수 확인

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Drop every column that contains at least one missing value.
null_data_del = df.dropna(axis='columns')

# Count the missing values left in each surviving column; after the
# drop every count should be zero.
remaining_nulls = null_data_del.isnull().sum()
print_df(remaining_nulls)

8. 행의 모든 값이 결측치인 경우 행을 삭제

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Drop a row only when every value in it is missing; rows with at
# least one real value are kept.
null_data_del = df.dropna(axis='index', how='all')

print_df(null_data_del)

9. 열의 모든 값이 결측치인 경우 열을 삭제

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Drop a column only when every value in it is missing; columns with
# at least one real value are kept.
null_data_del = df.dropna(axis='columns', how='all')

print_df(null_data_del)

10. 결측치 대표값으로 대체

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df
from sklearn.impute import SimpleImputer
import numpy

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Replace every missing value with a fixed stand-in value (0 here).
# fillna returns a new frame; df itself is left untouched.
re_null_data = df.fillna(0)

print_df(re_null_data)

11. 결측치 규칙 정의 및 적용

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df
from sklearn.impute import SimpleImputer
import numpy

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Define the imputation rule: treat NaN as missing and replace it with
# the most frequent value of its column.
null_regulation = SimpleImputer(
    strategy='most_frequent',
    missing_values=numpy.nan,
)

# Learn the per-column fill values and apply them in one step.
# The imputer is given the bare value array, without row/column labels.
raw_scores = df.values
df_null_regulation = null_regulation.fit_transform(raw_scores)

print_df(df_null_regulation)

12. 규칙적용 후 데이터 프레임 생성

from pandas import DataFrame
from Data import grade_dic
from print_df import print_df
from sklearn.impute import SimpleImputer
import numpy

# Build the table: the dict keys become columns, the names label the rows.
student_names = ['노진구', '이슬이', '비실이', '퉁퉁이', '도라에몽']
df = DataFrame(data=grade_dic, index=student_names)

# Uncomment to inspect the raw table first.
# print_df(df)

# Define the imputation rule: treat NaN as missing and replace it with
# the most frequent value of its column.
null_regulation = SimpleImputer(
    strategy='most_frequent',
    missing_values=numpy.nan,
)

# Learn the per-column fill values and apply them in one step.
# The imputer is given the bare value array, without row/column labels.
raw_scores = df.values
df_null_regulation = null_regulation.fit_transform(raw_scores)

# Wrap the imputed values in a new DataFrame, reusing the original row
# and column labels.
df2 = DataFrame(
    data=df_null_regulation,
    columns=df.columns,
    index=df.index,
)

print_df(df2)

저작자표시 비영리 동일조건