Pandas - Json Data 분석 4(Data 시각화)
1. Data Code
import json
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from print_df import print_df
# Location of the sample file; each line of it is parsed as one JSON document.
# Raw string: '\e' is not a valid escape sequence, so the non-raw literal
# triggers a SyntaxWarning on recent Python versions. The resulting value is
# unchanged (a literal backslash followed by "example.txt").
path = r'data\example.txt'

# Open the file once and close it deterministically instead of leaking two
# unclosed handles.
with open(path, encoding='utf-8') as src_file:
    data = src_file.read()  # entire file contents as a single string
    src_file.seek(0)  # rewind so the same handle can be iterated line by line
    records = [json.loads(line) for line in src_file]

# 'tz' value of every record that actually carries a 'tz' key.
time_zone = [rec['tz'] for rec in records if 'tz' in rec]
def get_count(sequence):
    """Count how many times each element occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. Keys appear in the order they were first seen.
    """
    counts = {}  # item -> number of occurrences
    for x in sequence:
        # Start from 0 for unseen items, then add this occurrence.
        counts[x] = counts.get(x, 0) + 1
    return counts
# Tally the records per time zone: key is the 'tz' string, value is its count.
counts = get_count(time_zone)
def top_counts(count_dict, n=10):
    """Return the ``n`` entries of ``count_dict`` with the highest counts.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of entries to return. Values of 0 or below yield
            an empty list.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        entry with the largest count is last. Ties on count are ordered by
        key.
    """
    # Guard: a slice of [-0:] is [0:], which would return every entry
    # instead of none.
    if n <= 0:
        return []
    # Put the count first so tuple comparison sorts by count.
    cnt_tz_pair = sorted((count, tz) for tz, count in count_dict.items())
    return cnt_tz_pair[-n:]
# print(top_counts(counts))
# Build a table from the parsed records.
frame = DataFrame(records)

# First whitespace-separated token of every non-null value in column 'a'
# (e.g. 'Mozilla/5.0').
browser = Series([x.split()[0] for x in frame.a.dropna()])

# Keep only the rows where column 'a' is present.
cframe = frame[frame.a.notnull()]

# Label each remaining row by whether its 'a' string mentions 'Windows'.
# NOTE: the name `os` would shadow the standard-library module if that module
# were ever imported in this file.
os = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'NotWindows')

# Count rows per (time zone, OS label); combinations that never occur get 0.
by_tz_os = cframe.groupby(['tz', os])
agg_counts = by_tz_os.size().unstack().fillna(0)

# Positions that would sort the time zones by total row count, ascending.
indexer = agg_counts.sum(axis=1).argsort()

# The ten time zones with the largest totals (largest last).
count_subset = agg_counts.take(indexer)[-10:]

# Divide each row by its own total so the two OS columns sum to 1 per row.
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)

plt.rcParams["font.family"] = 'NanumGothic'
plt.rcParams["font.size"] = 8
plt.rcParams["figure.figsize"] = (15, 10)

# Stacked horizontal bars of the per-time-zone Windows / NotWindows shares.
normed_subset.plot(kind='barh', stacked=True, grid=True)
plt.title('Windows / NotWindows Percent of Users')
plt.ylabel('TimeZone')
# The plotted values are normalised fractions, not raw counts.
plt.xlabel('Ratio')
plt.savefig('PercentofUsers.png', dpi=200)
plt.close()
2. Code 풀이
- Time Zone 중 America/New_York 카운트
print(counts['America/New_York'])
1251
browser = Series([x.split()[0] for x in frame.a.dropna()])
print(browser[:5])
0 Mozilla/5.0
1 GoogleMaps/RochesterNY
2 Mozilla/4.0
3 Mozilla/5.0
4 Mozilla/5.0
dtype: object
- 브라우저 종류별 카운트
print(browser.value_counts()[:8])
Mozilla/5.0 2594
Mozilla/4.0 601
GoogleMaps/RochesterNY 121
Opera/9.80 34
TEST_INTERNET_AGENT 24
GoogleProducer 21
Mozilla/6.0 5
BlackBerry8520/5.0.0.681 4
dtype: int64
- OS 종류 분류 선행 작업(Windows / NotWindows)
os = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'NotWindows')
print(os[:5])
['Windows' 'NotWindows' 'Windows' 'NotWindows' 'Windows']
by_tz_os = cframe.groupby(['tz', os])
- 합계 생성 후 출력
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])
NotWindows Windows
tz
245.0 276.0
Africa/Cairo 0.0 3.0
Africa/Casablanca 0.0 1.0
Africa/Ceuta 0.0 2.0
Africa/Johannesburg 0.0 1.0
Africa/Lusaka 0.0 1.0
America/Anchorage 4.0 1.0
America/Argentina/Buenos_Aires 1.0 0.0
America/Argentina/Cordoba 0.0 1.0
America/Argentina/Mendoza 0.0 1.0
- TimeZone 순위
indexer = agg_counts.sum(1).argsort()
print(indexer[:10])
tz
24
Africa/Cairo 20
Africa/Casablanca 21
Africa/Ceuta 92
Africa/Johannesburg 87
Africa/Lusaka 53
America/Anchorage 54
America/Argentina/Buenos_Aires 57
America/Argentina/Cordoba 26
America/Argentina/Mendoza 55
dtype: int64
- 정렬된 순서 중 마지막 10개 행 추출
count_subset = agg_counts.take(indexer)[-10:]
print(count_subset)
NotWindows Windows
tz
America/Sao_Paulo 13.0 20.0
Europe/Madrid 16.0 19.0
Pacific/Honolulu 0.0 36.0
Asia/Tokyo 2.0 35.0
Europe/London 43.0 31.0
America/Denver 132.0 59.0
America/Los_Angeles 130.0 252.0
America/Chicago 115.0 285.0
245.0 276.0
America/New_York 339.0 912.0
- Windows / Not Windows 순위 시각화 작업
# Apply the font and figure-size settings in a single call.
plt.rcParams.update({
    "font.family": 'NanumGothic',
    "font.size": 8,
    "figure.figsize": (15, 10),
})

# Stacked horizontal bars of the raw Windows / NotWindows counts per time zone.
count_subset.plot.barh(stacked=True, grid=True)

# Title and axis labels, then write the figure to disk and release it.
plt.title('Windows / NotWindows Time Zone Rank')
plt.ylabel('TimeZone')
plt.xlabel('Count')
plt.savefig('TimeZoneRank.png', dpi=200)
plt.close()
- 시각화 결과물
- 사용자 비율
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
- 사용자 비율 시각화