Pandas - 1880 ~ 2010 년까지 출생 자료 분석 2
1. Sample Data
1880 ~ 2010 년까지 태어난 아이의 이름 / 성별 / 출생수
https://developer-ankiwoong.tistory.com/268
2. Import Module
import pandas as pd
import matplotlib.pyplot as plt
from print_df import print_df
3. Pandas Data Code
- TXT 파일 읽어서 DataBase화
years = range(1880, 2011)
df_pieces = [] # 1880 ~ 2010년 까지의 데이터 프레임을 젖아할 리스트
for year in years: # 1880 ~ 2010
path = 'data\yob%d.txt' % year # 파일 이름
df = pd.read_csv(path, header=None, names=['name', 'gender', 'born'], encoding='utf-8')
df['year'] = year # 데이터 프레임에 year 컬럼 추가
df_pieces.append(df) # 데이터 프레임들의 리스트
- 데이터 Concat 작업
names = pd.concat(df_pieces, ignore_index=True)
- 이름 트렌드 분석을 위한 Top100 함수 생성
def get_top100(group):
return group.sort_values(by='born', ascending=False)[:100]
- 연도 / 성별을 기준으로 그룹화 출력
grouped = names.groupby(['year', 'gender'])
top100 = grouped.apply(get_top100)
name gender born year
year gender
1880 F 0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
5 Margaret F 1578 1880
6 Ida F 1472 1880
7 Alice F 1414 1880
8 Bertha F 1320 1880
9 Sarah F 1288 1880
10 Annie F 1258 1880
11 Clara F 1226 1880
12 Ella F 1156 1880
13 Florence F 1063 1880
14 Cora F 1045 1880
15 Martha F 1040 1880
16 Laura F 1012 1880
17 Nellie F 995 1880
18 Grace F 982 1880
19 Carrie F 949 1880
20 Maude F 858 1880
21 Mabel F 808 1880
22 Bessie F 794 1880
23 Jennie F 793 1880
24 Gertrude F 787 1880
25 Julia F 783 1880
26 Hattie F 769 1880
27 Edith F 768 1880
28 Mattie F 704 1880
29 Rose F 700 1880
... ... ... ... ...
2010 M 1676714 Xavier M 5701 2010
1676715 Ian M 5510 2010
1676716 Colton M 5270 2010
1676717 Dominic M 5260 2010
1676718 Juan M 5217 2010
1676719 Cooper M 5206 2010
1676720 Josiah M 5138 2010
1676721 Luis M 5104 2010
1676722 Ayden M 5096 2010
1676723 Carson M 5064 2010
1676724 Adam M 5062 2010
1676725 Nathaniel M 5039 2010
1676726 Brody M 5015 2010
1676727 Tristan M 4854 2010
1676728 Diego M 4693 2010
1676729 Parker M 4687 2010
1676730 Blake M 4666 2010
1676731 Oliver M 4632 2010
1676732 Cole M 4562 2010
1676733 Carlos M 4559 2010
1676734 Jaden M 4468 2010
1676735 Jesus M 4425 2010
1676736 Alex M 4409 2010
1676737 Aidan M 4263 2010
1676738 Eric M 4163 2010
1676739 Hayden M 4151 2010
1676740 Bryan M 3914 2010
1676741 Max M 3819 2010
1676742 Jaxon M 3802 2010
1676743 Brian M 3744 2010
[26200 rows x 4 columns]
- 남자 정보 추출
boys = top100[top100.gender == 'M']
name gender born year
100 John M 9655 1880
101 William M 9533 1880
102 James M 5927 1880
103 Charles M 5348 1880
104 George M 5126 1880
105 Frank M 3242 1880
106 Joseph M 2632 1880
107 Thomas M 2534 1880
108 Henry M 2444 1880
109 Robert M 2416 1880
110 Edward M 2364 1880
111 Harry M 2152 1880
112 Walter M 1755 1880
113 Arthur M 1599 1880
114 Fred M 1569 1880
115 Albert M 1493 1880
116 Samuel M 1024 1880
117 David M 869 1880
118 Louis M 828 1880
119 Joe M 731 1880
120 Charlie M 730 1880
121 Clarence M 730 1880
122 Richard M 728 1880
123 Andrew M 644 1880
124 Daniel M 643 1880
125 Ernest M 615 1880
126 Will M 588 1880
127 Jesse M 569 1880
128 Oscar M 544 1880
129 Lewis M 517 1880
... ... ... ... ...
26170 Xavier M 5701 2010
26171 Ian M 5510 2010
26172 Colton M 5270 2010
26173 Dominic M 5260 2010
26174 Juan M 5217 2010
26175 Cooper M 5206 2010
26176 Josiah M 5138 2010
26177 Luis M 5104 2010
26178 Ayden M 5096 2010
26179 Carson M 5064 2010
26180 Adam M 5062 2010
26181 Nathaniel M 5039 2010
26182 Brody M 5015 2010
26183 Tristan M 4854 2010
26184 Diego M 4693 2010
26185 Parker M 4687 2010
26186 Blake M 4666 2010
26187 Oliver M 4632 2010
26188 Cole M 4562 2010
26189 Carlos M 4559 2010
26190 Jaden M 4468 2010
26191 Jesus M 4425 2010
26192 Alex M 4409 2010
26193 Aidan M 4263 2010
26194 Eric M 4163 2010
26195 Hayden M 4151 2010
26196 Bryan M 3914 2010
26197 Max M 3819 2010
26198 Jaxon M 3802 2010
26199 Brian M 3744 2010
[13100 rows x 4 columns]
- 1900년대 남자 정보만 추출
df_boy = boys[boys.year == 1900]
+------+-----------+--------+------+------+
| | name | gender | born | year |
+------+-----------+--------+------+------+
| 4100 | John | M | 9834 | 1900 |
| 4101 | William | M | 8580 | 1900 |
| 4102 | James | M | 7246 | 1900 |
| 4103 | George | M | 5405 | 1900 |
| 4104 | Charles | M | 4102 | 1900 |
| 4105 | Robert | M | 3827 | 1900 |
| 4106 | Joseph | M | 3714 | 1900 |
| 4107 | Frank | M | 3477 | 1900 |
| 4108 | Edward | M | 2723 | 1900 |
| 4109 | Henry | M | 2606 | 1900 |
| 4110 | Thomas | M | 2557 | 1900 |
| 4111 | Walter | M | 2298 | 1900 |
| 4112 | Harry | M | 2272 | 1900 |
| 4113 | Willie | M | 2113 | 1900 |
| 4114 | Arthur | M | 1853 | 1900 |
| 4115 | Albert | M | 1742 | 1900 |
| 4116 | Fred | M | 1646 | 1900 |
| 4117 | Clarence | M | 1483 | 1900 |
| 4118 | Paul | M | 1288 | 1900 |
| 4119 | Harold | M | 1246 | 1900 |
| 4120 | Roy | M | 1215 | 1900 |
| 4121 | Joe | M | 1180 | 1900 |
| 4122 | Raymond | M | 1150 | 1900 |
| 4123 | Richard | M | 1143 | 1900 |
| 4124 | Charlie | M | 1124 | 1900 |
| 4125 | Louis | M | 1099 | 1900 |
| 4126 | Jack | M | 1053 | 1900 |
| 4127 | Earl | M | 1037 | 1900 |
| 4128 | Carl | M | 1016 | 1900 |
| 4129 | Ernest | M | 1012 | 1900 |
| 4130 | Ralph | M | 896 | 1900 |
| 4131 | David | M | 889 | 1900 |
| 4132 | Samuel | M | 877 | 1900 |
| 4133 | Sam | M | 821 | 1900 |
| 4134 | Howard | M | 781 | 1900 |
| 4135 | Herbert | M | 751 | 1900 |
| 4136 | Andrew | M | 718 | 1900 |
| 4137 | Elmer | M | 699 | 1900 |
| 4138 | Lee | M | 675 | 1900 |
| 4139 | Lawrence | M | 659 | 1900 |
| 4140 | Francis | M | 652 | 1900 |
| 4141 | Alfred | M | 640 | 1900 |
| 4142 | Will | M | 603 | 1900 |
| 4143 | Daniel | M | 601 | 1900 |
| 4144 | Eugene | M | 587 | 1900 |
| 4145 | Leo | M | 570 | 1900 |
| 4146 | Oscar | M | 563 | 1900 |
| 4147 | Floyd | M | 555 | 1900 |
| 4148 | Herman | M | 550 | 1900 |
| 4149 | Jesse | M | 536 | 1900 |
| 4150 | Michael | M | 499 | 1900 |
| 4151 | Tom | M | 470 | 1900 |
| 4152 | Lewis | M | 469 | 1900 |
| 4153 | Leonard | M | 465 | 1900 |
| 4154 | Ray | M | 457 | 1900 |
| 4155 | Benjamin | M | 451 | 1900 |
| 4156 | Clyde | M | 451 | 1900 |
| 4157 | Peter | M | 443 | 1900 |
| 4158 | Claude | M | 433 | 1900 |
| 4159 | Lester | M | 432 | 1900 |
| 4160 | Theodore | M | 429 | 1900 |
| 4161 | Russell | M | 427 | 1900 |
| 4162 | Eddie | M | 426 | 1900 |
| 4163 | Frederick | M | 420 | 1900 |
| 4164 | Leroy | M | 419 | 1900 |
| 4165 | Clifford | M | 409 | 1900 |
| 4166 | Anthony | M | 406 | 1900 |
| 4167 | Jim | M | 399 | 1900 |
| 4168 | Jessie | M | 397 | 1900 |
| 4169 | Martin | M | 383 | 1900 |
| 4170 | Edgar | M | 374 | 1900 |
| 4171 | Chester | M | 364 | 1900 |
| 4172 | Ben | M | 355 | 1900 |
| 4173 | Edwin | M | 350 | 1900 |
| 4174 | Dewey | M | 345 | 1900 |
| 4175 | Cecil | M | 344 | 1900 |
| 4176 | Stanley | M | 342 | 1900 |
| 4177 | Lloyd | M | 337 | 1900 |
| 4178 | Donald | M | 328 | 1900 |
| 4179 | Homer | M | 326 | 1900 |
| 4180 | Harvey | M | 322 | 1900 |
| 4181 | Luther | M | 316 | 1900 |
| 4182 | Norman | M | 316 | 1900 |
| 4183 | Johnnie | M | 313 | 1900 |
| 4184 | Leon | M | 310 | 1900 |
| 4185 | Bernard | M | 308 | 1900 |
| 4186 | Ed | M | 297 | 1900 |
| 4187 | Hugh | M | 290 | 1900 |
| 4188 | Patrick | M | 285 | 1900 |
| 4189 | Kenneth | M | 279 | 1900 |
| 4190 | Leslie | M | 275 | 1900 |
| 4191 | Victor | M | 269 | 1900 |
| 4192 | Alexander | M | 261 | 1900 |
| 4193 | Philip | M | 259 | 1900 |
| 4194 | Oliver | M | 256 | 1900 |
| 4195 | Mack | M | 250 | 1900 |
| 4196 | Horace | M | 244 | 1900 |
| 4197 | Milton | M | 242 | 1900 |
| 4198 | Guy | M | 240 | 1900 |
| 4199 | Everett | M | 237 | 1900 |
+------+-----------+--------+------+------+
- 여자 정보 추출
girl = top100[top100.gender == 'F']
name gender born year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
5 Margaret F 1578 1880
6 Ida F 1472 1880
7 Alice F 1414 1880
8 Bertha F 1320 1880
9 Sarah F 1288 1880
10 Annie F 1258 1880
11 Clara F 1226 1880
12 Ella F 1156 1880
13 Florence F 1063 1880
14 Cora F 1045 1880
15 Martha F 1040 1880
16 Laura F 1012 1880
17 Nellie F 995 1880
18 Grace F 982 1880
19 Carrie F 949 1880
20 Maude F 858 1880
21 Mabel F 808 1880
22 Bessie F 794 1880
23 Jennie F 793 1880
24 Gertrude F 787 1880
25 Julia F 783 1880
26 Hattie F 769 1880
27 Edith F 768 1880
28 Mattie F 704 1880
29 Rose F 700 1880
... ... ... ... ...
26070 Destiny F 3689 2010
26071 Brooke F 3683 2010
26072 Trinity F 3646 2010
26073 Faith F 3633 2010
26074 Lucy F 3604 2010
26075 Madelyn F 3593 2010
26076 Madeline F 3574 2010
26077 Bailey F 3563 2010
26078 Payton F 3561 2010
26079 Andrea F 3547 2010
26080 Autumn F 3476 2010
26081 Melanie F 3474 2010
26082 Ariana F 3446 2010
26083 Serenity F 3437 2010
26084 Stella F 3407 2010
26085 Maria F 3384 2010
26086 Molly F 3338 2010
26087 Caroline F 3325 2010
26088 Genesis F 3294 2010
26089 Kaitlyn F 3264 2010
26090 Eva F 3247 2010
26091 Jessica F 3161 2010
26092 Angelina F 3110 2010
26093 Valeria F 3106 2010
26094 Gabrielle F 3104 2010
26095 Naomi F 3097 2010
26096 Mariah F 3070 2010
26097 Natalia F 3025 2010
26098 Paige F 2993 2010
26099 Rachel F 2962 2010
[13100 rows x 4 columns]
- 인덱스 제거 후 재그룹화
top100.reset_index(inplace=True, drop=True)
name gender born year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
5 Margaret F 1578 1880
6 Ida F 1472 1880
7 Alice F 1414 1880
8 Bertha F 1320 1880
9 Sarah F 1288 1880
10 Annie F 1258 1880
11 Clara F 1226 1880
12 Ella F 1156 1880
13 Florence F 1063 1880
14 Cora F 1045 1880
15 Martha F 1040 1880
16 Laura F 1012 1880
17 Nellie F 995 1880
18 Grace F 982 1880
19 Carrie F 949 1880
20 Maude F 858 1880
21 Mabel F 808 1880
22 Bessie F 794 1880
23 Jennie F 793 1880
24 Gertrude F 787 1880
25 Julia F 783 1880
26 Hattie F 769 1880
27 Edith F 768 1880
28 Mattie F 704 1880
29 Rose F 700 1880
... ... ... ... ...
26170 Xavier M 5701 2010
26171 Ian M 5510 2010
26172 Colton M 5270 2010
26173 Dominic M 5260 2010
26174 Juan M 5217 2010
26175 Cooper M 5206 2010
26176 Josiah M 5138 2010
26177 Luis M 5104 2010
26178 Ayden M 5096 2010
26179 Carson M 5064 2010
26180 Adam M 5062 2010
26181 Nathaniel M 5039 2010
26182 Brody M 5015 2010
26183 Tristan M 4854 2010
26184 Diego M 4693 2010
26185 Parker M 4687 2010
26186 Blake M 4666 2010
26187 Oliver M 4632 2010
26188 Cole M 4562 2010
26189 Carlos M 4559 2010
26190 Jaden M 4468 2010
26191 Jesus M 4425 2010
26192 Alex M 4409 2010
26193 Aidan M 4263 2010
26194 Eric M 4163 2010
26195 Hayden M 4151 2010
26196 Bryan M 3914 2010
26197 Max M 3819 2010
26198 Jaxon M 3802 2010
26199 Brian M 3744 2010
[26200 rows x 4 columns]
- 그룹을 가지고 피봇 테이블 구조 생성
total_born_name = top100.pivot_table('born', index='year', columns='name', aggfunc=sum)
- 부분집합 생성
subset = total_born_name[['Mary', 'Anna', 'John', 'William']]
name Mary Anna John William
year
1880 7065.0 2604.0 9655.0 9533.0
1881 6919.0 2698.0 8769.0 8524.0
1882 8149.0 3143.0 9557.0 9298.0
1883 8012.0 3306.0 8894.0 8387.0
1884 9217.0 3860.0 9387.0 8897.0
1885 9128.0 3994.0 8756.0 8044.0
1886 9891.0 4283.0 9026.0 8252.0
1887 9888.0 4227.0 8109.0 7470.0
1888 11754.0 4982.0 9248.0 8704.0
1889 11649.0 5062.0 8548.0 7772.0
1890 12078.0 5233.0 8502.0 7494.0
1891 11704.0 5099.0 7681.0 6763.0
1892 13174.0 5542.0 9039.0 7782.0
1893 12784.0 5695.0 8049.0 7223.0
1894 13151.0 5565.0 8238.0 7275.0
1895 13446.0 5949.0 8321.0 7277.0
1896 13811.0 5860.0 8139.0 7747.0
1897 13412.0 5429.0 7550.0 7199.0
1898 14406.0 5773.0 8160.0 7399.0
1899 13172.0 5115.0 6990.0 6086.0
1900 16710.0 6115.0 9834.0 8580.0
1901 13137.0 4923.0 6899.0 5990.0
1902 14485.0 5288.0 7908.0 6616.0
1903 14275.0 5098.0 7609.0 6311.0
1904 14962.0 5330.0 8108.0 6416.0
1905 16067.0 5424.0 8059.0 6496.0
1906 16371.0 5502.0 8263.0 6567.0
1907 17579.0 5575.0 8983.0 6905.0
1908 18664.0 5858.0 9342.0 7527.0
1909 19256.0 5803.0 9591.0 7914.0
... ... ... ... ...
1981 11030.0 5182.0 34857.0 24774.0
1982 10847.0 5260.0 34672.0 25584.0
1983 9889.0 5219.0 33139.0 25370.0
1984 9285.0 5302.0 32580.0 24867.0
1985 9235.0 5879.0 31479.0 24605.0
1986 8499.0 5894.0 30173.0 24311.0
1987 8394.0 6160.0 29526.0 24192.0
1988 8505.0 6426.0 29380.0 24103.0
1989 8641.0 6887.0 29830.0 24678.0
1990 8651.0 7283.0 29064.0 24878.0
1991 8743.0 7104.0 27782.0 23861.0
1992 8430.0 6830.0 26218.0 23060.0
1993 8089.0 6796.0 24975.0 22192.0
1994 7722.0 7516.0 24168.0 21483.0
1995 7408.0 8544.0 23209.0 20142.0
1996 6926.0 8543.0 22144.0 20533.0
1997 6616.0 8330.0 21364.0 19999.0
1998 6415.0 8365.0 20593.0 20822.0
1999 6350.0 9081.0 20328.0 20701.0
2000 6174.0 10569.0 20066.0 20643.0
2001 5715.0 10564.0 18869.0 20087.0
2002 5439.0 10372.0 17429.0 20103.0
2003 4996.0 9429.0 17206.0 19976.0
2004 4792.0 9510.0 16429.0 20213.0
2005 4439.0 9085.0 15747.0 19025.0
2006 4073.0 8590.0 15140.0 18915.0
2007 3665.0 7866.0 14405.0 18839.0
2008 3478.0 7236.0 13273.0 18337.0
2009 NaN 6755.0 12048.0 17852.0
2010 NaN 6242.0 11424.0 16870.0
[131 rows x 4 columns]
plt.rcParams["font.family"] = 'NanumGothic'
plt.rcParams["font.size"] = 8
plt.rcParams["figure.figsize"] = (15, 10)
subset.plot(subplots=True, grid=True, title='Name Trends')
plt.savefig('NameTrends.png', dpi=200)
plt.close()