본문 바로가기

Python_Intermediate/Pandas

Pandas - 1880 ~ 2010 년까지 출생 자료 분석 1

1. Sample Data

1880 ~ 2010 년까지 태어난 아이의 이름 / 성별 / 출생수

yob1880.txt

yob1881.txt

yob1882.txt

yob1883.txt

yob1884.txt

yob1885.txt

yob1886.txt

yob1887.txt

yob1888.txt

yob1889.txt

yob1890.txt

yob1891.txt

yob1892.txt

yob1893.txt

yob1894.txt

yob1895.txt

yob1896.txt

yob1897.txt

yob1898.txt

yob1899.txt

yob1900.txt

yob1901.txt

yob1902.txt

yob1903.txt

yob1904.txt

yob1905.txt

yob1906.txt

yob1907.txt

yob1908.txt

yob1909.txt

yob1910.txt

yob1911.txt

yob1912.txt

yob1913.txt

yob1914.txt

yob1915.txt

yob1916.txt

yob1917.txt

yob1918.txt

yob1919.txt

yob1920.txt

yob1921.txt

yob1922.txt

yob1923.txt

yob1924.txt

yob1925.txt

yob1926.txt

yob1927.txt

yob1928.txt

yob1929.txt

yob1930.txt

yob1931.txt

yob1932.txt

yob1933.txt

yob1934.txt

yob1935.txt

yob1936.txt

yob1937.txt

yob1938.txt

yob1939.txt

yob1940.txt

yob1941.txt

yob1942.txt

yob1943.txt

yob1944.txt

yob1945.txt

yob1946.txt

yob1947.txt

yob1948.txt

yob1949.txt

yob1950.txt

yob1951.txt

yob1952.txt

yob1953.txt

yob1954.txt

yob1955.txt

yob1956.txt

yob1957.txt

yob1958.txt

yob1959.txt

yob1960.txt

yob1961.txt

yob1962.txt

yob1963.txt

yob1964.txt

yob1965.txt

yob1966.txt

yob1967.txt

yob1968.txt

yob1969.txt

yob1970.txt

yob1971.txt

yob1972.txt

yob1973.txt

yob1974.txt

yob1975.txt

yob1976.txt

yob1977.txt

yob1978.txt

yob1979.txt

yob1980.txt

yob1981.txt

yob1982.txt

yob1983.txt

yob1984.txt

yob1985.txt

yob1986.txt

yob1987.txt

yob1988.txt

yob1989.txt

yob1990.txt

yob1991.txt

yob1992.txt

yob1993.txt

yob1994.txt

yob1995.txt

yob1996.txt

yob1997.txt

yob1998.txt

yob1999.txt

yob2000.txt

yob2001.txt

yob2002.txt

yob2003.txt

yob2004.txt

yob2005.txt

yob2006.txt

yob2007.txt

yob2008.txt

yob2009.txt

yob2010.txt


2. Import Module

import pandas as pd
import matplotlib.pyplot as plt
from print_df import print_df


3. Pandas Data Code

- 일부 파일에 DataFrame 작업(파일에 컬럼명이 없으므로 header=None / 필요한 컬럼명 지정)

names1880 = pd.read_csv('data\yob1880.txt', header=None, names=['name', 'gender', 'born'], encoding='utf-8')


- DataFrame 정보 확인

print(names1880.info())

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 2000 entries, 0 to 1999

Data columns (total 3 columns):

name      2000 non-null object

gender    2000 non-null object

born      2000 non-null int64

dtypes: int64(1), object(2)

memory usage: 47.0+ KB

None


- DataFrame에 Head 부분 확인

print_df(names1880.head())

<class 'pandas.core.frame.DataFrame'>

(5, 3)

+---+-----------+--------+------+

|   |    name   | gender | born |

+---+-----------+--------+------+

| 0 |    Mary   |   F    | 7065 |

| 1 |    Anna   |   F    | 2604 |

| 2 |    Emma   |   F    | 2003 |

| 3 | Elizabeth |   F    | 1939 |

| 4 |   Minnie  |   F    | 1746 |

+---+-----------+--------+------+


- Sample Data가 1880 ~ 2010년까지 많은 Data 인데 위에 대로 하면 하나하나 변수가 늘어나므로 리스트화

years = range(1880, 2011)
df_pieces = [] # 1880 ~ 2010년 까지의 데이터 프레임을 젖아할 리스트
for year in years: # 1880 ~ 2010
path = 'data\yob%d.txt' % year # 파일 이름
df = pd.read_csv(path, header=None, names=['name', 'gender', 'born'], encoding='utf-8')
df['year'] = year # 데이터 프레임에 year 컬럼 추가
df_pieces.append(df) # 데이터 프레임들의 리스트


- df_pieces 즉, DataFrame에 길이 확인

print(len(df_pieces))

131


- 131개의 DataFrame중에 첫번째에 Head 확인

print_df(df_pieces[0].head())

<class 'pandas.core.frame.DataFrame'>

(5, 4)

+---+-----------+--------+------+------+

|   |    name   | gender | born | year |

+---+-----------+--------+------+------+

| 0 |    Mary   |   F    | 7065 | 1880 |

| 1 |    Anna   |   F    | 2604 | 1880 |

| 2 |    Emma   |   F    | 2003 | 1880 |

| 3 | Elizabeth |   F    | 1939 | 1880 |

| 4 |   Minnie  |   F    | 1746 | 1880 |

+---+-----------+--------+------+------+


- Data 분석을 위한 Concat 작업(열 방향으로 Data 병합)

names = pd.concat(df_pieces, ignore_index=True)


- Concat Data Head 확인

print_df(names.head())

<class 'pandas.core.frame.DataFrame'>

(5, 4)

+---+-----------+--------+------+------+

|   |    name   | gender | born | year |

+---+-----------+--------+------+------+

| 0 |    Mary   |   F    | 7065 | 1880 |

| 1 |    Anna   |   F    | 2604 | 1880 |

| 2 |    Emma   |   F    | 2003 | 1880 |

| 3 | Elizabeth |   F    | 1939 | 1880 |

| 4 |   Minnie  |   F    | 1746 | 1880 |

+---+-----------+--------+------+------+


- Concat Data Tail 확인

print_df(names.tail())

<class 'pandas.core.frame.DataFrame'>

(5, 4)

+---------+-----------+--------+------+------+

|         |    name   | gender | born | year |

+---------+-----------+--------+------+------+

| 1690779 |  Zymaire  |   M    |  5   | 2010 |

| 1690780 |   Zyonne  |   M    |  5   | 2010 |

| 1690781 | Zyquarius |   M    |  5   | 2010 |

| 1690782 |   Zyran   |   M    |  5   | 2010 |

| 1690783 |   Zzyzx   |   M    |  5   | 2010 |

+---------+-----------+--------+------+------+


- 남(M) / 여(F) 아이의 출산율 증감세 확인 Pivot Table 구조 생성

'''
pivot_table(분석할 데이터프레임, index=행 인덱스로 들어갈 키 열 또는 키 열의 리스트,
columns=열 인덱스로 들어갈 키 열 또는 키 열의 리스트,
aggfunc=분석 메서드)
'''
total_born = names.pivot_table('born', index='year', columns='gender', aggfunc=sum)

<class 'pandas.core.frame.DataFrame'>

(131, 2)

+------+---------+---------+

|      |    F    |    M    |

+------+---------+---------+

| 1880 |  90993  |  110493 |

| 1881 |  91955  |  100748 |

| 1882 |  107851 |  113687 |

| 1883 |  112322 |  104632 |

| 1884 |  129021 |  114445 |

| 1885 |  133056 |  107802 |

| 1886 |  144538 |  110785 |

| 1887 |  145983 |  101412 |

| 1888 |  178631 |  120857 |

| 1889 |  178369 |  110590 |

| 1890 |  190377 |  111026 |

| 1891 |  185486 |  101198 |

| 1892 |  212350 |  122038 |

| 1893 |  212908 |  112319 |

| 1894 |  222923 |  115775 |

| 1895 |  233632 |  117398 |

| 1896 |  237924 |  119575 |

| 1897 |  234199 |  112760 |

| 1898 |  258771 |  122703 |

| 1899 |  233022 |  106218 |

| 1900 |  299873 |  150554 |

| 1901 |  239351 |  106478 |

| 1902 |  264079 |  122660 |

| 1903 |  261976 |  119240 |

| 1904 |  275375 |  128129 |

| 1905 |  291641 |  132319 |

| 1906 |  295301 |  133159 |

| 1907 |  318558 |  146838 |

| 1908 |  334277 |  154339 |

| 1909 |  347191 |  163983 |

| 1910 |  396416 |  194198 |

| 1911 |  418180 |  225936 |

| 1912 |  557939 |  429926 |

| 1913 |  624317 |  512482 |

| 1914 |  761376 |  654746 |

| 1915 |  983824 |  848647 |

| 1916 | 1044249 |  890142 |

| 1917 | 1081194 |  925512 |

| 1918 | 1157585 | 1013720 |

| 1919 | 1130149 |  980215 |

| 1920 | 1198214 | 1064468 |

| 1921 | 1232845 | 1101374 |

| 1922 | 1200796 | 1088380 |

| 1923 | 1206239 | 1096227 |

| 1924 | 1248821 | 1132671 |

| 1925 | 1217217 | 1115798 |

| 1926 | 1185078 | 1110440 |

| 1927 | 1192207 | 1126259 |

| 1928 | 1152836 | 1107113 |

| 1929 | 1116284 | 1074833 |

| 1930 | 1125521 | 1096663 |

| 1931 | 1064233 | 1038586 |

| 1932 | 1066930 | 1043512 |

| 1933 | 1007523 |  990677 |

| 1934 | 1043879 | 1031962 |

| 1935 | 1048264 | 1040649 |

| 1936 | 1040068 | 1036662 |

| 1937 | 1063722 | 1065964 |

| 1938 | 1103173 | 1108480 |

| 1939 | 1096394 | 1106328 |

| 1940 | 1143281 | 1158668 |

| 1941 | 1207799 | 1227621 |

| 1942 | 1350524 | 1380468 |

| 1943 | 1394891 | 1426400 |

| 1944 | 1327077 | 1362379 |

| 1945 | 1307299 | 1346227 |

| 1946 | 1570555 | 1622540 |

| 1947 | 1772246 | 1827415 |

| 1948 | 1697664 | 1753472 |

| 1949 | 1709607 | 1770104 |

| 1950 | 1713001 | 1789936 |

| 1951 | 1798537 | 1877553 |

| 1952 | 1854427 | 1943533 |

| 1953 | 1880072 | 1968772 |

| 1954 | 1941562 | 2036948 |

| 1955 | 1954508 | 2057390 |

| 1956 | 2007416 | 2113067 |

| 1957 | 2043983 | 2155427 |

| 1958 | 2010620 | 2120014 |

| 1959 | 2022889 | 2133147 |

| 1960 | 2022062 | 2132588 |

| 1961 | 2016762 | 2121629 |

| 1962 | 1966340 | 2068493 |

| 1963 | 1927055 | 2031443 |

| 1964 | 1894277 | 1992762 |

| 1965 | 1764815 | 1861138 |

| 1966 | 1691650 | 1783577 |

| 1967 | 1650576 | 1744222 |

| 1968 | 1639977 | 1738585 |

| 1969 | 1686705 | 1789326 |

| 1970 | 1747811 | 1858782 |

| 1971 | 1663043 | 1768325 |

| 1972 | 1521044 | 1622178 |

| 1973 | 1457913 | 1558562 |

| 1974 | 1467227 | 1572415 |

| 1975 | 1457349 | 1561378 |

| 1976 | 1464852 | 1569098 |

| 1977 | 1532574 | 1642747 |

| 1978 | 1531242 | 1641143 |

| 1979 | 1604588 | 1720781 |

| 1980 | 1659259 | 1782388 |

| 1981 | 1666833 | 1789568 |

| 1982 | 1692036 | 1812642 |

| 1983 | 1669486 | 1790670 |

| 1984 | 1682396 | 1802735 |

| 1985 | 1719450 | 1846162 |

| 1986 | 1714053 | 1839442 |

| 1987 | 1737508 | 1865113 |

| 1988 | 1779112 | 1911858 |

| 1989 | 1843057 | 1999840 |

| 1990 | 1897256 | 2052070 |

| 1991 | 1874110 | 2019018 |

| 1992 | 1842818 | 1995760 |

| 1993 | 1807795 | 1959712 |

| 1994 | 1784407 | 1930363 |

| 1995 | 1757240 | 1902100 |

| 1996 | 1751681 | 1892700 |

| 1997 | 1739331 | 1883571 |

| 1998 | 1765390 | 1909676 |

| 1999 | 1772139 | 1918267 |

| 2000 | 1813960 | 1961702 |

| 2001 | 1798284 | 1940498 |

| 2002 | 1794358 | 1938941 |

| 2003 | 1824406 | 1972439 |

| 2004 | 1833005 | 1981557 |

| 2005 | 1843890 | 1993285 |

| 2006 | 1896468 | 2050234 |

| 2007 | 1916888 | 2069242 |

| 2008 | 1883645 | 2032310 |

| 2009 | 1827643 | 1973359 |

| 2010 | 1759010 | 1898382 |

+------+---------+---------+


- Data 시각화 작업

plt.rcParams["font.family"] = 'NanumGothic'
plt.rcParams["font.size"] = 8
plt.rcParams["figure.figsize"] = (15, 10)

total_born.plot()
plt.title('F / M Total Born')
plt.ylabel('Count')
plt.xlabel('Year')
plt.grid()
plt.savefig('total_born.png', dpi=200)
plt.close()