본문 바로가기
Python 과 머신러닝/II. 데이터처리 문법

[Python 머신러닝] 3장. 그룹화 (group by & apply)

by JoyfulS 2019. 10. 18.

< 공부한 내용 >

1. group by (그룹화)

2. apply (그룹에 함수 적용)

 

1. group by

"""
DataFrame 그룹화
 - DF 객체 대상 특정 칼럼으로 그룹화
   형식) DF.groupby('집단변수').수학통계()
"""

import pandas as pd

iris.csv
0.00MB

# Load the iris data (path is relative to the working directory) and print its column summary
iris = pd.read_csv("../data/iris.csv")
iris.info()

 

# 1. One grouping variable -> group object covering all columns

iris_g = iris.groupby('Species')
iris_g  # evaluating the group object only shows its object info
iris_g.size()  # number of rows in each group
'''
setosa        50
versicolor    50
virginica     50
'''
# Per-group mean of each column
iris_g.mean()
'''
            Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
Species                                                         
setosa             5.006        3.428         1.462        0.246
versicolor         5.936        2.770         4.260        1.326
virginica          6.588        2.974         5.552        2.026
'''

# Per-group standard deviation
iris_g.std()

# Per-group visualization (boxplot)
iris_g.boxplot(figsize = (20, 10))

 

# 2. One grouping variable -> group only two selected columns

# Form) DF[['col1', 'col2']].groupby(DF['grouping variable'])
iris_g2 = iris[['Sepal.Length', 'Petal.Length']].groupby(iris['Species'])
iris_g2.size()

iris_g2.mean()


# wine dataset

winequality-both.csv
0.37MB

# Load the combined red/white wine quality data and print its column summary
wine = pd.read_csv("../data/winequality-both.csv")
wine.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
type                    6497 non-null object
fixed acidity           6497 non-null float64
'''

 

# 칼럼명에 공백 또는 점(.)이 포함된 경우 언더바(_)로 교체한 후 데이터를 다루는 것이 좋음
# 나중에 공백을 기준으로 나뉘어 두 개의 데이터로 인식할 가능성도 있기 때문
# Rebuild the column names with every space turned into an underscore
wine.columns = [name.replace(' ', '_') for name in wine.columns]
wine.columns

wine['type'].value_counts()  # how often each value of the grouping variable occurs
'''
white    4898
red      1599
'''

# Subset by the grouping variable: a boolean mask may be used in the row position of .loc
red_wine = wine.loc[wine['type'].eq('red')]
red_wine.info()
'''
Int64Index: 1599 entries, 0 to 1598
Data columns (total 13 columns):
'''

white_wine = wine.loc[wine['type'].eq('white')]
white_wine.info()
'''
Int64Index: 4898 entries, 1599 to 6496
Data columns (total 13 columns):
'''

# quality values for each wine type
red_quality = wine.loc[wine['type'].eq('red'), 'quality']
white_quality = wine.loc[wine['type'].eq('white'), 'quality']

# Only the single 'quality' column is selected
type(red_quality)  # Series - 1-D
type(white_quality)  # Series - 1-D


# 3. Two grouping variables -> group the remaining columns

# Form) DF.groupby(['col1', 'col2'])

# Check the categories
wine['type'].unique() # ['red', 'white']
wine['quality'].unique() # [5, 6, 7, 4, 8, 3, 9]

wine_g = wine.groupby(['type', 'quality'])
wine_g
wine_g_size = wine_g.size()
wine_g_size
'''
*** type과 quality는 index에 해당하고 데이터는 오른쪽에 한 줄만 해당
type   quality
red     3            10
         4            53
         5           681
         6           638
         7           199
         8            18
white  3            20
         4           163
         5          1457
         6          2198
         7           880
         8           175
         9             5
'''
type(wine_g_size) # Series - 1-D

# long (1-D) -> wide (2-D): unstack() moves the inner index level (quality) into the columns
wine_g_df = wine_g_size.unstack()
wine_g_df
'''
quality     3      4       5       6      7      8    9
type                                                   
red      10.0   53.0   681.0   638.0  199.0   18.0  NaN
white    20.0  163.0  1457.0  2198.0  880.0  175.0  5.0
'''
type(wine_g_df) # DataFrame - 2차원


# Cross-tabulation: a type/quality combination that never occurs is counted as 0 instead of NaN
pd.crosstab(wine['type'], wine['quality'])
'''
quality   3    4     5     6    7    8  9
type                                     
red      10   53   681   638  199   18  0
white    20  163  1457  2198  880  175  5
'''

# Stacked horizontal bar chart: stacked=True places the quality counts end to end
# inside one bar per wine type (without it pandas draws side-by-side bars)
wine_g_df.plot(kind='barh', title = 'red vs white wine quality', stacked=True)

 

2. apply

"""
1. group 객체 대상으로 외부 함수 적용
   agg(), apply()
2. data 정규화     
"""

import pandas as pd

tips.csv
0.01MB


# Load the tips data and print its column summary
tips = pd.read_csv("../data/tips.csv")
tips.info()

# Derived variable: tip as a fraction of the total bill
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips.info()

# Rename 'sex' to 'gender': pop() removes the column and returns it, and the
# assignment re-adds it under the new name as the last column
tips['gender'] = tips.pop('sex')
tips.info()

tips.head()

# Build groups by gender and smoker
# DF.groupby(['col1', 'col2'])

tips_g = tips.groupby(['gender', 'smoker'])
tips_g_size = tips_g.size()
tips_g_size
'''
gender  smoker
Female  No        54
           Yes       33
Male    No        97
           Yes       60
'''

# Per-group summary statistics for the whole group object
tips_g.describe()

# Per-group summary statistics for one selected column
tips_g['tip_pct'].describe()

# 1. agg(), apply()

'''
group.agg(['func1', 'func2', ... , 'funcn'])
group.apply('func1')
'''

# group.agg(): takes a list of functions (pandas function names or callables)
# and returns one result column per function.
tips_g['tip_pct'].agg(['var', 'mean', 'max', 'min'])

# group.apply(): takes a single callable and calls it once per group. The
# callable may be a Python built-in (sum, max), a library function
# (statistics.mean below) or a user-defined function.
tips_g['tip_pct'].apply(sum)
tips_g['tip_pct'].apply(max)

from statistics import mean
tips_g['tip_pct'].apply(mean)


# 2. data 정규화 : 일정한 범위로 맞추는 작업

from numpy import min, max

def normal(x):
    """Min-max scale ``x`` so that its values fall in the range [0, 1].

    Each value is mapped to ``(value - min) / (max - min)``.

    Args:
        x: A list or tuple of numbers, a NumPy array, or a pandas Series.
            Lists and tuples are converted to a NumPy array first; any other
            input is used as-is, so a Series comes back as a Series.

    Returns:
        The scaled values, one per input value. When every value of ``x`` is
        identical (max == min) all results are 0.0 instead of the NaN that a
        0/0 division would produce.
    """
    # Imported locally so the function does not rely on a module-level
    # ``from numpy import min, max`` having replaced the built-ins.
    import numpy as np

    if isinstance(x, (list, tuple)):
        x = np.asarray(x)

    lowest = np.min(x)  # computed once and reused for both terms
    span = np.max(x) - lowest
    # Constant input: divide by 1 so the result is all zeros. Only a scalar
    # span is checked; a non-scalar span is passed through unchanged.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return (x - lowest) / span

# Normalize 1-D data
x = [10, 200005, 0, -105, 0.15]
normal(x)
# [5.74683924e-04, 1.00000000e+00, 5.24711409e-04, 0.00000000e+00, 5.25460996e-04]

# 2차원 data 정규화

iris.csv
0.00MB

# Reload iris and list its column names
iris = pd.read_csv("../data/iris.csv")
cols = iris.columns.tolist()
cols # ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

# Keep only the first four columns (the measurements; Species is left out)
iris_x = iris.loc[:, cols[:4]]

# DataFrame.apply passes each column to normal() in turn
iris_nor = iris_x.apply(normal)
iris_nor

iris_nor.describe() # summary statistics

 

---------------------------------------------------------- example ----------------------------------------------------------

exam01.py
0.00MB
movie_rating.csv
0.00MB

댓글