# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` come from the public IPython.display module; importing
# `display` through IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

Matrix Factorization을 통한 상품 추천¶

1. 개요¶

목적

상품 구매 데이터 탐색
상품 추천 모델을 만들어 고객 만족도 증대 및 매출 증대

작업 기간

2020.12.15. ~ 2020.12.29.

데이터 출처

쇼핑몰 주문 데이터

참고

Collaborative Filtering for Implicit Feedback Datasets
https://yeomko.tistory.com/5
https://velog.io/@vvakki_/series/Recommendation-System
https://medium.com/code-states/%EC%B6%94%EC%B2%9C-%EC%8B%9C%EC%8A%A4%ED%85%9C-%EC%95%8C%EA%B3%A0%EB%A6%AC%EC%A6%98-4e5044960bdd

분석 요약¶

상품 구매 패턴
- 800명의 고객 중 상위 10명이 전체 구매개수의 23%를 발생시킴 -> 핵심 고객층
- 155개의 상품 중 상위 10개가 전체 판매개수의 69% 차지 -> 핵심 상품

상품 추천 모델을 만들어 고객 만족도 증대 및 매출 증대
- 5개 이상 구매한 고객들에게 적용할 수 있는 모델 구성

향후 개선 방안
- 고객들을 세그멘테이션하여 고객 유형별로 적합한 추천 모델 생성 가능
- 5개 미만 구매 고객들에게는 판매순위별로 상품을 추천해주는 것도 좋을 것으로 생각

2. 탐색¶

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"


# Load the order data; the CSV file is decoded as cp949.
tbl_order = pd.read_csv("order_data.csv", encoding="cp949")

2.1. 데이터 개요¶

칼럼 정보¶

tbl_order.head()

User: 유저 고유 번호
Product: 상품 고유 번호
cnt: 상품 구매 개수

-> cnt는 왓챠 별점 같은 Explicit Data가 아니라, Implicit Data임을 고려하여 모델 선정 필요

결측치 및 자료형¶

tbl_order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   User     1953 non-null   int64
 1   Product  1953 non-null   int64
 2   cnt      1953 non-null   int64
dtypes: int64(3)
memory usage: 45.9 KB

-> 결측치 없음
-> User, Product 자료형을 Category로 변환 필요

2.2. 변수 탐색¶

User¶

len(tbl_order["User"].unique())

800

-> 고유유저수 800

Product¶

len(tbl_order["Product"].unique())

155

-> 고유상품수 155

cnt¶

tbl_order["cnt"].describe()

count    1953.000000
mean        3.510497
std         7.283488
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max        92.000000
Name: cnt, dtype: float64

-> cnt를 봤을 때, 상품은 최소 1개부터 최대 92개까지 판매됨
-> 평균 3.5개 판매되고, 75% 분위수가 3이므로 데이터의 75% 이상은 cnt가 3 이하

# Plot how often each purchase quantity occurs. value_counts() orders its rows
# by frequency, so sort by the cnt value first; otherwise the line is drawn
# through the points in frequency order and zigzags along the x-axis.
tbl_order["cnt"].value_counts().sort_index().plot();

최다 구매자¶

def top10(data, index):
    """Return the ten groups with the largest summed ``cnt``.

    ``data`` is pivoted on the column named by ``index`` with ``cnt`` summed
    per group. The resulting one-column frame is sorted by ``cnt`` in
    descending order and truncated to its first ten rows.
    """
    totals = data.pivot_table(index=index, values="cnt", aggfunc="sum")
    ranked = totals.sort_values(by="cnt", ascending=False)
    return ranked.head(10)

top10(tbl_order, "User")

# Average, across users, of each user's mean `cnt` per order row
# (aggfunc="mean" gives the per-user mean; the trailing .mean() averages those).
# NOTE(review): this is not the total quantity bought per user; that would
# need aggfunc="sum".
tbl_order.pivot_table(
    index="User",
    values="cnt",
    aggfunc="mean",
).mean()

cnt    2.597855
dtype: float64

-> 유저별 평균 cnt(주문 1건당 평균 구매개수)의 평균은 약 2.6개이고, 최다 구매자는 311개, 구매개수 2~10위 고객은 약 100 ~ 200개의 상품 구매

# Share of the total purchased quantity that comes from the ten users
# returned by top10(), printed with two decimals.
print("%.2f"%(top10(tbl_order, "User")["cnt"].sum() / tbl_order["cnt"].sum()))

0.23

-> 800명의 고객 중 상위 10명이 전체 구매개수의 23%를 발생시킴
-> 판매 금액까지 살펴봐야겠지만, 이들을 핵심 고객층으로 분류할 수 있음

3. 전처리¶

# Type conversion: treat the user and product ids as categories.
tbl_order["User"] = tbl_order["User"].astype("category")
tbl_order["Product"] = tbl_order["Product"].astype("category")

# Drop products and users whose summed cnt is 5 or less (strict `> 5` filter);
# very sparse rows/columns hurt the model.
# NOTE(review): the write-up describes the cutoff as "5 or more" -- confirm
# whether `>= 5` was intended before changing the threshold.
pro_cnt_pivot = tbl_order.pivot_table(
    index="Product",
    values="cnt",
    aggfunc="sum"
)
product_over_5 = pro_cnt_pivot[pro_cnt_pivot["cnt"] > 5].index

user_cnt_pivot = tbl_order.pivot_table(
    index="User",
    values="cnt",
    aggfunc="sum"
)
user_over_5 = user_cnt_pivot[user_cnt_pivot["cnt"] > 5].index

tbl_order = tbl_order.loc[tbl_order["User"].isin(user_over_5)]
tbl_order = tbl_order.loc[tbl_order["Product"].isin(product_over_5)]


# Build the "user x product" matrix.
index = "User"
columns = "Product"


# Start from an integer zero matrix instead of an all-NaN object frame plus
# fillna(0), so the dtype is numeric without relying on implicit downcasting.
pl_pro_mt = pd.DataFrame(
    0,
    index=tbl_order[index].unique(),
    columns=tbl_order[columns].unique(),
)


# Write each purchase count with a single .loc[row, col] assignment. The
# chained form `df.loc[row][col] = value` may write to a temporary copy and is
# silently ignored when pandas Copy-on-Write is active.
# NOTE(review): if a (User, Product) pair can occur more than once, the last
# row wins -- confirm uniqueness or aggregate beforehand.
for user, product, cnt in tbl_order[[index, columns, "cnt"]].to_numpy():
    pl_pro_mt.loc[user, product] = cnt

4. 모델링¶

# Interaction matrix (users x products) as a plain array.
R = pl_pro_mt.values

# Hyperparameters
r_lambda = 40  # regularization weight applied to the factor norms
nf = 200  # number of latent factors
alpha = 40  # scaling of the raw counts inside the confidence matrix


# Latent factor matrices, initialised with small random values
nu, ni = R.shape  # number of users, number of items

X = 0.01 * np.random.rand(nu, nf)
Y = 0.01 * np.random.rand(ni, nf)


# Preference matrix P
# P[u, i] = 1 where R[u, i] > 0; every other entry keeps its value from R
P = np.copy(R)
P[P > 0] = 1


# Confidence matrix C
# C[u, i] = 1 + alpha * R[u, i]
# A larger count means more confidence in the observed preference.
C = alpha * R + 1


# Loss function
# xTy: predicted preference matrix
# total loss = confidence-weighted squared error + regularization term
def loss_function(C, P, xTy, X, Y, r_lambda):
    """Compute the loss components for the current factor matrices.

    Returns a 4-tuple:
        (sum of squared errors between P and xTy,
         the same errors weighted element-wise by C and summed,
         r_lambda times the summed squares of X and Y,
         weighted error plus the regularization term).
    """
    squared_residual = np.square(P - xTy)
    unweighted = np.sum(squared_residual)
    weighted = np.sum(C * squared_residual)
    factor_norms = np.sum(np.square(X)) + np.sum(np.square(Y))
    penalty = r_lambda * factor_norms
    return unweighted, weighted, penalty, weighted + penalty

# Optimization functions (alternating least squares)
# X[u] = (Y^T C_u Y + lambda * I)^-1 Y^T C_u p(u)
# Y[i] = (X^T C_i X + lambda * I)^-1 X^T C_i p(i)
# The two formulas are identical once X <-> Y and u <-> i are swapped.
def optimize_user(X, Y, C, P, nu, nf, r_lambda):
    """Recompute every user factor row of ``X`` in place with ``Y`` fixed.

    For each user ``u`` the regularized weighted least-squares system
    ``(Y^T C_u Y + r_lambda * I) x = Y^T C_u p(u)`` is solved, where ``C_u``
    is the diagonal matrix built from row ``C[u]`` and ``p(u)`` is ``P[u]``.

    Args:
        X: user factor matrix; rows ``0 .. nu-1`` are overwritten.
        Y: item factor matrix with ``nf`` columns (read only).
        C: confidence matrix, indexed ``[user, item]``.
        P: preference matrix, indexed ``[user, item]``.
        nu: number of user rows to update.
        nf: number of latent factors (size of the identity matrix).
        r_lambda: regularization weight.

    Returns:
        None. ``X`` is modified in place.
    """
    yT = np.transpose(Y)
    # The regularization term does not depend on the user: build it once.
    lI = r_lambda * np.identity(nf)
    for u in range(nu):
        # Scaling the columns of yT by C[u] equals yT @ diag(C[u]) without
        # materializing the dense diagonal matrix.
        yT_Cu = yT * C[u]
        yT_Cu_y = np.matmul(yT_Cu, Y)
        yT_Cu_pu = np.matmul(yT_Cu, P[u])
        X[u] = np.linalg.solve(yT_Cu_y + lI, yT_Cu_pu)

def optimize_item(X, Y, C, P, ni, nf, r_lambda):
    """Recompute every item factor row of ``Y`` in place with ``X`` fixed.

    For each item ``i`` the regularized weighted least-squares system
    ``(X^T C_i X + r_lambda * I) y = X^T C_i p(i)`` is solved, where ``C_i``
    is the diagonal matrix built from column ``C[:, i]`` and ``p(i)`` is
    ``P[:, i]``.

    Args:
        X: user factor matrix with ``nf`` columns (read only).
        Y: item factor matrix; rows ``0 .. ni-1`` are overwritten.
        C: confidence matrix, indexed ``[user, item]``.
        P: preference matrix, indexed ``[user, item]``.
        ni: number of item rows to update.
        nf: number of latent factors (size of the identity matrix).
        r_lambda: regularization weight.

    Returns:
        None. ``Y`` is modified in place.
    """
    xT = np.transpose(X)
    # The regularization term does not depend on the item: build it once.
    lI = r_lambda * np.identity(nf)
    for i in range(ni):
        # Scaling the columns of xT by C[:, i] equals xT @ diag(C[:, i])
        # without materializing the dense diagonal matrix.
        xT_Ci = xT * C[:, i]
        xT_Ci_x = np.matmul(xT_Ci, X)
        xT_Ci_pi = np.matmul(xT_Ci, P[:, i])
        Y[i] = np.linalg.solve(xT_Ci_x + lI, xT_Ci_pi)

5. 학습¶

# Loss histories, one entry per step (step 0 is the random initialisation).
predict_errors, confidence_errors = [], []
regularization_list, total_losses = [], []

for i in range(11):
    # Step 0 only evaluates the initial factors; later steps run one
    # alternating update of the user factors, then the item factors.
    if i > 0:
        optimize_user(X, Y, C, P, nu, nf, r_lambda)
        optimize_item(X, Y, C, P, ni, nf, r_lambda)

    predict = X @ Y.T
    predict_error, confidence_error, regularization, total_loss = loss_function(
        C, P, predict, X, Y, r_lambda
    )

    predict_errors.append(predict_error)
    confidence_errors.append(confidence_error)
    regularization_list.append(regularization)
    total_losses.append(total_loss)

    print('----------------step %d----------------' % i)
    print("predict error: %f" % predict_error)
    print("confidence error: %f" % confidence_error)
    print("regularization: %f" % regularization)
    print("total loss: %f" % total_loss)

# Final prediction matrix from the trained factors.
predict = X @ Y.T
print('final predict')
print([predict])

----------------step 0----------------
predict error: 1040.071252
confidence error: 223629.451563
regularization: 78.815734
total loss: 223708.267297
----------------step 1----------------
predict error: 1957.755543
confidence error: 30228.909264
regularization: 40034.336375
total loss: 70263.245639
----------------step 2----------------
predict error: 1048.286184
confidence error: 2271.533435
regularization: 10199.389427
total loss: 12470.922863
----------------step 3----------------
predict error: 969.489384
confidence error: 1967.006288
regularization: 9819.870285
total loss: 11786.876574
----------------step 4----------------
predict error: 973.871507
confidence error: 1906.046927
regularization: 9596.527526
total loss: 11502.574452
----------------step 5----------------
predict error: 994.013704
confidence error: 1894.448882
regularization: 9447.675639
total loss: 11342.124521
----------------step 6----------------
predict error: 1015.812681
confidence error: 1897.535092
regularization: 9342.929456
total loss: 11240.464547
----------------step 7----------------
predict error: 1035.955050
confidence error: 1905.535916
regularization: 9266.252555
total loss: 11171.788471
----------------step 8----------------
predict error: 1053.732406
confidence error: 1914.954599
regularization: 9208.353826
total loss: 11123.308425
----------------step 9----------------
predict error: 1069.181503
confidence error: 1924.391448
regularization: 9163.533281
total loss: 11087.924728
----------------step 10----------------
predict error: 1082.553817
confidence error: 1933.283079
regularization: 9128.125285
total loss: 11061.408364
final predict
[array([[ 8.73487993e-01,  9.75443091e-01,  1.00173211e+00, ...,
        -2.34731658e-02, -1.67403464e-02, -2.34443269e-02],
       [ 9.26802135e-01,  5.52245998e-01,  3.55277235e-01, ...,
         9.42334578e-02,  1.44134342e-01,  1.61876154e-01],
       [ 9.12703866e-01,  6.25740915e-01,  9.90459870e-01, ...,
         1.69625627e-01,  1.52475065e-01,  6.01994853e-02],
       ...,
       [ 3.57098316e-01,  1.20569606e-01,  4.83834681e-01, ...,
         1.92470399e-01,  5.22289718e-02, -7.78740018e-03],
       [ 3.10154396e-02,  4.66320553e-04, -1.41674516e-01, ...,
        -4.97078438e-03,  3.94832490e-01,  1.38797734e-01],
       [ 2.26762704e-01, -9.91330317e-02, -7.92634548e-02, ...,
        -1.30759681e-02,  9.14591813e-01,  1.67444842e-01]])]

# visualize training
from matplotlib import pyplot as plt
%matplotlib inline

plt.subplots_adjust(wspace=100.0, hspace=20.0)
fig = plt.figure()
fig.set_figheight(10)
fig.set_figwidth(10)
predict_error_line = fig.add_subplot(2, 2, 1)
confidence_error_line = fig.add_subplot(2, 2, 2)
regularization_error_line = fig.add_subplot(2, 2, 3)
total_loss_line = fig.add_subplot(2, 2, 4)

predict_error_line.set_title("Predict Error") 
predict_error_line.plot(predict_errors)

confidence_error_line.set_title("Confidence Error")
confidence_error_line.plot(confidence_errors)

regularization_error_line.set_title("Regularization")
regularization_error_line.plot(regularization_list)

total_loss_line.set_title("Total Loss")
total_loss_line.plot(total_losses)
plt.show()

Text(0.5, 1.0, 'Predict Error')

[<matplotlib.lines.Line2D at 0x7fad41f244f0>]

Text(0.5, 1.0, 'Confidence Error')

[<matplotlib.lines.Line2D at 0x7fad41f24820>]

Text(0.5, 1.0, 'Regularization')

[<matplotlib.lines.Line2D at 0x7fad41f24940>]

Text(0.5, 1.0, 'Total Loss')

[<matplotlib.lines.Line2D at 0x7fad41f8c940>]

<Figure size 432x288 with 0 Axes>

6. 결과 추출¶

# Label the prediction matrix with the row/column labels of the input matrix.
output_matrix = pd.DataFrame(
    predict,
    index=pl_pro_mt.index,
    columns=pl_pro_mt.columns,
)

# Reshape to long format: one (User, Product, preference) row per cell,
# ordered by user and, within each user, by descending preference.
output_stack = output_matrix.stack().reset_index()
output_stack.columns = ["User", "Product", "preference"]
output_stack = output_stack.sort_values(
    by=["User", "preference"],
    ascending=[True, False],
)

output_stack

[kaggle]유튜브 인기 동영상 데이터 분석(파이썬) (2)	2020.02.26
카톡 분석 : 파이썬 (0)	2019.11.28
알바몬 분석: 알바몬 경기의 공고수는 몇개나 될까 (0)	2019.08.18
공공데이터 활용 - 교통사고 통계 리포트 (0)	2019.07.18

박범준의 일상로그

Matrix Factorization 알고리즘을 사용한 상품 추천

Matrix Factorization을 통한 상품 추천¶

1. 개요¶

분석 요약¶

2. 탐색¶

2.1. 데이터 개요¶

칼럼 정보¶

결측치 및 자료형¶

2.2. 변수 탐색¶

User¶

Product¶

cnt¶

최다 구매자¶

인기상품¶

3. 전처리¶

4. 모델링¶

5. 학습¶

6. 결과 추출¶

'데이터 분석' 카테고리의 다른 글

'데이터 분석'의 다른글

티스토리툴바

	User	Product	cnt
0	1	84	1
1	1	137	13
2	1	145	52
3	1	153	9
4	1	154	6

	cnt
User
88	311
633	193
26	160
361	159
133	153
259	135
104	132
128	115
9	114
292	104

	cnt
Product
145	1093
154	921
140	843
153	410
53	407
67	271
155	244
84	217
146	193
135	155

	User	Product	preference
4	1	154	1.005159
2	1	145	1.001732
3	1	153	0.991131
5	1	155	0.976081
1	1	137	0.975443
...	...	...	...
15077	799	92	-0.058193
15075	799	121	-0.059523
15050	799	145	-0.079263
15057	799	110	-0.092584
15049	799	137	-0.099133

Matrix Factorization 알고리즘을 사용한 상품 추천

Matrix Factorization을 통한 상품 추천¶

1. 개요¶

분석 요약¶

2. 탐색¶

2.1. 데이터 개요¶

칼럼 정보¶

결측치 및 자료형¶

2.2. 변수 탐색¶

User¶

Product¶

cnt¶

최다 구매자¶

인기상품¶

3. 전처리¶

4. 모델링¶

5. 학습¶

6. 결과 추출¶

'데이터 분석' 카테고리의 다른 글

'데이터 분석'의 다른글

관련글

티스토리툴바