
6
資料來源 : eCommerce Events History in Cosmetics Shop

環境準備,使用 Python NumPy、Pandas、Matplotlib、Plotly、Seaborn
import pandas as pd
import numpy as np
import seaborn as sb
下載kaggle 原始資料
# Download the raw Kaggle dataset via kagglehub and list what it contains.
import kagglehub
path = kagglehub.dataset_download("mkechinov/ecommerce-events-history-in-cosmetics-shop")
print("Path to dataset files:", path)
import os
files = os.listdir(path)
print("資料夾內的檔案:")
for f in files:
    print(f)  # was unindented in the flattened export -> SyntaxError as a .py script

from glob import glob
data_path = path  # dataset directory returned by kagglehub above
csv_files = glob(os.path.join(data_path, "*.csv"))
# Guard: pd.concat([]) raises an opaque ValueError -- fail with a clear message.
if not csv_files:
    raise FileNotFoundError(f"no CSV files found under {data_path}")
df_list = [pd.read_csv(f) for f in csv_files]
df = pd.concat(df_list, ignore_index=True)  # one frame, fresh 0..N-1 index
print("資料筆數:", len(df))
df.head()

df.columns
df.info()


讀取資料、查看基本訊息 Import data 、View basic information
event_time 事件時間
event_type 事件類型 view / cart / remove_from_cart / purchase (與程式中實際過濾的事件名稱一致)
product_id 商品代號
category_id 商品類別代號
category_code 商品類別
brand 品牌
price 單價
user_id 使用者uuid (唯一值)
user_session 使用者每次互動識別碼 (唯一值,一個識別碼可以有一連串動作)
這裡只抽取100萬筆來做


import plotly.express as px

# Parse timestamps (unparseable values become NaT) and bucket them by YYYY_MM.
df['event_time'] = pd.to_datetime(df['event_time'], errors='coerce')
df['event_year_month'] = df['event_time'].dt.strftime('%Y_%m')

# Non-null event_time count for every (month, event type) pair.
monthly_counts = (
    df.groupby(["event_year_month", "event_type"])["event_time"]
      .count()
      .reset_index()
)
fig = px.bar(
    monthly_counts,
    x="event_year_month",
    y="event_time",
    color="event_type",
    title="Events by Month",
)
fig.show()

這裡我想用顧客每次一連串的動作(user_session):點擊、加入/移除產品到購物車、購買,來判斷下次類似行為的顧客購買機率
import pandas as pd
# NOTE(review): event_time was already parsed above with errors='coerce';
# re-parsing an already-datetime column is a no-op, so this line is redundant.
df['event_time'] = pd.to_datetime(df['event_time'])
# Session-level feature extraction follows.
def extract_session_features(session_df):
    """Aggregate one user_session's event rows into a single feature Series.

    Parameters
    ----------
    session_df : pd.DataFrame
        All rows of one session; expected columns: event_time (datetime),
        event_type, product_id, price, brand, user_id.

    Returns
    -------
    pd.Series
        Session-level features plus the binary label 'purchase_occurred'.
    """
    # user_id is constant within a session -> keep the scalar. (The original
    # stored the whole Series per row, which bloats the frame and breaks the
    # later user_id recovery for the marketing merge.)
    user_id = session_df['user_id'].iloc[0]

    # Event-type counts.
    num_cart = (session_df['event_type'] == 'cart').sum()                # times items were added to cart
    num_remove = (session_df['event_type'] == 'remove_from_cart').sum()  # times items were removed from cart
    num_purchase = (session_df['event_type'] == 'purchase').sum()        # number of purchased items

    # Price totals per event type.
    total_cart_value = session_df[session_df['event_type'] == 'cart']['price'].sum()          # value added to cart
    total_purchase_value = session_df[session_df['event_type'] == 'purchase']['price'].sum()  # value actually purchased

    # Brand features.
    unique_brands = session_df['brand'].nunique()  # distinct brands seen, regardless of event type
    purchase_brands = session_df[session_df['event_type'] == 'purchase']['brand'].dropna()
    # Most frequently purchased brand in this session ('unknown' when nothing was bought).
    most_purchased_brand = purchase_brands.mode().iloc[0] if not purchase_brands.empty else 'unknown'
    num_purchase_brands = purchase_brands.nunique()  # distinct brands purchased

    # Purchase path: bought directly vs. carted first.
    cart_set = set(session_df[session_df['event_type'] == 'cart']['product_id'])
    purchase_set = set(session_df[session_df['event_type'] == 'purchase']['product_id'])
    direct_purchase = purchase_set - cart_set  # products bought without ever being carted
    carted_purchase = purchase_set & cart_set  # products carted first, then bought

    # Timing features.
    start_time = session_df['event_time'].min()  # true session start
    duration = (session_df['event_time'].max() - start_time).total_seconds() / 60  # active minutes
    # Hour of the session start. (The original used .iloc[0].hour -- the first
    # row, which is not necessarily the earliest event if rows are unsorted.)
    session_hour = start_time.hour
    even_time = session_df['event_time']  # raw timestamps, kept for output-column compatibility
    weekday_str = start_time.strftime('%a')              # Mon..Sun
    weeknum_str = f"week_{start_time.isocalendar()[1]}"  # ISO week number

    return pd.Series({
        'even_time': even_time,
        'weekday': weekday_str,
        'weeknum': weeknum_str,
        'user_id': user_id,
        'num_cart': num_cart,
        'num_remove': num_remove,
        'num_purchase': num_purchase,
        'total_cart_value': total_cart_value,
        'total_purchase_value': total_purchase_value,
        'unique_brands': unique_brands,
        'num_purchase_brands': num_purchase_brands,
        'most_purchased_brand': most_purchased_brand,
        'session_duration_min': duration,
        'session_start_hour': session_hour,
        'num_purchase_only': len(direct_purchase),
        'num_purchase_in_cart': len(carted_purchase),
        'purchase_direct_ratio': len(direct_purchase) / (len(purchase_set) or 1),  # share bought directly
        'purchase_occurred': int(num_purchase > 0)  # binary label: did any purchase happen?
    })
# Per-session aggregate features.
# NOTE: "!pip install tqdm" is IPython shell magic -- it is a SyntaxError in a
# plain .py file. Install from the shell instead:  pip install tqdm
from tqdm import tqdm
tqdm.pandas()  # enables .progress_apply with a progress bar
# One feature row per user_session; reset_index brings user_session back as a column.
session_features_df = df.groupby('user_session').progress_apply(extract_session_features).reset_index()
session_features_df

session_features_df.info()

# NOTE: "!pip install -U xgboost" is IPython shell magic -- run it from a
# shell instead of keeping it in a .py file.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Cast string features to 'category' so XGBoost can consume them directly
# with enable_categorical=True.
session_features_df['most_purchased_brand'] = session_features_df['most_purchased_brand'].astype('category')
session_features_df['weekday'] = session_features_df['weekday'].astype('category')
session_features_df['weeknum'] = session_features_df['weeknum'].astype('category')

# Feature matrix: deliberately excludes anything derived from the purchase
# itself (label leakage) and user_id (merged back later to target reminders).
X = session_features_df[[
    'num_cart',
    'num_remove',
    'total_cart_value',
    'unique_brands',
    'session_duration_min',
    'session_start_hour',
    'most_purchased_brand',
    'weekday',
    'weeknum'
]]
y = session_features_df['purchase_occurred']

# Split: train 0.6 / valid 0.2 / test 0.2.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=1111)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=123)  # 0.25 x 0.8 = 0.2
y_test.value_counts()

# Model: class 0 (no purchase) dominates, so weight the positive class.
num_neg = (y_train == 0).sum()
num_pos = (y_train == 1).sum()
scale_pos_weight = num_neg / max(num_pos, 1)  # guard: avoids ZeroDivisionError on a 0-positive split
clf = XGBClassifier(
    tree_method='hist',        # histogram split finding ('exact' is the slower alternative)
    device='cuda',
    use_label_encoder=False,   # NOTE(review): deprecated/ignored in recent xgboost; safe to drop
    enable_categorical=True,   # consume pandas 'category' dtypes natively
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=50
)
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],  # validation fold drives early stopping
    verbose=True
)
# predict_proba is threshold-independent -- compute it once, not per loop pass.
y_proba = clf.predict_proba(X_test)[:, 1]  # P(class == 1) for each test session
for t in [0.5]:  # 0.3, 0.4, 0.5
    y_pred = (y_proba > t).astype(int)  # binarize at threshold t
    print(f"\nThreshold: {t}")
    print(classification_report(y_test, y_pred))
# Confusion matrix for the last threshold evaluated.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
PS predict_proba 會返回一個 N×2 的陣列(其中 N 是測試樣本的數量),第一欄是預測為類別 0 的機率,第二欄是預測為類別 1 的機率;程式中用 [:, 1] 只取第二欄,所以 y_proba 是一維的類別 1 機率陣列
一開始閥值我設0.5

對於0的預測準確率高0.98,召回率(實際為0,預測也為0)高0.9
但對於1的預測準確率只有0.37,召回率(實際為1,預測也為1)高0.8。說明實際為1預測為1的很準,但很多實際為0的也被預測為1,出現很高的假陽性率
這時候要就要提到經典的醫療診斷案例,是「漏掉一個正例(假陰性)」的成本更高,還是「錯誤地將負例判斷為正例(假陽性)」的成本更高?
假設,假陰性成本很高 (例如,癌症腫瘤病人), 0.80 的召回率可能不錯,但如果我要節省行銷成本,準確率就有點過低


之後比較了 0.6, 0.7,發現 0.7 對1的預測準確率有0.8,但召回率下降到0.68,意思是很多實際為1的也被預測為0,出現很高的假陰性率。所以最後選擇 0.6
# Attach the final threshold's predictions back onto the test features.
X_test = X_test.copy()
X_test['predicted_purchase'] = y_pred
X_test['purchase_probability'] = y_proba

# Sessions the model flags as likely purchasers.
likely_buyers = (X_test['predicted_purchase'] == 1) & (X_test['purchase_probability'] > 0.5)
predicted_sessions = X_test.loc[likely_buyers].index

# Recover user_session / user_id from the full feature frame via the shared index,
# then bring the prediction columns along.
predicted_df = session_features_df.loc[predicted_sessions, ['user_session', 'user_id']].copy()
predicted_df = predicted_df.merge(
    X_test[['predicted_purchase', 'purchase_probability']],
    left_index=True, right_index=True
)
print(f"預測會購買的 session 數量: {predicted_df.shape[0]}")
predicted_df_sorted = predicted_df.sort_values('purchase_probability', ascending=False)
predicted_df_sorted

# Sanity check: inspect which features the trained model relied on most.
# Feature importance of the fitted classifier.
import matplotlib.pyplot as plt
import xgboost as xgb
# plot_importance reads importances straight off the trained booster.
xgb.plot_importance(clf)
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# strftime('%a') produces 'Mon','Tue','Wed','Thu','Fri','Sat','Sun'.
# The original list used 'Tues'/'Thurs', which never match those labels, so
# Tuesday and Thursday silently became NaN and disappeared from the plot.
ordered_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
session_features_df['weekday'] = pd.Categorical(session_features_df['weekday'], categories=ordered_days, ordered=True)

# Total purchases per weekday, forced into Mon..Sun order.
weekday_purchase = session_features_df.groupby('weekday')['purchase_occurred'].sum().reindex(ordered_days)

plt.figure(figsize=(8, 5))
sns.barplot(x=weekday_purchase.index, y=weekday_purchase.values, palette='Blues_d')
plt.title('Number of Purchases by Day of Week')
plt.xlabel('Weekday')
plt.ylabel('Number of purchases')
plt.tight_layout()
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Purchases per session start hour (sums purchase_occurred, i.e. counts only
# sessions that ended in a purchase); all hours 0-23 appear even when empty.
hourly_purchase = (
    session_features_df.groupby('session_start_hour')['purchase_occurred'].sum()
                       .reindex(range(24), fill_value=0)
)

hour_fig = plt.figure(figsize=(10, 5))
sns.barplot(x=hourly_purchase.index, y=hourly_purchase.values, palette='Oranges_d')
plt.title('Purchase Activity by Hour')
plt.xlabel('Hour (0-23)')
plt.ylabel('Number of purchases')
plt.xticks(range(0, 24))
plt.tight_layout()
plt.show()
