
6
資料來源 : eCommerce Events History in Cosmetics Shop

環境準備,使用 Python NumPy、Pandas、Matplotlib、Plotly、Seaborn
import pandas as pd
import numpy as np
import seaborn as sb
下載kaggle 原始資料
import kagglehub

# Download (or reuse the cached copy of) the Kaggle dataset and report its path.
path = kagglehub.dataset_download("mkechinov/ecommerce-events-history-in-cosmetics-shop")
print("Path to dataset files:", path)

import os

# Show every file shipped with the dataset.
print("資料夾內的檔案:")
for filename in os.listdir(path):
    print(filename)

from glob import glob

# Read every CSV in the download directory and stack them into one frame.
data_path = path
csv_files = glob(os.path.join(data_path, "*.csv"))
df = pd.concat((pd.read_csv(csv_file) for csv_file in csv_files), ignore_index=True)
print("資料筆數:", len(df))
df.head()

# Quick structural check: column names, dtypes, and non-null counts.
df.columns
df.info()


讀取資料、查看基本訊息 Import data 、View basic information
event_time 事件時間
event_type 事件類型 view / cart / remove_from_cart / purchase
product_id 商品代號
category_id 商品類別代號
category_code 商品類別
brand 品牌
price 單價
user_id 使用者uuid (唯一值)
user_session 使用者每次互動識別碼 (唯一值,一個識別碼可以有一連串動作)
這裡只抽取100萬筆來做

import plotly.express as px

# Parse timestamps (malformed values become NaT) and bucket into 'YYYY_MM'.
df['event_time'] = pd.to_datetime(df['event_time'], errors='coerce')
df['event_year_month'] = df['event_time'].dt.strftime('%Y_%m')

# Non-null event count per (month, event type) pair.
event_type_count = df.groupby(["event_year_month", "event_type"])["event_time"].count()

# Bar chart of monthly event volume split by event type.
fig = px.bar(
    event_type_count.reset_index(),
    x="event_year_month",
    y="event_time",
    color="event_type",
    title="Events by Month",
)
fig.show()

先提取出後面會用到的時間feature
# NOTE(review): in the original notebook these cells were exported out of
# order — `df_2` was used before it was created, and the later
# `df_2 = df.copy()` discarded the freshly-added time columns even though
# the per-user aggregation below reads 'year'/'month'/'day'/... .
# Creating the copy FIRST makes the script run correctly top-to-bottom.
df_2 = df.copy()

# Time-based features reused by the per-user aggregation.
df_2['event_time'] = pd.to_datetime(df_2['event_time'])
df_2['year'] = df_2['event_time'].dt.year
df_2['month'] = df_2['event_time'].dt.month
df_2['day'] = df_2['event_time'].dt.day
df_2['hour'] = df_2['event_time'].dt.hour
df_2['weekday'] = df_2['event_time'].dt.day_name().astype('category')
df_2['weeknum'] = 'week_' + df_2['event_time'].dt.isocalendar().week.astype(str)
df_2['weeknum'] = df_2['weeknum'].astype('category')

# Per-session dwell time in minutes (first event to last event).
session_duration_df = df_2.groupby(['user_id', 'user_session'])['event_time'].agg(
    session_start='min', session_end='max'
)
session_duration_df['session_duration'] = (
    session_duration_df['session_end'] - session_duration_df['session_start']
).dt.total_seconds() / 60
session_duration_df = session_duration_df.reset_index()

# Average session length per user.
user_session_stats = session_duration_df.groupby('user_id')['session_duration'].mean().reset_index()
提取其他用戶特徵,再合併
# Aggregate per-user behavioural features, then attach the mean session length.
agg_spec = {
    'event_time': ['min', 'max', 'nunique'],  # activity span and distinct timestamps
    'event_type': 'count',                    # total number of events
    'product_id': 'nunique',                  # distinct products interacted with
    'brand': 'nunique',                       # distinct brands interacted with
    'price': ['sum', 'mean'],                 # price total and average
    'year': 'nunique',                        # distinct active years
    'month': 'nunique',
    'day': 'nunique',
    'hour': 'nunique',
    'weekday': 'nunique',
    'weeknum': 'nunique',
}
user_features = df_2.groupby('user_id').agg(agg_spec)

# Flatten the (column, stat) MultiIndex into names like 'price_mean'.
user_features.columns = ['_'.join(pair).strip() for pair in user_features.columns.values]
user_features = user_features.reset_index()

# Attach each user's average session duration.
user_features = user_features.merge(user_session_stats, on='user_id', how='left')

算出有購買的人的總購買金額,設為target
# Target: total amount spent across all purchase events, per user.
purchase_df = df_2[df_2['event_type'] == 'purchase']
user_target = (
    purchase_df.groupby('user_id')['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'total_purchase_value'})
)
user_target

user_target 合併進 user_features
# Join features with the target; users who never purchased get a target of 0.
df_final = user_features.merge(user_target, on='user_id', how='left')
df_final = df_final.fillna({'total_purchase_value': 0})
print(df_final.shape[0])
df_final.columns
df_final.head(3)

# model
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Regression set: only users who actually purchased (target > 0).
reg_df = df_final[df_final['total_purchase_value'] > 0].copy()

# Mark the week-related count columns as categorical for XGBoost.
for cat_col in ('weekday_nunique', 'weeknum_nunique'):
    reg_df[cat_col] = reg_df[cat_col].astype('category')

# Feature matrix and target vector.
feature_cols = [
    'event_time_nunique',
    'event_type_count',
    'product_id_nunique',
    'brand_nunique',
    'price_mean',
    'year_nunique',
    'month_nunique',
    'day_nunique',
    'hour_nunique',
    'weekday_nunique',
    'weeknum_nunique',
    'session_duration',
]
X = reg_df[feature_cols]
y = reg_df['total_purchase_value']
# 60/20/20 split: hold out 20% for test, then 25% of the remainder for
# validation (0.25 * 0.8 = 0.2 of the full data).
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=777)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=123)

# Gradient-boosted regressor with native categorical support.
# NOTE(review): device='cuda' assumes a GPU runtime is available — confirm.
reg_model = XGBRegressor(
    tree_method='hist',
    device='cuda',
    enable_categorical=True,
    eval_metric='rmse',
    n_estimators=500,
    learning_rate=0.05,
    early_stopping_rounds=30,
)
reg_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=True)
# Held-out error metrics.
y_pred = reg_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# Predicted vs. actual scatter with a y = x reference line.
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.5)
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], 'r--')
plt.xlabel("True Purchase Value")
plt.ylabel("Predicted Purchase Value")
plt.title("Actual vs Predicted Purchase Value")
plt.show()

LSTM 會記住時間順序,因此不篩選 purchase
用過去行為預測下個動作是不是 purchase ,並估價格
# Build per-user sliding windows of events for the LSTM: the previous
# `sequence_length` events are the input; the label is the NEXT event's
# price when that event is a purchase, otherwise 0.
df_2 = df.copy()
df_2['event_time'] = pd.to_datetime(df_2['event_time'])
df_2 = df_2.sort_values(['user_id', 'event_time'])

from tqdm import tqdm

# Window length: the last 10 events predict the next one.
sequence_length = 10
sequences = []
targets = []

# Hoisted out of the per-user loop — both are loop-invariant, so the
# original rebuilt them needlessly on every iteration.
event_map = {'view': 0, 'cart': 1, 'purchase': 2, 'remove_from_cart': 3}
input_features = ['event_type', 'price', 'hour', 'dayofweek', 'timediff']

for user_id, user_df in tqdm(df_2.groupby('user_id'), desc="Building sequences"):
    user_df = user_df[['event_time', 'event_type', 'price']].copy()

    # Encode the event type as an integer code.
    user_df['event_type'] = user_df['event_type'].map(event_map)

    # Time features: hour of day, day of week, seconds since previous event
    # (first event of a user has no predecessor, hence fillna(0)).
    user_df['hour'] = user_df['event_time'].dt.hour
    user_df['dayofweek'] = user_df['event_time'].dt.dayofweek
    user_df['timediff'] = user_df['event_time'].diff().dt.total_seconds().fillna(0)

    user_df = user_df[input_features]

    # Slide the window; range() is empty for users with too few events, so
    # the original explicit length guard was redundant.
    for i in range(len(user_df) - sequence_length):
        seq = user_df.iloc[i:i + sequence_length].values
        label_row = user_df.iloc[i + sequence_length]
        label = label_row['price'] if label_row['event_type'] == 2 else 0
        sequences.append(seq)
        targets.append(label)
預測使用者下一筆行為是否為購買(purchase)
若是購買,就預測金額,否則為 0
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
# Stack the collected windows into model-ready arrays.
X = np.array(sequences)
y = np.array(targets)

# 60/20/20 train/valid/test split.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=2222)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=2222)

# LSTM encoder followed by a small MLP regression head.
n_steps, n_feats = X.shape[1], X.shape[2]  # (10, 5)
model = Sequential([
    Input(shape=(n_steps, n_feats)),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Stop once validation loss plateaus, restoring the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2,
)
# Held-out predictions and error metrics.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")

# Actual vs. predicted scatter with a y = x reference line.
plt.figure(figsize=(10, 5))
plt.scatter(y_test, y_pred, alpha=0.4)
plt.xlabel("True Purchase Value")
plt.ylabel("Predicted Purchase Value")
plt.title("LSTM Predicted vs. Actual Purchase Value")
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], 'r--')
plt.grid(True)
plt.tight_layout()
plt.show()

df_2 = df.copy()

from sklearn.preprocessing import LabelEncoder

# Keep only purchase events; each (user, product) row is a positive signal.
purchase_df = df_2[df_2['event_type'] == 'purchase'][['user_id', 'product_id']].dropna()

# Map raw ids onto dense integer indices for the embedding layers.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
purchase_df['user_enc'] = user_encoder.fit_transform(purchase_df['user_id'])
purchase_df['item_enc'] = item_encoder.fit_transform(purchase_df['product_id'])

num_users = purchase_df['user_enc'].nunique()
num_items = purchase_df['item_enc'].nunique()
print("user 數量:", num_users)
print("商品數量:", num_items)

# Positive examples: every observed (user, item) purchase pair.
# .copy() avoids pandas' chained-assignment warning on the label column.
positive_samples = purchase_df[['user_enc', 'item_enc']].copy()
positive_samples['label'] = 1

# Precompute the purchased-pair set ONCE. The original re-scanned the whole
# positives frame for every candidate, i.e. O(users * positives) work; a set
# lookup makes each membership test O(1).
purchased_pairs = set(zip(positive_samples['user_enc'], positive_samples['item_enc']))

# Negative sampling: up to 10 random non-purchased items per user.
neg_samples = []
for user in positive_samples['user_enc'].unique():
    for _ in range(10):
        random_item = np.random.randint(0, num_items)
        if (user, random_item) not in purchased_pairs:
            neg_samples.append([user, random_item, 0])
neg_df = pd.DataFrame(neg_samples, columns=['user_enc', 'item_enc', 'label'])

# Training set = positives + sampled negatives.
train_df = pd.concat([positive_samples, neg_df], ignore_index=True)

# Inspect the first users' samples.
train_df.sort_values('user_enc', inplace=True)
train_df.head(20)

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate

# Two scalar-id inputs: one user index, one item index.
user_input = Input(shape=(1,))
item_input = Input(shape=(1,))

# 32-dimensional embedding per user / item id.
user_embed = Embedding(num_users, 32)(user_input)
item_embed = Embedding(num_items, 32)(item_input)

# Flatten (batch, 1, 32) -> (batch, 32).
user_vec = Flatten()(user_embed)
item_vec = Flatten()(item_embed)

# Concatenate and run through an MLP ending in a purchase probability.
merged = Concatenate()([user_vec, item_vec])
x = Dense(128, activation='relu')(merged)
# BUG FIX: the original passed `merged` (not `x`) into this layer, so the
# 128-unit layer's output was discarded — a dead layer. Chain through `x`.
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

from sklearn.model_selection import train_test_split

# Inputs and labels as plain arrays.
X_user = train_df['user_enc'].values
X_item = train_df['item_enc'].values
y = train_df['label'].values

# Hold out 20% for test, then 20% of the remainder for validation.
X_user_temp, X_user_test, X_item_temp, X_item_test, y_temp, y_test = train_test_split(
    X_user, X_item, y, test_size=0.2, random_state=444
)
X_user_train, X_user_val, X_item_train, X_item_val, y_train, y_val = train_test_split(
    X_user_temp, X_item_temp, y_temp, test_size=0.2, random_state=444
)

# Early stopping on validation loss, restoring the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(
    [X_user_train, X_item_train], y_train,
    validation_data=([X_user_val, X_item_val], y_val),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2,
)
# Overall accuracy on the held-out pairs.
loss, acc = model.evaluate([X_user_test, X_item_test], y_test, verbose=0)
print(f"Test Accuracy: {acc:.4f}")

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# Per-class precision/recall/F1, plus ROC AUC on the raw probabilities.
y_pred_prob = model.predict([X_user_test, X_item_test])
y_pred_label = (y_pred_prob > 0.5).astype(int)
print(classification_report(y_test, y_pred_label, digits=4))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_prob):.4f}")

# Collect test-set predictions next to the true labels for inspection.
results_df = pd.DataFrame({
    'user_enc': X_user_test,
    'item_enc': X_item_test,
    'true_label': y_test,
    'pred_prob': y_pred_prob.ravel(),
    'pred_label': y_pred_label.ravel(),
})
results_df.sort_values('user_enc', inplace=True)
results_df.head(20)

# Attach the original raw ids alongside the encoded indices.
results_df['product_id'] = item_encoder.inverse_transform(results_df['item_enc'])
results_df['user_id'] = user_encoder.inverse_transform(results_df['user_enc'])
results_df

# Keep only pairs the model predicts as purchases, grouped per user.
filtered = results_df[results_df['pred_label'] == 1]
grouped = (
    filtered.groupby('user_enc')['item_enc']
    .apply(list)
    .reset_index()
    .rename(columns={'item_enc': 'predicted_items'})
)

# Recover the raw user ids.
grouped['user_id'] = user_encoder.inverse_transform(grouped['user_enc'])
# Recover the raw product ids for every predicted item list.
grouped['predicted_product_ids'] = grouped['predicted_items'].apply(item_encoder.inverse_transform)
grouped
