
6
資料來源 : eCommerce Events History in Cosmetics Shop

環境準備,使用 Python NumPy、Pandas、Matplotlib、Plotly、Seaborn
import pandas as pd
import numpy as np
import seaborn as sb
下載kaggle 原始資料
import kagglehub

# Fetch (or reuse the cached copy of) the Kaggle dataset files.
dataset_ref = "mkechinov/ecommerce-events-history-in-cosmetics-shop"
path = kagglehub.dataset_download(dataset_ref)
print("Path to dataset files:", path)
import os

# Show what was downloaded.
entries = os.listdir(path)
print("資料夾內的檔案:")
for entry in entries:
    print(entry)

from glob import glob

# Read every CSV in the dataset folder and stack them into one DataFrame.
data_path = path
csv_files = glob(os.path.join(data_path, "*.csv"))
frames = [pd.read_csv(csv_file) for csv_file in csv_files]
df = pd.concat(frames, ignore_index=True)
print("資料筆數:", len(df))
df.head()

# Quick schema inspection: column names, dtypes, and non-null counts.
df.columns
df.info()


event_time 事件時間
event_type 事件類型 view / cart / remove_from_cart / purchase
product_id 商品代號
category_id 商品類別代號
category_code 商品類別
brand 品牌
price 單價
user_id 使用者uuid (唯一值)
user_session 使用者每次互動識別碼 (唯一值,一個識別碼可以有一連串動作)
這裡只抽取100萬筆來做

import plotly.express as px

# Monthly event volume broken down by event type (stacked bar chart).
df['event_time'] = pd.to_datetime(df['event_time'], errors='coerce')
df['event_year_month'] = df['event_time'].dt.strftime('%Y_%m')
# count() on event_time gives the number of non-null timestamps per group
monthly_counts = df.groupby(["event_year_month", "event_type"]).count()["event_time"]
fig = px.bar(
    monthly_counts.reset_index(),
    x="event_year_month",
    y="event_time",
    color="event_type",
    title="Events by Month",
)
fig.show()

# Work on a copy so later feature engineering does not mutate df.
df_2 = df.copy()
# Distribution of category_code, including missing values (dropna=False).
df_2['category_code'].value_counts(dropna=False)

# Derive calendar features from the event timestamp.
df_2['event_time'] = pd.to_datetime(df_2['event_time'])
ts = df_2['event_time'].dt
df_2['year'] = ts.year
df_2['month'] = ts.month
df_2['day'] = ts.day
df_2['hour'] = ts.hour
df_2['weekday'] = ts.day_name().astype('category')
# ISO week number rendered as a "week_<n>" categorical label
df_2['weeknum'] = ('week_' + ts.isocalendar().week.astype(str)).astype('category')
# Dwell time per session, in minutes: span between the first and last event.
session_duration_df = (
    df_2.groupby(['user_id', 'user_session'])['event_time']
    .agg(session_start='min', session_end='max')
)
span = session_duration_df['session_end'] - session_duration_df['session_start']
session_duration_df['session_duration'] = span.dt.total_seconds() / 60
session_duration_df = session_duration_df.reset_index()
# 提取喜好的開始購物時間
def time_bucket(hour):
    """Map an hour of day (0-23) to a coarse time-of-day label.

    Note: 23:00 falls into 'midnight' along with 23-5, by design of the
    original bucket boundaries.
    """
    if 5 <= hour < 12:
        return 'morning'
    if 12 <= hour < 18:
        return 'afternoon'
    if 18 <= hour < 23:
        return 'evening'
    return 'midnight'
# Label each session with the time-of-day bucket in which it started.
session_duration_df['start_hour'] = session_duration_df['session_start'].dt.hour
session_duration_df['start_period'] = session_duration_df['start_hour'].apply(time_bucket)
# Average session length per user
user_session_stats = (
    session_duration_df.groupby('user_id')['session_duration']
    .mean()
    .reset_index()
)
user_session_stats.head(2)

# Each user's most common shopping period (ties resolved by mode()'s first value).
user_period_pref = (
    session_duration_df.groupby('user_id')['start_period']
    .agg(lambda periods: periods.mode().iloc[0])
    .reset_index()
)
user_period_pref.columns = ['user_id', 'preferred_period']
# One-hot encode the preferred period
user_period_dummies = pd.get_dummies(user_period_pref['preferred_period'], prefix='period')
# Re-attach user_id to the dummy columns
user_period_final = pd.concat([user_period_pref['user_id'], user_period_dummies], axis=1)
user_period_final.head(2)

# Each user's most frequently interacted main category (from category_code).
df_2[['main_category', 'sub_category']] = df_2['category_code'].str.split('.', n=1, expand=True)
user_preferred_category = (
    df_2.groupby('user_id')['main_category']
    # mode() ignores NaN; guard against users whose categories are all missing
    .agg(lambda x: x.mode().iloc[0] if not x.dropna().empty else np.nan)
    .reset_index()
)
user_preferred_category.columns = ['user_id', 'preferred_main_category']
# One-hot encode. Bug fix: the prefix was 'period' (copy-pasted from the
# time-period block above), which mislabelled these columns as period_* —
# they are category dummies, so prefix them 'category'.
user_category_dummies = pd.get_dummies(user_preferred_category['preferred_main_category'], prefix='category')
# Re-attach user_id to the dummy columns
user_category_final = pd.concat([user_preferred_category['user_id'], user_category_dummies], axis=1)
user_category_final.head(2)

# Target variable: each user's total spend across purchase events.
purchase_df = df_2[df_2['event_type'] == 'purchase']
user_target = (
    purchase_df.groupby('user_id')['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'total_purchase_value'})
)
user_target

# Per-user behavioural aggregates.
agg_spec = {
    'event_time': ['min', 'max', 'nunique'],  # activity window and distinct timestamps
    'event_type': 'count',                    # total number of events
    'product_id': 'nunique',                  # distinct products interacted with
    'brand': 'nunique',                       # distinct brands interacted with
    'price': ['sum', 'mean'],                 # total and average item price seen
    'year': 'nunique',                        # distinct active years
    'month': 'nunique',
    'day': 'nunique',
    'hour': 'nunique',
    'weekday': 'nunique',
    'weeknum': 'nunique',
}
user_features = df_2.groupby('user_id').agg(agg_spec)
# Flatten the MultiIndex columns, e.g. ('price', 'sum') -> 'price_sum'
user_features.columns = ['_'.join(col).strip() for col in user_features.columns.values]
user_features = user_features.reset_index()
# Assemble the user-level feature table.
df_final = user_features.merge(user_session_stats, on='user_id', how='left')
df_final = df_final.merge(user_period_final, on='user_id', how='left')
df_final = df_final.merge(user_category_final, on='user_id', how='left')
df_final = df_final.merge(user_target, on='user_id', how='left')
df_final.head(2)

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

# Drop identifiers and raw timestamps, then impute NaNs and standardise.
feature_frame = df_final.drop(columns=['user_id', 'event_time_min', 'event_time_max'])
imputer = SimpleImputer(strategy='mean')  # replace NaN with the column mean
features_imputed = imputer.fit_transform(feature_frame)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_imputed)
筆數太多,抽樣來看一下要分幾群
from sklearn.metrics import silhouette_score
from sklearn.utils import resample
from tqdm import tqdm  # progress bar
import matplotlib.pyplot as plt
import seaborn as sns

# Scan candidate k values on a random subsample.
# Bug fix: the original fit KMeans on the FULL X_scaled inside the loop
# (and discarded those labels), which defeats the stated purpose of
# subsampling and makes each of the 19 iterations very slow. Fit and
# score on the sample instead.
sample_size = 10000  # number of sampled rows
X_sampled = resample(X_scaled, n_samples=sample_size, random_state=42)
k_values = range(2, 21)
silhouette_scores = []
for k in tqdm(k_values, desc="Evaluating KMeans clustering"):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    sampled_labels = kmeans.fit_predict(X_sampled)
    score = silhouette_score(X_sampled, sampled_labels)
    silhouette_scores.append(score)
    print(f"k = {k}, silhouette score = {score:.4f}")

# Fit the final clustering at the chosen k and label every user.
final_k = 16
final_kmeans = KMeans(n_clusters=final_k, random_state=2222, n_init='auto')
cluster_labels = final_kmeans.fit_predict(X_scaled)
# Attach cluster labels back to the user-level table
df_final['cluster'] = cluster_labels
# Cluster size distribution
df_final['cluster'].value_counts()
df_final

熱力圖查看
# Heatmap of how each cluster deviates from the overall feature means.
period_columns = ['period_morning', 'period_afternoon', 'period_evening', 'period_midnight']
for col in period_columns:
    # Users missing from user_period_final get NaN dummies after the left merge
    df_final[col] = df_final[col].fillna(False).astype(int)
# Bug fix: the original computed overall_mean / cluster_means / relative_diff
# twice in a row (duplicated block); compute once.
cluster_means = df_final.groupby('cluster').mean(numeric_only=True)
overall_mean = cluster_means.mean()  # unweighted mean of the cluster means
relative_diff = (cluster_means - overall_mean).round(2)
plt.figure(figsize=(20, 8))
sns.heatmap(relative_diff.T, annot=True, cmap='coolwarm', center=0, fmt=".2f")
plt.title("Relative difference of each cluster from the overall mean")
plt.xlabel("Cluster")
plt.ylabel("Feature")
plt.show()

觀察熱力圖特徵
'event_time_nunique':事件(如瀏覽、點擊、購買)發生的不重複時間戳數量
'event_type_count':事件類型(如 view, purchase, cart 等)上的總行為次數
'product_id_nunique':互動過(瀏覽、加入購物車、購買等)的不重複商品數量
'brand_nunique':互動過(瀏覽、購買等)的不重複品牌數量
'price_sum':所有互動事件中商品價格的總和
'price_mean':所有互動事件中商品價格的平均
'year_nunique':發生事件的不重複年份數量
'month_nunique':發生事件的不重複月份數量
'day_nunique':發生事件的不重複天數數量
'hour_nunique':發生事件的不重複小時數量
'weekday_nunique':發生事件的不重複星期幾數量
'weeknum_nunique':發生事件的不重複週數
'session_duration':所有事件的平均持續時間
'period_afternoon'
'period_evening'
'period_midnight'
'period_morning'
'period_accessories'
'period_apparel'
'period_appliances'
'period_furniture'
'period_sport'
'period_stationery'
'total_purchase_value':總購買金額
可以看出
(Cluster 0):大多數為負值,價值低且活躍度低的潛在流失客戶
(Cluster 15):價值高且活躍度高,主要購買 apparel 的客戶