
6
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
下載kaggle 原始資料,我放在自己的github上


使用COLAB,複製ㄧ份到github


再從COLAB中,網址讀取
# Load the Kaggle "Wine Quality" raw data from the author's GitHub mirror
url = 'https://raw.githubusercontent.com/06Cata/Kaggle_Wine_Quality/main/raw_data/WineQT.csv'
df_wine = pd.read_csv(url)
df_wine.head(10)  # notebook-style preview of the first 10 rows

- 以克/升(g/dm³)計量
- ‘fixed acidity’ : 固定酸度
- ‘volatile acidity’: 揮發性酸度,這些酸對於葡萄酒的味道有重要影響
- ‘citric acid’ : 檸檬酸,可以增加葡萄酒的新鮮感和酸度
- ‘residual sugar’: 殘留糖分
- ‘chlorides’: 氯化物,對葡萄酒的口感和味道有影響
- ‘free sulfur dioxide’: 游離二氧化硫,以毫克/升(mg/dm³)計量,防止葡萄酒氧化和細菌感染的重要添加劑
- ‘total sulfur dioxide’ : 總二氧化硫,以毫克/升(mg/dm³)計量,與游離二氧化硫密切相關
- ‘density’: 密度,以克/立方厘米(g/cm³)計量,用來估計葡萄酒的酒精含量
- ‘pH’: 酸性程度
- ‘sulphates’: 硫酸鹽,以克/升(g/dm³)計量,可以增加葡萄酒的防腐能力和抗氧化性
- ‘alcohol’: 酒精含量,以百分比計量,影響葡萄酒的口感和醉酒感
- ‘quality’: 品質評分,介於 0 到 10 之間的整數
- ‘Id’
資訊、數值統計摘要
# Numeric summary statistics for every column
df_wine.describe()

# Count of missing values per column
df_wine.isna().sum()

# Column dtypes and non-null counts
df_wine.info()

# Check for fully duplicated rows (keep=False marks every copy of a duplicate)
filtered_data = df_wine[df_wine.duplicated(keep=False)]
filtered_data  # notebook-style display

# Box plots: value range of each physicochemical feature per quality score.
# FIX: removed the dead manual counter (`index = 0` / `index += 1`) — `enumerate`
# already supplies the axis index, so the extra increment was overwritten every loop.
lst = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
       'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
fig, ax = plt.subplots(2, 6, figsize=(15, 8))
ax = ax.flatten()
for index, val in enumerate(lst):
    sb.boxplot(df_wine, x='quality', y=val, ax=ax[index])
    ax[index].set_title(val)
plt.tight_layout(pad=0.4)
plt.show()

# Bar plots: mean of each feature for every quality score
# (the original header said "box plot", but sb.barplot draws mean bars)
fig, ax = plt.subplots(2, 6, figsize=(15, 8))
ax = ax.flatten()
index = 0
for col in df_wine.columns:
    # skip the target column and the row identifier
    if col not in ('quality', 'Id'):
        sb.barplot(df_wine, x='quality', y=col, ax=ax[index])
        ax[index].set_title(col)
        index += 1
plt.tight_layout(pad=0.4)
plt.show()

# Distribution of 'quality' scores (how many wines got each score)
quality_counts = df_wine['quality'].value_counts().sort_index()
plt.figure(figsize=(10, 6))
plt.bar(quality_counts.index, quality_counts.values, color='skyblue')
plt.title('Wine Quality Value Counts')
plt.xlabel('Quality')
plt.ylabel('Counts')
plt.grid(axis='y')
plt.show()

沒有缺漏,不用補
都是數值型,不需轉換
# Normalization option 1: z-score standardization (StandardScaler).
# Suited to roughly Gaussian features; tends to work well for SVMs / linear models.
from sklearn.preprocessing import StandardScaler

df_wine_standard = df_wine.copy()
scaler = StandardScaler()
# NOTE(review): `difference` only excludes 'quality', so the 'Id' column is scaled
# too; it is dropped later before modelling, so downstream results are unaffected.
feature_columns = df_wine_standard.columns.difference(["quality"])
df_wine_standard[feature_columns] = scaler.fit_transform(df_wine_standard[feature_columns])
df_wine_standard

# Normalization option 2: min-max scaling to [0, 1] (MinMaxScaler).
# Suited when the value range matters; commonly used for neural networks.
from sklearn.preprocessing import MinMaxScaler

df_wine_minmax = df_wine.copy()
min_max_scaler = MinMaxScaler()
# Reuses `feature_columns` computed in the standardization cell above
df_wine_minmax[feature_columns] = min_max_scaler.fit_transform(df_wine_minmax[feature_columns])
df_wine_minmax

# Binarize quality: scores in (0, 6] -> 'bad', (6, 10] -> 'good',
# then label-encode (alphabetical order: bad=0, good=1).
# FIX: the bin edges / labels were duplicated once per dataframe and a second
# LabelEncoder was pointlessly created; define once and loop over both frames.
from sklearn.preprocessing import LabelEncoder

bins = (0, 6.0, 10)
group_names = ['bad', 'good']
label_quality = LabelEncoder()
for _df in (df_wine_standard, df_wine_minmax):
    _df['quality'] = pd.cut(_df['quality'], bins=bins, labels=group_names)
    _df['quality'] = label_quality.fit_transform(_df['quality'])
df_wine_standard.head(3)
df_wine_minmax.head(3)

# 1
# Correlation heatmap for the standardized data (numeric columns only).
# `df_encoded_standard` is reused by the feature-importance cell below.
df_encoded_standard = df_wine_standard.select_dtypes(include=[np.number])
correlation_matrix_standard = df_encoded_standard.corr().round(2)
plt.figure(figsize=(12, 9))
sb.heatmap(correlation_matrix_standard, annot=True, cmap='RdBu_r', linewidths=0.2)
fig = plt.gcf()
plt.title("Correlation Heatmap of wine quality")
plt.show()
# Features most (anti-)correlated with quality, strongest first
print(correlation_matrix_standard['quality'].sort_values(ascending=False))


# 1
# Correlation heatmap for the min-max-scaled data (numeric columns only).
# `df_encoded_minmax` is reused by the feature-importance cell below.
df_encoded_minmax = df_wine_minmax.select_dtypes(include=[np.number])
correlation_matrix_minmax = df_encoded_minmax.corr().round(2)
plt.figure(figsize=(12, 9))
sb.heatmap(correlation_matrix_minmax, annot=True, cmap='RdBu_r', linewidths=0.2)
fig = plt.gcf()
plt.title("Correlation Heatmap of wine quality")
plt.show()
# Features most (anti-)correlated with quality, strongest first
print(correlation_matrix_minmax['quality'].sort_values(ascending=False))



# 1
# Feature importance from a random forest fitted on the standardized data
from sklearn.ensemble import RandomForestClassifier

X = df_encoded_standard.drop(['quality'], axis=1)
y = df_encoded_standard['quality']
model = RandomForestClassifier()
model.fit(X, y)

# Per-feature impurity-based importances, sorted descending
feature_importance = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Bar chart of the importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Feature Importances')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()
feature_importance_df

# 1
# Feature importance from a random forest fitted on the min-max-scaled data
from sklearn.ensemble import RandomForestClassifier

X = df_encoded_minmax.drop(['quality'], axis=1)
y = df_encoded_minmax['quality']
model = RandomForestClassifier()
model.fit(X, y)

# Per-feature impurity-based importances, sorted descending
feature_importance = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Bar chart of the importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Feature Importances')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()
feature_importance_df

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Baseline model comparison on df_wine_standard: mean 5-fold CV accuracy on the training split
X = df_wine_standard.drop('quality', axis=1)
y = df_wine_standard['quality']
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

def _cv_accuracy(estimator):
    """Mean 5-fold cross-validated accuracy of `estimator` on the current training split."""
    return cross_val_score(estimator, train_X, train_y.values.ravel(), cv=5, scoring='accuracy').mean()

# Logistic Regression
log = LogisticRegression(random_state=0, max_iter=3000)
scores_log = _cv_accuracy(log)
print(scores_log)
# Decision Tree
decision_tree = DecisionTreeClassifier()
scores_decision_tree = _cv_accuracy(decision_tree)
print(scores_decision_tree)
# Random Forest
rfc = RandomForestClassifier(n_estimators=100)
scores_rfc = _cv_accuracy(rfc)
print(scores_rfc)
# Support Vector Machine
svc = SVC()
scores_svc = _cv_accuracy(svc)
print(scores_svc)
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=3)
scores_knn = _cv_accuracy(knn)
print(scores_knn)
# Gaussian Naive Bayes
gaussian = GaussianNB()
scores_gaussian = _cv_accuracy(gaussian)
print(scores_gaussian)
# Gradient Boosting
Gradient = GradientBoostingClassifier()
# BUG FIX: this previously cross-validated `gaussian` (GaussianNB) again, so the
# printed "gradient boosting" score was actually the Naive Bayes score.
scores_gradient = _cv_accuracy(Gradient)
print(scores_gradient)

# Baseline model comparison on df_wine_minmax: mean 5-fold CV accuracy on the training split
X = df_wine_minmax.drop('quality', axis=1)
y = df_wine_minmax['quality']
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

def _cv_accuracy_minmax(estimator):
    """Mean 5-fold cross-validated accuracy of `estimator` on the current training split."""
    return cross_val_score(estimator, train_X, train_y.values.ravel(), cv=5, scoring='accuracy').mean()

# Logistic Regression
log = LogisticRegression(random_state=0, max_iter=3000)
scores_log_minmax = _cv_accuracy_minmax(log)
print(scores_log_minmax)
# Decision Tree
decision_tree = DecisionTreeClassifier()
scores_decision_tree_minmax = _cv_accuracy_minmax(decision_tree)
print(scores_decision_tree_minmax)
# Random Forest
rfc = RandomForestClassifier(n_estimators=100)
scores_rfc_minmax = _cv_accuracy_minmax(rfc)
print(scores_rfc_minmax)
# Support Vector Machine
svc = SVC()
scores_svc_minmax = _cv_accuracy_minmax(svc)
print(scores_svc_minmax)
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=3)
scores_knn_minmax = _cv_accuracy_minmax(knn)
print(scores_knn_minmax)
# Gaussian Naive Bayes
gaussian = GaussianNB()
scores_gaussian_minmax = _cv_accuracy_minmax(gaussian)
print(scores_gaussian_minmax)
# Gradient Boosting
Gradient = GradientBoostingClassifier()
# BUG FIX: this previously cross-validated `gaussian` (GaussianNB) again, so the
# printed "gradient boosting" score was actually the Naive Bayes score.
scores_gradient_minmax = _cv_accuracy_minmax(Gradient)
print(scores_gradient_minmax)

選擇df_wine_minmax的Random_Forest_Classifier
# Final choice: Random Forest on df_wine_minmax, evaluated on a held-out 20% test split.
# BUG FIX: classification_report / accuracy_score were never imported anywhere in
# this file, so this cell raised NameError at runtime.
from sklearn.metrics import classification_report, accuracy_score

X = df_wine_minmax.drop('quality', axis=1)
y = df_wine_minmax['quality']
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train_X, train_y)
y_pred = rfc.predict(test_X)

report = classification_report(test_y, y_pred)
print(report)
accuracy = accuracy_score(test_y, y_pred)
print("Random Forest Classifier Accuracy:", accuracy)

查看,假設減少重要性 < 0.07 的特徵,準確率是否比沒減少來的好
# Check whether dropping the low-importance (< 0.07) features improves accuracy
features_to_drop = ['Id', 'free sulfur dioxide', 'residual sugar', 'fixed acidity', 'chlorides', 'pH']
X_reduced = X.drop(features_to_drop, axis=1)
train_X_reduced, test_X_reduced, train_y, test_y = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train_X_reduced, train_y)
y_pred = rfc.predict(test_X_reduced)

report = classification_report(test_y, y_pred)
print(report)
accuracy = accuracy_score(test_y, y_pred)
print("Random Forest Classifier Accuracy with Reduced Features:", accuracy)
# Accuracy improved, so these low-importance features are dropped from here on

因為樣本數不平均,試試上採樣、下採樣
# Handle the imbalanced 'quality' classes: try SMOTE upsampling, then (next cell)
# random undersampling, and compare.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

X = df_wine_minmax.drop(['quality', 'Id', 'free sulfur dioxide', 'residual sugar', 'fixed acidity', 'chlorides', 'pH'], axis=1)
y = df_wine_minmax['quality']

# SMOTE: synthesize minority-class samples until the classes are balanced
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(pd.Series(y_resampled).value_counts())
print()

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
rfc_smote = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_smote.fit(X_train, y_train)
y_pred_smote = rfc_smote.predict(X_test)

# Keep a copy of the test features with actual and predicted labels attached
X_test_with_predictions = X_test.copy()
X_test_with_predictions['quality'] = y_test.values
X_test_with_predictions['quality_pred'] = y_pred_smote

report_smote = classification_report(y_test, y_pred_smote)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
print("SMOTE 上採樣後的隨機森林分類器分類報告:")
print(report_smote)
print()
print("SMOTE 上採樣後的隨機森林分類器準確度:", accuracy_smote)

# Random undersampling: shrink the majority class to the minority-class size
undersample = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X, y)
print(pd.Series(y_resampled).value_counts())
print()

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
rfc_undersample = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_undersample.fit(X_train, y_train)
y_pred_undersample = rfc_undersample.predict(X_test)

report_undersample = classification_report(y_test, y_pred_undersample)
accuracy_undersample = accuracy_score(y_test, y_pred_undersample)
print("下採樣後的隨機森林分類器分類報告:")
print(report_undersample)
print()
print("下採樣後的隨機森林分類器準確度:", accuracy_undersample)

有比較好,因此選擇「上採樣」
# Log loss of the chosen (SMOTE) random forest
from sklearn.metrics import log_loss

# Predicted class probabilities on the train and test splits
train_proba = rfc_smote.predict_proba(X_train)
test_proba = rfc_smote.predict_proba(X_test)
train_loss = log_loss(y_train, train_proba)
test_loss = log_loss(y_test, test_proba)
# A random forest is not trained in epochs, so each single loss value is simply
# repeated 10 times to draw flat "loss over epochs" curves for illustration.
loss_list = [train_loss] * 10
test_loss_list = [test_loss] * 10
print(f'Train Loss: {train_loss:.4f}')
print(f'Test Loss: {test_loss:.4f}')

plt.plot(loss_list, label="Training Loss", linewidth=3)
plt.plot(test_loss_list, label="Validation Loss", linewidth=3)
plt.legend()
plt.xlabel("Epoch")
plt.ylabel("Log Loss")
plt.title("Log Loss over Epochs")
plt.show()

優化模型
# Hyper-parameter search for the random forest.
# BUG FIX: RandomizedSearchCV was never imported -> NameError at runtime.
from sklearn.model_selection import RandomizedSearchCV

rfc = RandomForestClassifier(random_state=42)
param_distributions = {
    'max_depth': [5, 10, 20],
    'n_estimators': [10, 50, 100, 150],
}
# random_state added so the sampled parameter combinations are reproducible
# (the grid has only 12 combinations; the default n_iter=10 samples 10 of them)
cv = RandomizedSearchCV(rfc, param_distributions, cv=5, random_state=42)
cv.fit(train_X, train_y)
# Best parameter combination found
print("Best parameters found: ", cv.best_params_)
# Mean CV score of the best combination
print("Best score found: ", cv.best_score_)

results = pd.DataFrame(cv.cv_results_)
results.head(3)

# Retrain a random forest with the best found hyper-parameters
# (n_estimators=100, max_depth=20) on the SMOTE-balanced data.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

X = df_wine_minmax.drop(['quality', 'Id', 'free sulfur dioxide', 'residual sugar', 'fixed acidity', 'chlorides', 'pH'], axis=1)
y = df_wine_minmax['quality']

# SMOTE upsampling, as selected earlier
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(pd.Series(y_resampled).value_counts())
print()

# Hold out 20% for testing, then carve a stratified 10% validation set out of training
train_X, test_X, train_y, test_y = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.1, random_state=43, stratify=train_y)

best_rfc = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
best_rfc.fit(train_X, train_y)
y_pred_best = best_rfc.predict(test_X)

# Attach actual and predicted labels to the test features for inspection
X_test_with_predictions = test_X.copy()
X_test_with_predictions['quality'] = test_y.values
X_test_with_predictions['quality_pred_best'] = y_pred_best

report_smote_best = classification_report(test_y, y_pred_best)
accuracy_smote_best = accuracy_score(test_y, y_pred_best)
print("優化後,SMOTE 上採樣後的隨機森林分類器分類報告:")
print(report_smote_best)
print()
print("優化後,SMOTE 上採樣後的隨機森林分類器準確度:", accuracy_smote_best)

import torch
# 創建模型架構
class Model(torch.nn.Module):
    """Two-hidden-layer MLP (64 -> 32) with a sigmoid output for binary classification."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.hidden1 = torch.nn.Linear(input_size, 64)
        self.hidden2 = torch.nn.Linear(64, 32)
        self.predict = torch.nn.Linear(32, output_size)

    def forward(self, x):
        # ReLU after each hidden layer, sigmoid on the output to get probabilities
        h = torch.relu(self.hidden1(x))
        h = torch.relu(self.hidden2(h))
        return torch.sigmoid(self.predict(h))
# Initialize the model and optimizer
model = Model(test_X.shape[1], 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.0)  # plain SGD over model parameters
loss_func = torch.nn.BCELoss()
# Convert the pandas splits to float32 torch tensors; targets get an extra
# trailing dimension to match the model's (batch, 1) output shape
train_X_data = torch.tensor(train_X.values, dtype=torch.float32)
train_y_data = torch.tensor(np.expand_dims(train_y, axis=1), dtype=torch.float32)
val_X_data = torch.tensor(val_X.values, dtype=torch.float32)
val_y_data = torch.tensor(np.expand_dims(val_y, axis=1), dtype=torch.float32)
test_X_data = torch.tensor(test_X.values, dtype=torch.float32)
test_y_data = torch.tensor(np.expand_dims(test_y, axis=1), dtype=torch.float32)
# Train with mini-batch SGD
batch_size = 32
num_epochs = 200
training_losses = []
val_losses = []
for epoch in range(num_epochs):
    # one pass over the training data in contiguous mini-batches
    for i in range(0, len(train_X_data), batch_size):
        prediction = model(train_X_data[i:i+batch_size])
        loss = loss_func(prediction, train_y_data[i:i+batch_size])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # record full-dataset training and validation loss once per epoch
    loss = loss_func(model(train_X_data), train_y_data)
    training_losses.append(float(loss))
    y_pred = model(val_X_data)
    val_loss = loss_func(y_pred, val_y_data)
    # validation accuracy: probabilities thresholded at 0.5
    print("training loss:{}, val loss:{}, val acc:{}".format(float(loss), val_loss, accuracy_score(val_y_data, np.where(y_pred >= 0.5, 1, 0))))
    val_losses.append(float(val_loss))
# Evaluate on the held-out test set
model.eval()
with torch.no_grad():
    test_prediction = model(test_X_data)
    test_prediction = test_prediction.round()  # round probabilities to 0 or 1
    accuracy = (test_prediction.eq(test_y_data).sum().float() / test_y_data.shape[0]).item()
    print(f'Test Accuracy: {accuracy:.4f}')

import numpy as np
from sklearn.metrics import classification_report
# Predict on the test set with the trained PyTorch model
with torch.no_grad():
    probs = model(test_X_data)
    y_pred = np.where(probs >= 0.5, 1, 0)  # binarize sigmoid outputs at 0.5

print(classification_report(test_y_data, y_pred))

# Attach ground truth and predictions to the test features for inspection
X_test_with_predictions_pytorch = test_X.copy()
X_test_with_predictions_pytorch['quality'] = test_y.values
X_test_with_predictions_pytorch['quality_pred_pytorch'] = y_pred
X_test_with_predictions_pytorch

# Loss curves for the PyTorch training run
plt.plot(training_losses, label="Training Loss", linewidth=3)
plt.plot(val_losses, label="Validation Loss", linewidth=3)
plt.legend()
plt.xlabel("Epoch")
plt.ylabel("BCE Loss")

# Test-set accuracy of the thresholded PyTorch predictions
accuracy_score(test_y, y_pred)

!pip install keras==2.15.0
!pip install tensorflow==2.12
!pip install tensorflow-addons==0.23.0
!pip install typeguard==2.13.3
import tensorflow_addons as tfa
print(tfa.__version__)
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
model = tf.keras.models.Sequential([
layers.Dense(64, name="hidden1", activation="relu"),
layers.Dense(32, name="hidden2", activation="relu"),
layers.Dense(1, name="output", activation=tf.nn.sigmoid),
])
optimizer = keras.optimizers.SGD(lr=0.05)
model.compile(optimizer=optimizer, loss=tfa.losses.SigmoidFocalCrossEntropy(), metrics=["Accuracy"])
model.fit(train_X, train_y, validation_data=(val_X, val_y), epochs=200, batch_size=32)

# Evaluate the Keras model on the held-out test split
probs = model.predict(test_X)
y_pred = np.where(probs >= 0.5, 1, 0)  # threshold probabilities at 0.5
print(classification_report(test_y, y_pred))

# Attach ground truth and predictions to the test features for inspection.
# NOTE(review): the variable name says "pytorch" but it holds the Keras predictions.
X_test_with_predictions_pytorch = test_X.copy()
X_test_with_predictions_pytorch['quality'] = test_y.values
X_test_with_predictions_pytorch['quality_pred_kera'] = y_pred
X_test_with_predictions_pytorch

accuracy = accuracy_score(test_y, y_pred)
