【Hung-yi Lee 機器學習 - L2 : Phoneme Classification(Classification) 】

Contents 目錄

參考資料

【機器學習 2022】再探寶可夢、數碼寶貝分類器 — 淺談機器學習原理

ML Lecture 4: Classification

ML Lecture 5: Logistic Regression

ML Lecture 6: Brief Introduction of Deep Learning

Lecture 2 : Deep Learning Introduction

為什麼要用分類？而非線性迴歸（regression）？

線性迴歸會找到平均 Loss 最小的線，因此得到的紫色線，分類結果反而比綠色線差（紫色線 < 綠色線）

第一步: 猜測符合raw_data的函式

第二步: 定義 Gaussian distribution，找到最大可能性

假設有一個 Gaussian distribution 可以找到同一個 class 所有點的 mean µ

帶入新的x，算出新的機率
目的是尋找最大的可能性，越接近的mean µ，越有可能為該 class

假設水系寶可夢61隻、一般系寶可夢79隻

為了減少參數（參數越多，越有可能造成 overfitting），可以讓各個 class 共用同一個 covariance matrix

HW 2-1 : Phoneme Classification

使用音檔轉換好的vec＋音素label，判斷測試集的音素label

多類別分類（multiclass classification），目標是利用深度神經網路（DNN，適合用於 MFCC）訓練一個語音分類器

def get_device():
    """Return the name of the device computations should run on.

    Returns:
        'cuda' when torch reports an available CUDA device, otherwise 'cpu'.
    """
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

def same_seeds(seed):
    """Seed the NumPy and PyTorch random generators and pin cuDNN settings.

    Args:
        seed: Value handed to every seeding call below.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # Seed the current CUDA device first, then all CUDA devices.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Use deterministic cuDNN algorithms so operations such as convolutions
    # do not introduce run-to-run variation.
    torch.backends.cudnn.deterministic = True
    # Stop PyTorch from automatically searching for the fastest algorithm.
    torch.backends.cudnn.benchmark = False

def predict(test_loader, model, device):
    """Run the model over every batch of a loader and collect predicted classes.

    Args:
        test_loader: Iterable yielding batches of input tensors (features only,
            no labels).
        model: Module called on each batch; its output is reduced with
            ``torch.max(outputs, 1)``, so it must have a class dimension at
            index 1.
        device: Device each input batch is moved to before the forward pass.

    Returns:
        A list with one predicted class index per input row, in loader order.
        Returns an empty list when the loader yields no batches.
    """
    model.eval()  # evaluation mode
    predictions = []
    with torch.no_grad():  # inference only, no gradients needed
        for inputs in test_loader:
            outputs = model(inputs.to(device))
            # torch.max returns (values, indices); only the indices are needed.
            _, test_pred = torch.max(outputs, 1)
            predictions.extend(test_pred.cpu().numpy())

    return predictions

# 資料集
# 處理因變數、自變數

import torch
from torch.utils.data import Dataset

class TIMITDataset(Dataset):
    """Dataset pairing a feature array with an optional label array.

    Args:
        X: NumPy array of features; converted to a float tensor.
        y: Optional NumPy array of labels; cast to int and stored as a
            LongTensor. When omitted, the dataset yields features only.
    """

    def __init__(self, X, y=None):
        self.data = torch.from_numpy(X).float()
        # Labelled (train/validation) data keeps a label tensor; unlabelled
        # (test) data stores None.
        self.label = None if y is None else torch.LongTensor(y.astype(int))

    def __getitem__(self, idx):
        """Return ``(features, label)`` when labels exist, else just features."""
        features = self.data[idx]
        if self.label is None:
            return features
        return features, self.label[idx]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 

class Classifier(nn.Module):
    """Fully connected classifier that outputs per-class log-probabilities.

    The network is three hidden linear layers (1024, 512, 128 units) with ReLU
    activations followed by a linear output layer.

    Args:
        input_dim: Size of each input feature vector. Defaults to 429.
        num_classes: Number of output classes. Defaults to 39.
    """

    def __init__(self, input_dim=429, num_classes=39):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 1024)
        self.layer2 = nn.Linear(1024, 512)
        self.layer3 = nn.Linear(512, 128)
        self.out = nn.Linear(128, num_classes)

        self.act_fn = nn.ReLU()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, num_classes)`` log-probabilities.

        The output is already passed through ``log_softmax`` over dim 1, so it
        pairs directly with ``nn.NLLLoss``.
        """
        x = self.act_fn(self.layer1(x))
        x = self.act_fn(self.layer2(x))
        x = self.act_fn(self.layer3(x))
        x = self.out(x)
        return F.log_softmax(x, dim=1)

總共有三個檔案
train_11.npy: training data 語音特徵，已經提取MFCC
train_label_11.npy: training label 對應的音素類別
test_11.npy: testing data (x)

# Notebook shell commands: download the archive as data.zip with gdown,
# unpack it, then list the working directory.
# NOTE(review): newer gdown releases deprecate the --id flag -- confirm
# against the installed gdown version.
!gdown --id '1HPkcmQmFGu-3OknddKIa5dNDsR05lIQR' --output data.zip
!unzip data.zip
!ls

import numpy as np

# Load the three .npy arrays from data_root and print a quick summary of each.
print('Loading data ...')

data_root='./timit_11/'
train_ori = np.load(data_root + 'train_11.npy')
train_label_ori = np.load(data_root + 'train_label_11.npy')
test = np.load(data_root + 'test_11.npy')

print('Size of training data: {}'.format(train_ori.shape))
print('Size of testing data: {}'.format(test.shape))

# Inspect the dtype of each array
print(f"Train Data Type: {train_ori.dtype}")
print(f"Train Label Type: {train_label_ori.dtype}")
print(f"Test Data Type: {test.dtype}")

# Show a few samples (first 5 rows)
print("\nTrain Data Sample:\n", train_ori[:5])
print("\nTrain Labels Sample:\n", train_label_ori[:5])
print("\nTest Data Sample:\n", test[:5])

查看音素類別有幾種

import numpy as np
# Print the distinct label values found in the training label file.
# Read it from data_root, the same directory the other arrays are loaded
# from, rather than from a separate hard-coded path.
labels = np.load(data_root + 'train_label_11.npy')
print(np.unique(labels))

因為載的dataset沒有test的解答，這裡不使用
改成先切出 10% test data，剩下再分 80% train data + 20% valid data

import numpy as np

np.random.seed(77777)

# train_ori and train_label_ori hold the features and labels respectively.
# Hold out 10% of the labelled data as a test split.
TEST_RATIO = 0.1
# Shuffle features and labels with one shared index permutation so rows stay
# aligned; this avoids materialising a Python list of (row, label) pairs.
shuffled_idx = np.random.permutation(train_ori.shape[0])
train_ori_shuffled = train_ori[shuffled_idx]
train_label_ori_shuffled = train_label_ori[shuffled_idx]

train_size = int(train_ori_shuffled.shape[0] * (1 - TEST_RATIO))
train = train_ori_shuffled[:train_size]  # 90% training features
train_label = train_label_ori_shuffled[:train_size]  # 90% training labels

# NOTE: this rebinds `test`, replacing the array loaded from test_11.npy.
test = train_ori_shuffled[train_size:]  # 10% held-out features
test_label = train_label_ori_shuffled[train_size:]  # 10% held-out labels

print("\nTrain Data Sample:\n", train[:5])
print("\nTrain Labels Sample:\n", train_label[:5])

print("\nTest Data Sample:\n", test[:5])
print("\nTest Labels Sample:\n", test_label[:5])

# Save the ground-truth labels of the held-out 10% for later comparison.
import pandas as pd
test_label_df = pd.DataFrame(test_label, columns=['label'])
test_label_df.to_csv('test_data_y_hat.csv', index=False)

# Split what remains into 80% training data and 20% validation data.
VAL_RATIO = 0.2

percent = int(train.shape[0] * (1 - VAL_RATIO))
train_x, val_x = train[:percent], train[percent:]
train_y, val_y = train_label[:percent], train_label[percent:]
print('Size of training set: {}'.format(train_x.shape))
print('Size of validation set: {}'.format(val_x.shape))

import gc
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# Training data: wrapped and shuffled on every pass.
train_set = TIMITDataset(train_x, train_y)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

# Validation data: wrapped and read in a fixed order.
val_set = TIMITDataset(val_x, val_y)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

# Drop the array names that are no longer needed, then run the garbage
# collector to release unused memory.
del train, train_label
del train_x, train_y
del val_x, val_y
gc.collect()

# Seed the random generators before the model is constructed.
same_seeds(0)

device = get_device()
print(f'DEVICE: {device}')

# Training hyper-parameters.
num_epoch = 50
learning_rate = 0.0001

# Path the model weights are saved to and loaded from.
model_path = './model.ckpt'

model = Classifier().to(device)
# Classifier.forward already returns log-probabilities (log_softmax), so
# NLLLoss is the matching criterion; CrossEntropyLoss would apply log_softmax
# a second time.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# training

# Train for up to num_epoch epochs. When a validation set exists, save the
# weights whenever validation accuracy improves; otherwise save after the
# final epoch.

# Number of consecutive epochs without a validation-accuracy improvement that
# is tolerated before training stops.
# NOTE(review): 400 is larger than num_epoch, so early stopping cannot trigger
# as configured -- lower this value to enable it.
early_stop_patience = 400
early_stop_count = 0
best_acc = 0.0
for epoch in range(num_epoch):
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # training
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        # torch.max returns (values, indices); the indices are the predicted classes.
        _, train_pred = torch.max(outputs, 1)
        batch_loss.backward()
        optimizer.step()

        # Accumulate the number of correct predictions and the batch loss.
        # Compared on the current device; .item() moves only the scalar.
        train_acc += (train_pred == labels).sum().item()
        train_loss += batch_loss.item()

    # validation
    if len(val_set) > 0:
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                _, val_pred = torch.max(outputs, 1)

                val_acc += (val_pred == labels).sum().item()
                val_loss += batch_loss.item()

        # {:03d}: integer zero-padded to 3 digits (e.g. 001)
        # {:3.6f}: float shown with 6 decimal places
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader)
        ))

        # Save the weights when validation accuracy improves; otherwise count
        # one more epoch without improvement.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print('saving model with acc {:.3f}'.format(best_acc/len(val_set)))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= early_stop_patience:
            print('\nEarly stop')
            break
    else:
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader)
        ))

# if not validating, save the last epoch
if len(val_set) == 0:
    torch.save(model.state_dict(), model_path)
    print('saving model at last epoch')

測試並存檔

# testing: predict on the held-out split and write the predictions to CSV.
test_set = TIMITDataset(test, None)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

model = Classifier().to(device)
# map_location remaps the saved tensors onto the current device, so loading
# still works when the checkpoint was saved on a different device.
model.load_state_dict(torch.load(model_path, map_location=device))

preds = predict(test_loader, model, device)
pd.DataFrame(preds, columns=['Predictions']).to_csv('prediction.csv', index=False)

from sklearn.metrics import accuracy_score, confusion_matrix

# Compare the two CSV files against each other: predictions vs. saved labels.

# Predicted labels
pred_df = pd.read_csv('prediction.csv')
preds = pred_df['Predictions'].values

# Ground-truth labels
test_data_y_hat_df = pd.read_csv('test_data_y_hat.csv')
test_data_y_hat = test_data_y_hat_df['label'].values

# Compute accuracy
accuracy = accuracy_score(test_data_y_hat, preds)
print(f'Accuracy: {accuracy:.4f}')

# Compute the confusion matrix
cm = confusion_matrix(test_data_y_hat, preds)
print('Confusion Matrix:')
print(cm)