Data Science 2: Advanced Topics in Data Science

Final Project: Sentiment Analysis from Audio Data¶

Harvard University
Spring 2024
Team 26: Vincent Hock, Conrad Hock, Cooper Bosch, Tomas Arevalo, Jake Pappo


Notebook Contents¶

Instructions for running notebook / organization of code ....

  • Setup

  • Problem Statement

  • Data Description & Preprocessing

  • EDA

  • Baseline Model

  • Final Models

    • FFNN / CNN
    • LSTM
    • Transformer
    • SOTA
  • Discussion

Setup¶

Load libraries, install dependencies, and configure formatting

In [1]:
# HTML Formatting
import requests
from IPython.core.display import HTML
styles = requests.get(
    "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/"
    "content/styles/cs109.css"
).text
HTML(styles);

One library our code relies on heavily is the librosa package. It is designed specifically for audio and music analysis, and it provides the functionality to convert our signal data into Mel spectrograms via fast Fourier transforms.

We also use the transformers module from Hugging Face, which is key to loading pretrained models and fine-tuning them. The resulting Wav2Vec2 model (described below) is primarily built in PyTorch, so we also installed torchinfo as a tool to summarize model features.
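As a quick illustration of how these pieces fit together, the sketch below loads a pretrained Wav2Vec2 checkpoint with an audio classification head and prints a torchinfo summary. The checkpoint name (facebook/wav2vec2-base) and the 8-label head are assumptions mirroring the setup used in the SOTA section; this is a sketch, not the exact code used later.

# Sketch: load a pretrained Wav2Vec2 backbone with an 8-way classification
# head and summarize it with torchinfo (checkpoint name is an assumption).
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from torchinfo import summary

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
wav2vec2 = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=8
)
summary(wav2vec2, depth=2)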

In [2]:
# Install dependencies
# !pip install librosa 
# !pip install evaluate 
# !pip install transformers
# !pip install torchinfo
In [3]:
# Import libraries
import re
import time
import os
import wave
import scipy
import librosa
import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import csv

from tensorflow import keras
from keras.models import Model, Sequential, load_model
from keras import layers
from keras import losses
from keras import optimizers
from keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint
from tensorflow.keras import layers
from keras.layers import Input, Embedding, SimpleRNN, GRU, LSTM, TimeDistributed, Bidirectional, Dense
from keras.layers import  BatchNormalization, Activation, Dropout, GaussianNoise, LayerNormalization
from keras.layers import Conv2D, MaxPooling2D, Flatten, Layer
from keras.regularizers import L1
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from IPython.display import Audio

from datasets import Dataset
from datasets import Audio as AudioCast
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
from torchinfo import summary
import seaborn as sns
In [4]:
path = os.getcwd() + '/Audio_Speech_Actors_01-24'
path
Out[4]:
'/home/u_388354/project/Audio_Speech_Actors_01-24'

Problem Statement¶

The aim of this project is to develop a robust emotion classification system capable of accurately identifying the emotional state of a speaker based on an audio clip of their speech. Emotion classification from audio presents a considerable challenge due to the inherent variability in speech patterns, pitch and tone, and the subjective nature of emotions. But the benefits of such a model are apparent: its deployment in voice assistants or sentiment analysis tools would enable timely and accurate emotion recognition from audio inputs, fostering advancements in human-computer interaction and affective computing.

We will explore a number of different model architectures to accurately classify each audio clip into 1 of 8 different emotional tones, and weigh the advantages and drawbacks of each approach to better understand the difficulties and opportunities associated with audio sentiment analysis.

Data Preprocessing¶

Data Description¶

Our data come from the Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS). This database contains audio recordings from 24 professional voice actors, 12 female and 12 male, each of whom, as stated by RAVDESS, has a "neutral North American accent". Every actor says two phrases: "Dogs are sitting by the door" and "Kids are talking by the door". Each phrase is spoken in 8 different emotional tones ("neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised") and at two emotional intensities ("normal", "strong"). It is important to note that there is no strong intensity for the "neutral" emotion. Finally, each actor records every phrase-emotion-intensity combination twice, so every actor has 60 speech recordings in total. RAVDESS also provides song audio and video, but we only use the speech audio data.

There are 1440 audio samples, with an average of 177632 data points per sample. We don't have many samples, but each sample carries a lot of data, so we will need to perform per-sample dimensionality reduction and use data augmentation to generate more samples.

Summary of Data¶

After the preprocessing outlined in Milestone 2, there are 1440 samples, each with a stream of 253125 integer values. These samples are each pre-padded with zeros to standardize their length. We chose 253125 because it was the smallest number larger than the maximum sample length that is divisible by what we deemed a sufficient set of factors: 1, 3, 5, 9, 15, 25, 27, … (253125 = 3^4 * 5^5, matching MAX_LEN below). This will be useful for reducing the dimensionality of the data by averaging consecutive points or selecting every nth value, as well as in our potential exploration of Fourier transforms for feature selection.
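Because 253125 has many small factors, block-averaging for dimensionality reduction becomes a one-line reshape. A minimal sketch (the array name here is illustrative, not from the notebook):

# Sketch: downsample a zero-padded signal of length 253125 by averaging
# consecutive blocks of 5 samples (253125 is divisible by 5).
import numpy as np

signal = np.zeros(253125, dtype=np.int16)              # illustrative padded sample
downsampled = signal.reshape(-1, 5).mean(axis=1)       # shape: (50625,)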

For ease of use, we have constructed a Pandas DataFrame containing, for each audio sample, the corresponding Emotion, Intensity, Statement, Repetition, Actor, Gender of Actor, Frame Rate, and Number of Frames before padding, as well as the np.array containing the data stream itself.

Data Preprocessing¶

In [5]:
def read_wav_file(file_path):
    with wave.open(file_path, 'rb') as wav_file:
        num_channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frame_rate = wav_file.getframerate()
        num_frames = wav_file.getnframes()

        # Read the raw audio data
        raw_data = wav_file.readframes(num_frames)

    # Convert the raw audio data to a numpy array
    if sample_width == 2:
        data_type = np.int16
    elif sample_width == 4:
        data_type = np.int32
    else:
        raise ValueError("Unsupported sample width")

    audio_data = np.frombuffer(raw_data, dtype=data_type)

    # Reshape the numpy array if there are multiple channels
    if num_channels > 1:
        audio_data = audio_data.reshape(-1, num_channels)

    return audio_data, frame_rate
In [6]:
wav_paths = []

# os.walk gives files recursively
for root, dirs, files in os.walk(os.getcwd()):
    for file in files:
        # ignore .DS_Store
        if file.endswith('.wav'):
            wav_path = os.path.join(root, file)
            wav_paths.append(wav_path)
In [7]:
# Pick padding length that enables prime factorization
MAX_LEN = 3**4 * 5**5
In [8]:
# Helper function for info, can be made categorical instead of numeric
id2label = {
                0: 'neutral', 
                1: 'calm', 
                2: 'happy',
                3: 'sad', 
                4: 'angry',
                5: 'fearful',
                6: 'disgust',
                7: 'surprised'
              }

label2id = {v: k for k, v in id2label.items()}

def info_dict(path):
    dict = {}
    emotion_number = int(path[-18:-16]) - 1
    dict['Emotion_Number'] = emotion_number
    dict['Emotion'] = id2label[emotion_number]
    dict['Intensity'] = int(path[-15:-13])
    statement_number = int(path[-12:-10])
    dict['Statement_Number'] = statement_number
    dict['Statement'] = ['OOB', 'Kids', 'Dogs'][statement_number]
    dict['Repetition'] = int(path[-9:-7])
    dict['Actor'] = int(path[-6:-4])
    # Odd actor numbers are male (1), even are female (0)
    gn = int(int(path[-6:-4]) % 2 == 1)
    dict['Gender_Number'] = gn
    dict['Gender'] = ['Female', 'Male'][gn]
    return dict
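The slicing above relies on the RAVDESS filename convention of seven two-digit, dash-separated fields (modality, vocal channel, emotion, intensity, statement, repetition, actor). The example path below is illustrative rather than a file from our directory:

# Example of how info_dict reads a RAVDESS-style filename.
# '03-01-06-01-02-01-12.wav' decodes (per the slices above) to:
# emotion '06' -> index 5 -> 'fearful', intensity '01', statement '02' -> 'Dogs',
# repetition '01', actor '12' -> even -> 'Female'.
example_path = '/some/dir/Actor_12/03-01-06-01-02-01-12.wav'
print(info_dict(example_path))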
In [9]:
data_list = []

for path in wav_paths:
    # helper function above
    dict = info_dict(path)
    data, fr = read_wav_file(path)
    dict['Frame_Rate'] = fr
    
    # Length without padding
    dict['Num_Frames'] = len(data)

    # Check for 5 cases where data is doubled
    if len(data.shape) != 1:
        data = data.T[0]

    # Do padding
    new_data = np.pad(data, (MAX_LEN - len(data), 0), 'constant')

    dict['Data'] = new_data
    data_list.append(dict)
    
df = pd.DataFrame(data_list)
In [10]:
df.head(3)
Out[10]:
Emotion_Number Emotion Intensity Statement_Number Statement Repetition Actor Gender_Number Gender Frame_Rate Num_Frames Data
0 5 fearful 1 2 Dogs 1 17 1 Male 48000 169770 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1 3 sad 2 2 Dogs 2 17 1 Male 48000 171371 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2 4 angry 2 1 Kids 2 17 1 Male 48000 179379 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Emotion_Number    1440 non-null   int64 
 1   Emotion           1440 non-null   object
 2   Intensity         1440 non-null   int64 
 3   Statement_Number  1440 non-null   int64 
 4   Statement         1440 non-null   object
 5   Repetition        1440 non-null   int64 
 6   Actor             1440 non-null   int64 
 7   Gender_Number     1440 non-null   int64 
 8   Gender            1440 non-null   object
 9   Frame_Rate        1440 non-null   int64 
 10  Num_Frames        1440 non-null   int64 
 11  Data              1440 non-null   object
dtypes: int64(8), object(4)
memory usage: 135.1+ KB

EDA¶

Visualize Audio Sample¶

In [12]:
# Example usage
test_path = wav_paths[100]

file_path = test_path
audio_data, frame_rate = read_wav_file(file_path)
print("Audio data shape:", audio_data.shape)
print("Frame rate:", frame_rate)
audio_data
Audio data shape: (166566,)
Frame rate: 48000
Out[12]:
array([ 1,  2,  2, ..., -5, -4, -5], dtype=int16)
In [13]:
Audio(audio_data, rate=frame_rate)
Out[13]:
[audio playback widget]
In [14]:
ms = range(audio_data.shape[0])
fig, ax = plt.subplots(figsize=(100, 20)) 
ax.plot(ms, audio_data, c='darkorange')
ax.axis('off')
plt.show()
[Figure: waveform of the sample audio clip]

Length Variation¶

In [15]:
histos=df['Num_Frames'].hist(by=df['Emotion'], sharex=True, figsize=(20,10), layout=(2,4), density=True)

histos[0,0].set_ylabel('Density')
histos[1,0].set_ylabel('Density')

histos[1,0].set_xlabel('Number of Frames')
histos[1,1].set_xlabel('Number of Frames')
histos[1,2].set_xlabel('Number of Frames')
histos[1,3].set_xlabel('Number of Frames')

plt.suptitle('Distribution of Audio Length by Emotion');
[Figure: Distribution of Audio Length by Emotion]

The distribution of audio lengths appears to be largely similar in shape among various emotions, with disgust showing more variation than the others.

In [16]:
mean_lens=df[['Emotion_Number', 'Num_Frames']].groupby(by=['Emotion_Number']).mean()['Num_Frames']
sd_lens=df[['Emotion_Number', 'Num_Frames']].groupby(by=['Emotion_Number']).std()['Num_Frames']
xs=list(id2label.values())
In [17]:
fig,ax=plt.subplots(1,1,figsize=(9,3))
plt.bar(height=mean_lens, x=xs,  yerr=sd_lens, color='lightblue')
plt.ylim(125000, 210000)
plt.xlabel('Emotion')
plt.ylabel('Length (# of Frames)')
plt.title('Average Audio Length by Emotion +/- Standard Deviation');
[Figure: Average Audio Length by Emotion +/- Standard Deviation]

Different emotions exhibit somewhat different average recording lengths. Nevertheless, the variation in recording length within each emotion appears to be greater than the variation among the emotions' average lengths.

Amplitude Variation¶

In [18]:
emotions = range(0, 8)

# Create subplots
fig, ax = plt.subplots(figsize=(20, 10))

# Position for each box plot
positions = np.arange(1, len(emotions) + 1)

# Iterate over emotions
box_data = []
for i, emotion in enumerate(emotions):
    # Filter the DataFrame
    filtered_df = df[(df['Emotion_Number'] == emotion)]
    
    # Combine the data into a single array
    vals = np.concatenate(filtered_df['Data'].values)
    for j in range(-20, 21):
        vals = vals[vals != j]
    
    box_data.append(vals)

# Create the box plot
boxplot = ax.boxplot(box_data, positions=positions, vert=True, patch_artist=True, showfliers=False)

# Add labels and grid
ax.set_ylabel('Amplitude')
ax.set_xlabel('Emotion')
ax.set_xticklabels([id2label[emotion] for emotion in emotions])
ax.set_title("Emotion Ampltitude Distributions")

# Customize colors
for patch in boxplot['boxes']:
    patch.set_facecolor('lightblue')

for median in boxplot['medians']:
    median.set(color='black')
    
plt.show()
[Figure: Emotion Amplitude Distributions (boxplots)]

In the visual above, each boxplot represents a different emotion. It was generated by concatenating the 1D amplitude arrays of every recording for a given emotion and visualizing the combined values as a boxplot. Immediately we can see that the boxes are much narrower for emotions such as neutral and calm than for angry and fearful. This makes sense, as we would expect more heightened emotions to have higher amplitudes because they are more expressive. Another thing to notice is that the distributions are all centered around 0, which follows from the nature of sound waves. Lastly, while not obvious in this plot, it is important to mention how many values at or near 0 there were due to frames with no sound; because of this, the distribution of amplitude values appears more concentrated around 0 than it perhaps should be. The histograms below therefore exclude values from -20 to 20 (the same filtering applied in the boxplot code above) to remove these near-silent frames.

In [19]:
# Create subplots
fig, axs = plt.subplots(4, 2, figsize=(10, 10))
fig.suptitle('Histograms of Audio Data for Different Emotions and Intensities')

# Iterate over emotions
for i, emotion in enumerate(emotions):
    # Calculate subplot position
    row = i // 2
    col = i % 2

    # Filter the DataFrame
    filtered_df = df[(df['Emotion_Number'] == emotion)]
    
    # Combine the data into a single array
    vals = np.concatenate(filtered_df['Data'].values)
    for j in range(-20, 21):
        vals = vals[vals != j]
    
    # Plot the histogram
    axs[row, col].hist(vals, bins=100, color='skyblue', edgecolor='black')
    axs[row, col].set_title(id2label[emotion])
    axs[row, col].set_xlabel('Amplitude')
    axs[row, col].set_ylabel('Count')

# Adjust layout
plt.tight_layout()
plt.show()
[Figure: Histograms of Audio Data for Different Emotions and Intensities]

Standard Deviation of $\Delta$amplitude¶

In [20]:
data_array = np.array(list(df['Data']))
data_averages = np.mean(data_array.reshape(1440, int(data_array.shape[1]/5), 5), axis=-1)
# Make an array of the deltas between time steps
data_differences = data_averages[:,1:] - data_averages[:,:-1]
# Store the standard deviations of this differences
stds = np.std(data_differences, axis = 1)
df['Change_Deviation'] = list(stds)
In [21]:
# Can see the more passionate emotions have higher variance in change
df[['Change_Deviation','Emotion']].groupby(['Emotion']).mean()
Out[21]:
Emotion      Change_Deviation
angry              899.235758
calm                50.527442
disgust            167.523488
fearful            446.503306
happy              339.889837
neutral             68.235074
sad                118.996169
surprised          198.101180

This table displays, for each emotion, the average variability of the amplitude changes. More specifically, for each recording we computed the standard deviation of the frame-to-frame changes in the (block-averaged) amplitude, and the table reports the mean of these standard deviations across all recordings of each emotion (sketched as a bar plot below). This gives us a sense of how much the voice or volume fluctuates within recordings of each emotion, and therefore of its potential as a predictor. The values make intuitive sense, with 'calm' and 'neutral' having the lowest variability and 'angry' and 'fearful' the highest; in other words, the amplitudes of less emotive voices tend to stay within a smaller range, whereas those of angry and scared voices swing much more widely. Because this standard deviation helps distinguish between emotions, it can be utilized in a model. In the simple case of logistic regression, it could be used directly as a predictor variable; a neural network should be able to learn more complex relationships, which would include characteristics like the variability of amplitudes and functions thereof.
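The table above can also be rendered as the bar plot the analysis refers to; a minimal sketch using the df and matplotlib objects already defined (the variable name mean_change is illustrative):

# Sketch: bar plot of the mean Change_Deviation per emotion.
mean_change = df.groupby('Emotion')['Change_Deviation'].mean()
fig, ax = plt.subplots(figsize=(9, 3))
ax.bar(mean_change.index, mean_change.values, color='lightblue')
ax.set_xlabel('Emotion')
ax.set_ylabel('Std. dev. of amplitude change')
ax.set_title('Average Change_Deviation by Emotion')
plt.show()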

Summary of Findings¶

Overall, it appears that emotions like happy, fearful, and angry display much greater variation in signal amplitude than emotions like neutral and calm. Emotions also display modest differences in the lengths of their audio signals: on average, disgust and anger, for example, appear to be more drawn out than neutral, fearful, and surprised.

Baseline Model¶

Baseline Evaluation¶

Data Collection Pipeline & Tools¶

In [22]:
length = len(df)
frame_rate = 48000
def get_sample():
    is_neutral = True
    # Never plays neutral audio
    while(is_neutral):
        rand_index = np.random.randint(0,length)
        audio_data = df.iloc[rand_index]['Data']
        true_val = df.iloc[rand_index]['Emotion']
        is_neutral = (true_val == 'neutral')
    return (Audio(audio_data, rate = frame_rate, autoplay = True), rand_index, true_val)

# Will delete the csv, do not run
def reset_csv(file_name):
    with open(file_name, 'w', newline='') as csvfile:
        fieldnames = ['index', 'true', 'pred']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

def add_to_csv(file_name, id, true, guess):
    with open(file_name, 'a', newline='') as csvfile:
        fieldnames = ['index', 'true', 'pred']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writerow({'index' : id, 'true' : true, 'pred' : guess})
In [23]:
audio, id, true_val = get_sample()
display(audio)
[audio playback widget]
In [24]:
for i in range(1,9):
    print(i, id2label[i-1])
1 neutral
2 calm
3 happy
4 sad
5 angry
6 fearful
7 disgust
8 surprised
In [25]:
guess = 4
add_to_csv('baseline.csv', id, true_val, guess)

Data Collection Results¶

In [26]:
baseline_df = pd.read_csv('baseline.csv')
baseline_df['pred'] = baseline_df['pred'] - 1
baseline_df['pred_emotion'] = baseline_df['pred'].apply(lambda x: id2label[x])
baseline_df['true_num'] = baseline_df['true'].apply(lambda x: label2id[x])

y_preds = list(baseline_df['pred'])

y_test = list(baseline_df['true_num'])

emotion_names = list(id2label.values())
ConfusionMatrixDisplay(confusion_matrix(y_test, y_preds), display_labels = emotion_names[1:]).plot(cmap='Blues', xticks_rotation = 40)
plt.title('Human Benchmark')
plt.show()

print(f'\nAccuracy of Human Model: {np.round(accuracy_score(y_preds,y_test), 5)}')
[Figure: Human Benchmark confusion matrix]
Accuracy of Human Model: 0.7541

Baseline Logistic Regression¶

In [27]:
# Baseline model uses change deviation and the length of the clip in logistic regression
import warnings
warnings.filterwarnings("ignore")

X = df[['Change_Deviation', 'Num_Frames']]
y = df['Emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1777)
baseline_logreg = LogisticRegression(random_state=10, max_iter=10000, multi_class='ovr').fit(X_train, y_train)
y_preds = baseline_logreg.predict(X_test)

ConfusionMatrixDisplay(confusion_matrix(y_test, y_preds), display_labels = emotion_names).plot(cmap='Blues', xticks_rotation = 40)
plt.title('Logistic Regression Confusion Matrix')
plt.show()

print(f'\nAccuracy of Baseline Model: {np.round(accuracy_score(y_preds,y_test), 5)}')
[Figure: Logistic Regression Confusion Matrix]
Accuracy of Baseline Model: 0.37847

Interpretation & Analysis¶

To establish a baseline for accuracy, we collected data from humans to estimate a human benchmark for this problem, and we implemented a simple baseline logistic regression model on our data. For the human benchmark, we collected 50 data points across 5 people: each person was played a randomly selected audio clip and asked to classify its emotion. On average, our testers scored 75% accuracy. One important thing to note is that we excluded neutral clips from this benchmark. Many people found the neutral emotion confusing, and its role in the dataset is essentially to serve as a zero-emotion reference. This means our human benchmark is biased upward relative to our models, which are also trained to classify the neutral audio clips.

Our logistic regression model takes as input the standard deviation of amplitude changes within a given audio signal as well as the number of frames in the recording. This model achieves roughly 38% test accuracy, mostly by differentiating between calm and neutral (low variance) and happy and angry (high variance) emotions. This is significantly better than the majority-class model, which would achieve an accuracy of 13.3%. We used an 80/20 train-test split, and the confusion matrix above shows the per-class prediction counts. Looking at the coefficients confirms our beliefs about the data: calm and neutral are negatively associated with the variability feature, while the opposite is true for the happy and surprised emotions.
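A minimal sketch of the coefficient check referenced above, reusing the fitted baseline_logreg (with multi_class='ovr', scikit-learn exposes one row of coefficients per emotion class):

# Inspect per-class coefficients of the baseline logistic regression
coef_table = pd.DataFrame(
    baseline_logreg.coef_,
    index=baseline_logreg.classes_,                    # emotion labels
    columns=['Change_Deviation', 'Num_Frames']
)
print(coef_table)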

Final Models¶

Modeling Preparation¶

Mel Spectrogram¶

We make heavy use of the Mel spectrogram transformation to preprocess our data before feeding it to the more complex models. Since the way humans process audio is more closely tied to variations in pitch, we need some way to turn our amplitude data into frequency information. The Fourier transform does this for us, and running an FFT over many small time windows turns our amplitude data into a representation of how the speaker's pitch content varies over the course of the recording.

Finally, the Mel transformation maps these frequencies onto a roughly logarithmic scale that gives better differentiation of pitch when specifically analyzing human speech.
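For reference, a commonly used (HTK-style) form of the Mel mapping from a frequency $f$ in Hz is $m = 2595 \log_{10}\left(1 + \frac{f}{700}\right)$, so equal steps in $m$ correspond to progressively wider frequency bands at higher pitches. (librosa's default mel filter bank uses a slightly different Slaney-style variant, but the idea is the same.)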

In [28]:
# Hyperparameters for Mel spectrogram
sr = 48000
n_fft = 2048
hop_length = 512

def get_melspectrogram(audio, n_mels=128):
    # First make sure audio data is casted to float
    audio_as_float = audio.astype(np.float32)
    mel = librosa.feature.melspectrogram(y = audio_as_float, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    return mel

df['mel128'] = df['Data'].apply(lambda x: get_melspectrogram(x, n_mels=128))
df['mel256'] = df['Data'].apply(lambda x: get_melspectrogram(x, n_mels=256))
In [29]:
# Mel-SPECTROGRAM CONVERTED DATASET
mel_128 = np.array(list(df['mel128']))
mel_256 = np.array(list(df['mel256']))
In [30]:
# Visualize spectrogram (for 128 Mel features)
fig, ax = plt.subplots()
S_dB = librosa.power_to_db(mel_128[3], ref=np.max)
img = librosa.display.specshow(S_dB, x_axis='time',
                         y_axis='mel', sr=sr,
                         fmax=8000, ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency spectrogram');
[Figure: Mel-frequency spectrogram]
In [31]:
# Prepare dataset
X = mel_128
y = df['Emotion_Number'].values

Helper Functions¶

In [32]:
def evaluate_predictions(model, model_name, X_test=X_test, y_test=y_test):
    # Get predictions
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)

    # Create Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred_classes)
    emotion_labels = [id2label[i] for i in range(len(emotions))]  
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=emotion_labels)

    # Graph Confusion Matrix
    disp.plot(cmap='Blues', xticks_rotation = 40)
    plt.title(f'{model_name} Confusion Matrix')
    plt.show()
In [33]:
def plot_history(history, model_name):
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    best_val_acc_loc = np.argmax(history.history['val_accuracy'])
    best_val_accuracy = max(history.history['val_accuracy'])
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='validation')
    plt.axvline(best_val_acc_loc, linestyle='--', c='k', label=("best val acc: {:.4f}".format(best_val_accuracy)))
    plt.title(f'{model_name} accuracy')
    plt.xticks(range(0, len(history.history['accuracy']), 2))
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    
    plt.subplot(1, 2, 2)
    best_val_loss_loc = np.argmin(history.history['val_loss'])
    best_val_loss = min(history.history['val_loss'])
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'],  label='validation')
    plt.axvline(best_val_loss_loc, linestyle='--', c='k', label=("best val loss: {:.4f}".format(best_val_loss)))
    plt.title(f'{model_name} loss')
    plt.xticks(range(0, len(history.history['accuracy']), 2))
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    
    plt.tight_layout()
    plt.show()

FFNN¶

Let's start with a very simple approach: a feed-forward neural network (FFNN). Given the dimensionality reduction that the Mel spectrogram offers, we now have a somewhat more manageable (but still enormous) input size for densely connected layers. We'll set another baseline (this time, a neural-net baseline) by simply brute-force passing the Mel spectrograms into an FFNN.

Data Preparation¶

In [34]:
# Prepare dataset
X_ffnn = mel_128
y = df['Emotion_Number'].values

X_train, X_test, y_train, y_test = train_test_split(X_ffnn, y, test_size=0.2, random_state=109, stratify=y)

Build & Compile Model¶

In [35]:
input_shape = (X_train.shape[1], X_train.shape[2], 1)

n_filters = 10

kernel_regularizer = L1(l1=0.015)
bias_regularizer = L1(l1=0.015)
dropout_rate = 0.5
In [36]:
inputs = Input(shape=input_shape)

# Flatten the spectrogram input before the fully connected layers
x = Flatten()(inputs)

# Dense layers
x = Dense(500, activation='relu', kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer)(x)
x = Dropout(dropout_rate)(x)
x = Dense(500, activation='relu', kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer)(x)
x = Dropout(dropout_rate)(x)

# Output layer
outputs = Dense(len(emotions), activation='softmax')(x)

ffnn = Model(inputs=inputs, outputs=outputs, name='ffnn')
In [37]:
ffnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

ffnn.summary()
Model: "ffnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 128, 495, 1)]     0         
                                                                 
 flatten (Flatten)           (None, 63360)             0         
                                                                 
 dense (Dense)               (None, 500)               31680500  
                                                                 
 dropout (Dropout)           (None, 500)               0         
                                                                 
 dense_1 (Dense)             (None, 500)               250500    
                                                                 
 dropout_1 (Dropout)         (None, 500)               0         
                                                                 
 dense_2 (Dense)             (None, 8)                 4008      
                                                                 
=================================================================
Total params: 31935008 (121.82 MB)
Trainable params: 31935008 (121.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


We are using a very simple FFNN architecture: we flatten the input, pass it through two fully-connected Dense layers with dropout, and then feed it to a final Dense output layer with a softmax activation.

Train Model¶

In [38]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

ffnn_history = ffnn.fit(X_train, 
                      y_train, 
                      validation_data=(X_test, y_test), 
                      epochs=100, 
                      batch_size=16,
                      callbacks=[early_stopping])
Epoch 1/100
72/72 [==============================] - 5s 24ms/step - loss: 14110775296.0000 - accuracy: 0.1606 - val_loss: 4197664000.0000 - val_accuracy: 0.2292
Epoch 2/100
72/72 [==============================] - 1s 18ms/step - loss: 9057529856.0000 - accuracy: 0.2908 - val_loss: 5306324992.0000 - val_accuracy: 0.2257
Epoch 3/100
72/72 [==============================] - 1s 18ms/step - loss: 5125284352.0000 - accuracy: 0.3203 - val_loss: 3121011456.0000 - val_accuracy: 0.3125
Epoch 4/100
72/72 [==============================] - 1s 19ms/step - loss: 5616505856.0000 - accuracy: 0.3299 - val_loss: 4104053248.0000 - val_accuracy: 0.3299
Epoch 5/100
72/72 [==============================] - 1s 18ms/step - loss: 3889230848.0000 - accuracy: 0.3863 - val_loss: 3887718400.0000 - val_accuracy: 0.3090
Epoch 6/100
72/72 [==============================] - 1s 18ms/step - loss: 2214633984.0000 - accuracy: 0.4123 - val_loss: 5547031552.0000 - val_accuracy: 0.2778
Epoch 7/100
72/72 [==============================] - 1s 18ms/step - loss: 1584530048.0000 - accuracy: 0.4358 - val_loss: 5191700992.0000 - val_accuracy: 0.3438
Epoch 8/100
72/72 [==============================] - 1s 17ms/step - loss: 1952501504.0000 - accuracy: 0.4523 - val_loss: 3553359360.0000 - val_accuracy: 0.3125
Epoch 9/100
72/72 [==============================] - 1s 17ms/step - loss: 1480818176.0000 - accuracy: 0.4939 - val_loss: 5062780928.0000 - val_accuracy: 0.2604
Epoch 10/100
72/72 [==============================] - 1s 18ms/step - loss: 953954560.0000 - accuracy: 0.4835 - val_loss: 3849034752.0000 - val_accuracy: 0.2917
Epoch 11/100
72/72 [==============================] - 1s 17ms/step - loss: 1857220224.0000 - accuracy: 0.4887 - val_loss: 4090415360.0000 - val_accuracy: 0.2917
Epoch 12/100
72/72 [==============================] - 1s 17ms/step - loss: 2037850112.0000 - accuracy: 0.4974 - val_loss: 3606037760.0000 - val_accuracy: 0.2812
Epoch 13/100
72/72 [==============================] - 1s 18ms/step - loss: 1596498944.0000 - accuracy: 0.4818 - val_loss: 4720262656.0000 - val_accuracy: 0.3056
Epoch 14/100
72/72 [==============================] - 1s 18ms/step - loss: 1335536384.0000 - accuracy: 0.5165 - val_loss: 5685683712.0000 - val_accuracy: 0.3056
Epoch 15/100
72/72 [==============================] - 1s 17ms/step - loss: 2407896832.0000 - accuracy: 0.5217 - val_loss: 4185713408.0000 - val_accuracy: 0.2778
Epoch 16/100
72/72 [==============================] - 1s 17ms/step - loss: 783191936.0000 - accuracy: 0.5399 - val_loss: 4871812096.0000 - val_accuracy: 0.2986
Epoch 17/100
72/72 [==============================] - 1s 19ms/step - loss: 1004823808.0000 - accuracy: 0.5703 - val_loss: 7729543168.0000 - val_accuracy: 0.3021
In [39]:
plot_history(ffnn_history, 'FFNN')
[Figure: FFNN training history (accuracy and loss)]

Clearly, we're overfitting immensely with an FFNN that contains roughly 32 million parameters. The divergence between train and validation starts after only a few epochs and continues until the train accuracy begins to plateau around 60%, while the validation accuracy hovers around 30-40% throughout training. One concerning aspect is the loss: because the model makes such confident predictions, the reported loss (which includes the L1 regularization penalty) is enormous, on the order of billions. We believe this reflects a problem with how the model is set up; in any case, it's obvious that an FFNN is not the way to go. Still, even this worst-performing network performs about as well as the logistic regression, so there's promise of better results with better architectures.
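One plausible contributor to the runaway loss is that we feed raw Mel power values, which span several orders of magnitude, straight into the network. As a hedged sketch of a possible remedy (an assumption we did not verify, with illustrative variable names), the spectrograms could be converted to decibels with librosa before splitting:

# Possible remedy (sketch): log-scale the Mel power spectrograms so the
# inputs, and hence the loss, stay in a much smaller numeric range.
mel_128_db = np.array([librosa.power_to_db(m, ref=np.max) for m in df['mel128']])
X_train_db, X_test_db, y_train_db, y_test_db = train_test_split(
    mel_128_db, y, test_size=0.2, random_state=109, stratify=y
)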

Evaluation & Analysis¶

In [40]:
evaluate_predictions(ffnn, 'FFNN', X_test=X_test, y_test=y_test)
9/9 [==============================] - 0s 3ms/step
[Figure: FFNN confusion matrix]

Our FFNN confusion matrix looks very similar to the one for logistic regression. There are many false predictions, but the performance on some emotions (e.g., calm, angry, and fearful) is quite decent, while on others (e.g., sad, disgust, and neutral) it makes almost no correct predictions. Let's see if we can remedy the issues of overfitting, misprediction, huge parameter counts, and high loss with some other, more complex models.

LSTM¶

As we covered in lecture, LSTMs are a type of RNN that can capture long-term dependencies in the input data, making them particularly useful for tasks involving sequential data. Thus, we believe that an LSTM architecture could be well suited to sentiment analysis on our dataset, given its ability to process the sequential structure of our audio samples.

The configuration of our original baseline LSTM model included multiple LSTM layers with varying numbers of units (512, 512, 256, and 256), followed by dense layers with decreasing numbers of units (128, 64, and 8). The intuition behind using multiple LSTM layers was to allow the model to learn increasingly complex representations of the input data, enabling it to capture intricate patterns and dependencies in the audio samples. As we progressed through the model, we decreased the number of units in the dense layers to progressively reduce the dimensionality of the data while hoping to keep the most important features for the sentiment analysis task. The dropout layers were included to mitigate overfitting, and the output layer with 8 units and a softmax activation was designed to classify each input audio sample into one of the 8 emotion categories present in the RAVDESS dataset.

Again, this describes our baseline LSTM, which we don't expect to work that well; during training we will fine-tune specific parameters and adjust the architecture, and the model actually built below reflects those adjustments (described in the analysis section). Nonetheless, we do hope that LSTMs will perform well, because our data, audio samples, are as sequential as it gets.

Data Preparation¶

In [41]:
X_lstm = mel_128
y = df['Emotion_Number'].values

X_train, X_test, y_train, y_test = train_test_split(X_lstm, y, test_size=0.2, random_state=109)
In [42]:
# adjust shapes so can pass in sequentially
X_train = np.transpose(X_train, (0, 2, 1))
X_test = np.transpose(X_test, (0, 2, 1))

Build & Compile Model¶

In [48]:
lstm = Sequential([
    GaussianNoise(0.1, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(units=512, return_sequences=True),
    Dropout(0.2),
    LSTM(units=512),
    Dropout(0.2),
    Dense(units=64, activation='relu'),
    Dense(units=8, activation='softmax')
])

lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

lstm.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 gaussian_noise_1 (Gaussian  (None, 495, 128)          0         
 Noise)                                                          
                                                                 
 lstm_2 (LSTM)               (None, 495, 512)          1312768   
                                                                 
 dropout_4 (Dropout)         (None, 495, 512)          0         
                                                                 
 lstm_3 (LSTM)               (None, 512)               2099200   
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                32832     
                                                                 
 dense_6 (Dense)             (None, 8)                 520       
                                                                 
=================================================================
Total params: 3445320 (13.14 MB)
Trainable params: 3445320 (13.14 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Train Model¶

In [49]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True)

filepath = "lstm_weights.h5"
checkpoint_callback = ModelCheckpoint(filepath, 
                                      monitor="val_accuracy", 
                                      save_weights_only=True, 
                                      save_best_only=True, 
                                      verbose=0)


lstm_history = lstm.fit(X_train, 
          y_train, 
          validation_data=(X_test, y_test), 
          epochs=100, 
          batch_size=16, 
          callbacks=[early_stopping, checkpoint_callback])
Epoch 1/100
72/72 [==============================] - 14s 132ms/step - loss: 2.0926 - accuracy: 0.1562 - val_loss: 2.0440 - val_accuracy: 0.1562
Epoch 2/100
72/72 [==============================] - 8s 114ms/step - loss: 2.0240 - accuracy: 0.1684 - val_loss: 2.0292 - val_accuracy: 0.1840
Epoch 3/100
72/72 [==============================] - 8s 115ms/step - loss: 2.0061 - accuracy: 0.2014 - val_loss: 2.0075 - val_accuracy: 0.1910
Epoch 4/100
72/72 [==============================] - 8s 115ms/step - loss: 1.9868 - accuracy: 0.1953 - val_loss: 1.9888 - val_accuracy: 0.2083
Epoch 5/100
72/72 [==============================] - 8s 115ms/step - loss: 1.9336 - accuracy: 0.2300 - val_loss: 1.9004 - val_accuracy: 0.2708
Epoch 6/100
72/72 [==============================] - 8s 113ms/step - loss: 1.9460 - accuracy: 0.2422 - val_loss: 1.8903 - val_accuracy: 0.2708
Epoch 7/100
72/72 [==============================] - 8s 115ms/step - loss: 1.8901 - accuracy: 0.2457 - val_loss: 1.9052 - val_accuracy: 0.2951
Epoch 8/100
72/72 [==============================] - 8s 113ms/step - loss: 1.8744 - accuracy: 0.2465 - val_loss: 1.8877 - val_accuracy: 0.2465
Epoch 9/100
72/72 [==============================] - 8s 113ms/step - loss: 1.8049 - accuracy: 0.2891 - val_loss: 1.9476 - val_accuracy: 0.2535
Epoch 10/100
72/72 [==============================] - 8s 115ms/step - loss: 1.7872 - accuracy: 0.2934 - val_loss: 1.7501 - val_accuracy: 0.3160
Epoch 11/100
72/72 [==============================] - 8s 113ms/step - loss: 1.7603 - accuracy: 0.2995 - val_loss: 1.8089 - val_accuracy: 0.2951
Epoch 12/100
72/72 [==============================] - 9s 119ms/step - loss: 1.7413 - accuracy: 0.3134 - val_loss: 1.7929 - val_accuracy: 0.2951
Epoch 13/100
72/72 [==============================] - 8s 115ms/step - loss: 1.7304 - accuracy: 0.3299 - val_loss: 1.8384 - val_accuracy: 0.3264
Epoch 14/100
72/72 [==============================] - 8s 115ms/step - loss: 1.7154 - accuracy: 0.3394 - val_loss: 1.7962 - val_accuracy: 0.3507
Epoch 15/100
72/72 [==============================] - 8s 113ms/step - loss: 1.6768 - accuracy: 0.3533 - val_loss: 1.7283 - val_accuracy: 0.3472
Epoch 16/100
72/72 [==============================] - 9s 122ms/step - loss: 1.6604 - accuracy: 0.3368 - val_loss: 1.7129 - val_accuracy: 0.3715
Epoch 17/100
72/72 [==============================] - 8s 113ms/step - loss: 1.6743 - accuracy: 0.3420 - val_loss: 1.7631 - val_accuracy: 0.3021
Epoch 18/100
72/72 [==============================] - 8s 113ms/step - loss: 1.6615 - accuracy: 0.3759 - val_loss: 1.7741 - val_accuracy: 0.3333
Epoch 19/100
72/72 [==============================] - 8s 113ms/step - loss: 1.6373 - accuracy: 0.3524 - val_loss: 1.7386 - val_accuracy: 0.3438
Epoch 20/100
72/72 [==============================] - 8s 115ms/step - loss: 1.5615 - accuracy: 0.3915 - val_loss: 1.6879 - val_accuracy: 0.3924
Epoch 21/100
72/72 [==============================] - 8s 115ms/step - loss: 1.5467 - accuracy: 0.4167 - val_loss: 1.6062 - val_accuracy: 0.4097
Epoch 22/100
72/72 [==============================] - 8s 113ms/step - loss: 1.5082 - accuracy: 0.4071 - val_loss: 1.6971 - val_accuracy: 0.3542
Epoch 23/100
72/72 [==============================] - 8s 113ms/step - loss: 1.4825 - accuracy: 0.4262 - val_loss: 1.6937 - val_accuracy: 0.3646
Epoch 24/100
72/72 [==============================] - 8s 113ms/step - loss: 1.4557 - accuracy: 0.4262 - val_loss: 1.6134 - val_accuracy: 0.4028
Epoch 25/100
72/72 [==============================] - 8s 115ms/step - loss: 1.4281 - accuracy: 0.4332 - val_loss: 1.6540 - val_accuracy: 0.4167
Epoch 26/100
72/72 [==============================] - 8s 113ms/step - loss: 1.4360 - accuracy: 0.4280 - val_loss: 1.6597 - val_accuracy: 0.3646
Epoch 27/100
72/72 [==============================] - 8s 113ms/step - loss: 1.4239 - accuracy: 0.4332 - val_loss: 1.6897 - val_accuracy: 0.3681
Epoch 28/100
72/72 [==============================] - 8s 113ms/step - loss: 1.4599 - accuracy: 0.4410 - val_loss: 1.7109 - val_accuracy: 0.3715
Epoch 29/100
72/72 [==============================] - 8s 113ms/step - loss: 1.3896 - accuracy: 0.4635 - val_loss: 1.7180 - val_accuracy: 0.3924
Epoch 30/100
72/72 [==============================] - 8s 112ms/step - loss: 1.3812 - accuracy: 0.4618 - val_loss: 1.7227 - val_accuracy: 0.3611
Epoch 31/100
72/72 [==============================] - 8s 113ms/step - loss: 1.3511 - accuracy: 0.4931 - val_loss: 1.7895 - val_accuracy: 0.3403
Epoch 32/100
72/72 [==============================] - 8s 113ms/step - loss: 1.2899 - accuracy: 0.5087 - val_loss: 1.8142 - val_accuracy: 0.3958
Epoch 33/100
72/72 [==============================] - 8s 113ms/step - loss: 1.2784 - accuracy: 0.5165 - val_loss: 1.6934 - val_accuracy: 0.3854
Epoch 34/100
72/72 [==============================] - 8s 113ms/step - loss: 1.2440 - accuracy: 0.5399 - val_loss: 1.6373 - val_accuracy: 0.3889
Epoch 35/100
72/72 [==============================] - 8s 115ms/step - loss: 1.1871 - accuracy: 0.5391 - val_loss: 1.6150 - val_accuracy: 0.4306
Epoch 36/100
72/72 [==============================] - 8s 113ms/step - loss: 1.2689 - accuracy: 0.5174 - val_loss: 1.6526 - val_accuracy: 0.4201
Epoch 37/100
72/72 [==============================] - 8s 113ms/step - loss: 1.1615 - accuracy: 0.5616 - val_loss: 1.7449 - val_accuracy: 0.4062
Epoch 38/100
72/72 [==============================] - 8s 112ms/step - loss: 1.1311 - accuracy: 0.5677 - val_loss: 1.6842 - val_accuracy: 0.4028
Epoch 39/100
72/72 [==============================] - 8s 115ms/step - loss: 1.0898 - accuracy: 0.6163 - val_loss: 1.7551 - val_accuracy: 0.4479
Epoch 40/100
72/72 [==============================] - 8s 113ms/step - loss: 1.1076 - accuracy: 0.5929 - val_loss: 1.6909 - val_accuracy: 0.3924
Epoch 41/100
72/72 [==============================] - 8s 113ms/step - loss: 1.0520 - accuracy: 0.6137 - val_loss: 1.6024 - val_accuracy: 0.4444
Epoch 42/100
72/72 [==============================] - 8s 113ms/step - loss: 1.0191 - accuracy: 0.6259 - val_loss: 1.6725 - val_accuracy: 0.4132
Epoch 43/100
72/72 [==============================] - 8s 113ms/step - loss: 1.0450 - accuracy: 0.6276 - val_loss: 1.8967 - val_accuracy: 0.4167
Epoch 44/100
72/72 [==============================] - 8s 113ms/step - loss: 0.9484 - accuracy: 0.6476 - val_loss: 1.8479 - val_accuracy: 0.4236
Epoch 45/100
72/72 [==============================] - 8s 113ms/step - loss: 0.9562 - accuracy: 0.6380 - val_loss: 1.6428 - val_accuracy: 0.4479
Epoch 46/100
72/72 [==============================] - 8s 113ms/step - loss: 0.9567 - accuracy: 0.6727 - val_loss: 1.9062 - val_accuracy: 0.4028
Epoch 47/100
72/72 [==============================] - 8s 113ms/step - loss: 0.8372 - accuracy: 0.6814 - val_loss: 1.8974 - val_accuracy: 0.4201
Epoch 48/100
72/72 [==============================] - 8s 113ms/step - loss: 0.8651 - accuracy: 0.6892 - val_loss: 1.8136 - val_accuracy: 0.4167
Epoch 49/100
72/72 [==============================] - 8s 113ms/step - loss: 0.8634 - accuracy: 0.6814 - val_loss: 1.8532 - val_accuracy: 0.4028
Epoch 50/100
72/72 [==============================] - 8s 112ms/step - loss: 0.8390 - accuracy: 0.6858 - val_loss: 1.8163 - val_accuracy: 0.4306
Epoch 51/100
72/72 [==============================] - 8s 113ms/step - loss: 0.7814 - accuracy: 0.7109 - val_loss: 1.8316 - val_accuracy: 0.4271
Epoch 52/100
72/72 [==============================] - 8s 112ms/step - loss: 0.9433 - accuracy: 0.6641 - val_loss: 1.8615 - val_accuracy: 0.4132
Epoch 53/100
72/72 [==============================] - 8s 113ms/step - loss: 0.9097 - accuracy: 0.6710 - val_loss: 1.8992 - val_accuracy: 0.4028
Epoch 54/100
72/72 [==============================] - 8s 113ms/step - loss: 0.8308 - accuracy: 0.6910 - val_loss: 1.8929 - val_accuracy: 0.4236
In [50]:
plot_history(lstm_history, "LSTM")
[Figure: LSTM training history (accuracy and loss)]

Evaluation & Analysis¶

In [51]:
evaluate_predictions(lstm, 'LSTM', X_test=X_test, y_test=y_test)
9/9 [==============================] - 1s 53ms/step
[Figure: LSTM confusion matrix]

As we covered in lecture, LSTMs are a type of RNN that can capture long-term dependencies in the input data, making them particularly useful for tasks involving sequential data. Thus, we believed that an LSTM model architecture would be potentially well suited for the task of sentiment analysis on our dataset due to its ability to effectively process our sequential data which comes in the form of audio samples.

Our original LSTM model, which was produced for Milestone 4, used 4 LSTM layers with varying numbers of units (512, 512, 256, and 256), followed by dense layers with decreasing numbers of units (128, 64, and 8). The intuition behind using multiple LSTM layers was to allow the model to learn increasingly complex representations of the input data, enabling it to capture intricate patterns and dependencies in the audio samples. As we progressed through the model, we decreased the number of units in the dense layers to progressively reduce the dimensionality of the data while hoping to keep the most important features for the sentiment analysis task. The dropout layers were put in to mitigate overfitting, and the output layer with 8 units and a softmax activation function was designed to classify the input audio samples into the 8 different emotion categories present in the RAVDESS dataset.

While we had high hopes for this architecture, like most models you train for the first time, it didn't perform well. After 50 epochs with a batch size of 32, the validation accuracy stayed around 15-17% with no signs of improvement, and even the training accuracy remained low and roughly constant. We thought our model might be a little too complex, so we decided to simplify it. Our next step was to try an architecture with fewer LSTM and Dense layers: we reduced the number of LSTM layers from 4 to 3, which had worked well in one of the homeworks, and reduced the number of Dense layers from 3 to 2. Additionally, we added some Gaussian noise and kept the Dropout layers, but reduced their rate. Training this new model, we immediately saw an increase in performance: over 70 epochs we got around 46% accuracy, which beat our baseline model. Nonetheless, our goal was to get at least over 50%, so we again made the architecture slightly simpler, reducing the number of LSTM layers to 2 but increasing the size of the first one to 1024. This got us to 50% accuracy over 75 epochs, and at this point we were happy enough with the architecture to start fine-tuning.

The first thing we tried was reducing the size of the first LSTM layer to 512 and changing the size of the final Dense layer before the output layer. We made it both smaller (32) and bigger (124), but both performed worse, giving us 28% and 49% accuracy respectively. This told us that 64 was the best size for the final Dense layer, so we retrained with that size while keeping the first LSTM layer at 512, and this produced our best result yet, with a validation accuracy of 55.9%. While we tried further architectures, which we describe in the next paragraph, this would end up being our final model, as it consistently reached validation accuracies between 54-56%. Looking at its validation accuracy and loss curves, we can definitely see some overfitting: for accuracy, the train and validation lines start to diverge around the 30th epoch, whereas for the loss it is around the 20th epoch. However, over many runs, including this final run, the best validation accuracy tends to fall in the 70-85 epoch range, whereas the best validation loss falls in the 20-35 epoch range. In this specific run, the best validation accuracy of 55.56% came at epoch 92 (val loss of 2.18) and the best validation loss was 1.57 at epoch 30 (val accuracy of 41.32%).

The next change we made to try to improve performance was re-adding a Dense layer of size 64. While this slightly improved performance the first time we trained it, reaching 56.6% accuracy (the highest of any LSTM model we tried), upon retraining we would consistently get accuracies around 50-52%. After having played around with most of the layer sizes, we decided to try a smaller batch size (16), since our dataset isn't very big. While this didn't lead to better overall performance, it did allow our model's validation accuracy to keep up with the training accuracy for longer, so the model overfits in later epochs rather than earlier ones; because of this, we kept that batch size. Finally, we tried both batch and layer normalization, but these yielded validation accuracies around 45%.

Looking at the actual predictions of our LSTM model via the confusion matrix, we can see it is really good at predicting the Calm, Angry, Fearful, Disgust, and Surprised emotions, mediocre at predicting Happy and Sad, and pretty bad at predicting Neutral. This sparked the idea of watching how it learns the emotions over epochs: we trained the model a small number of epochs at a time, printed the confusion matrix, and then continued training (a sketch of this loop is given below). From this analysis, we saw that Calm is the emotion it learns first and predicts best. It is followed by Surprised and Disgust, which it typically learns fairly confidently next, and then Fearful and Angry. Depending on the run, Happy and Sad are predicted either moderately or poorly, but they are almost always at least marginally more accurate than Neutral.
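A minimal sketch of the train-a-bit-then-inspect loop described above, reusing the lstm model, the transposed train/test splits, and the evaluate_predictions helper defined earlier (the chunk size and number of rounds here are illustrative):

# Sketch: train in short bursts and inspect the confusion matrix after
# each burst to see which emotions the model learns first.
for round_idx in range(5):                      # 5 illustrative rounds
    lstm.fit(X_train, y_train,
             validation_data=(X_test, y_test),
             epochs=5, batch_size=16, verbose=0)
    print(f"After {(round_idx + 1) * 5} additional epochs:")
    evaluate_predictions(lstm, f"LSTM (round {round_idx + 1})", X_test=X_test, y_test=y_test)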

Transformer¶

Transformers are the current state-of-the-art architecture for language models due to their ability to positionally encode embeddings, run in parallel, and learn complex contextual features. We try them on audio data since it is also sequential in nature and should therefore be well suited to transformers. Furthermore, because the data is already numerical, there is no need for an embedding layer, and the positional encodings can be applied directly to the audio data. The custom transformer layer we use includes multi-head attention, concatenation, skip connections, layer normalization, and dropout.

Data Preparation¶

In [52]:
# Prepare dataset
X_transformer = np.transpose(mel_256, (0, 2, 1))
y = df['Emotion_Number'].values

# Train-val split
X_train, X_val, y_train, y_val = train_test_split(X_transformer, y, test_size=0.2, stratify=y, random_state=109)

# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)
In [53]:
def make_dataset(x, y, batch_size=32):
    data = tf.data.Dataset.from_tensor_slices((x, y))
    data = data.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data

# Make tf datasets
train_ds = make_dataset(X_train, y_train)
val_ds = make_dataset(X_val, y_val)
In [54]:
def get_relative_positions(max_seq_length):
    # Create a matrix where the element at [i, j] is j-i; i.e., the relative distance from i to j
    range_vec = tf.range(max_seq_length)
    range_mat = tf.reshape(range_vec, [1, -1])
    distance_mat = range_mat - tf.transpose(range_mat)
    return distance_mat

def get_relative_positional_encoding(max_seq_length, d_model):
    # Compute relative positions
    relative_positions = get_relative_positions(max_seq_length)
    
    # Adjust positions to be within the model's scale
    max_relative_position = max_seq_length - 1
    
    # Clamp the values in the matrix to be within [-max_relative_position, max_relative_position]
    relative_positions = tf.clip_by_value(relative_positions, -max_relative_position, max_relative_position)
    
    # Embeddings for each relative position
    relative_position_embeddings = tf.keras.layers.Embedding(
        2 * max_relative_position + 1, d_model)(relative_positions + max_relative_position)
    
    # Reduce over sequence length to match shape for broadcasting
    relative_position_embeddings = tf.reduce_mean(relative_position_embeddings, axis=1)
    
    return relative_position_embeddings

Build & Compile Model¶

In [55]:
def transformer_encoder(inputs, embed_dim, num_heads, ff_dim, rate=0.2):
    # Multi-head attention
    attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attention_output = layers.Dropout(rate)(attention_output)
    attention_output = layers.LayerNormalization(epsilon=1e-6)(inputs + attention_output)
    
    # Feed-forward network
    ffn_output = layers.Dense(ff_dim, activation="relu")(attention_output)
    ffn_output = layers.Dense(embed_dim)(ffn_output)
    ffn_output = layers.Dropout(rate)(ffn_output)
    ffn_output = layers.LayerNormalization(epsilon=1e-6)(attention_output + ffn_output)
    return ffn_output
In [56]:
def build_model(input_shape, embed_dim, num_heads, ff_dim, max_seq_length, num_classes, num_layers):
    # Input validation for num_heads
    if isinstance(num_heads, int):
        # If num_heads is an integer, use the same number of heads across all layers
        num_heads_list = [num_heads] * num_layers
    elif isinstance(num_heads, list):
        # If num_heads is a list, check that its length matches num_layers
        if len(num_heads) != num_layers:
            raise ValueError(f"The length of num_heads list must be equal to num_layers ({num_layers}).")
        num_heads_list = num_heads
    else:
        raise TypeError("num_heads must be either an integer or a list of integers.")
    
    inputs = layers.Input(shape=input_shape)
    x = layers.GaussianNoise(0.1)(inputs)
    x = layers.Dense(embed_dim)(x)
    x += get_relative_positional_encoding(max_seq_length, embed_dim)
    for i in range(num_layers):
        num_heads = num_heads_list[i]
        x = transformer_encoder(x, embed_dim, num_heads, ff_dim)
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    return model

input_shape = (495, 256)  # 495 timesteps, 256 Mel features
embed_dim = 50  # Size of the embedding vector
num_heads = 10   # Number of attention heads
ff_dim = 192  # Hidden layer size in feed forward network inside transformer
max_seq_length = 495  # Maximum sequence length
num_classes = 8  # Number of emotions
num_layers = 3 # Number of transformer blocks

transformer = build_model(input_shape, embed_dim, num_heads, ff_dim, max_seq_length, num_classes, num_layers)
In [57]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=1000,
    decay_rate=0.9)
optimizer_sch = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

transformer.compile(optimizer=optimizer_sch, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

transformer = load_model('models/transformer.h5') # Load in model with above architecture and best weights

transformer.summary()
Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
==================================================================================================
 input_17 (InputLayer)       [(None, 495, 256)]           0         []                            
                                                                                                  
 gaussian_noise_16 (Gaussia  (None, 495, 256)             0         ['input_17[0][0]']            
 nNoise)                                                                                          
                                                                                                  
 dense_123 (Dense)           (None, 495, 50)              12850     ['gaussian_noise_16[0][0]']   
                                                                                                  
 tf.__operators__.add_108 (  (None, 495, 50)              0         ['dense_123[0][0]']           
 TFOpLambda)                                                                                      
                                                                                                  
 multi_head_attention_47 (M  (None, 495, 50)              101550    ['tf.__operators__.add_108[0][
 ultiHeadAttention)                                                 0]',                          
                                                                     'tf.__operators__.add_108[0][
                                                                    0]']                          
                                                                                                  
 dropout_98 (Dropout)        (None, 495, 50)              0         ['multi_head_attention_47[0][0
                                                                    ]']                           
                                                                                                  
 tf.__operators__.add_109 (  (None, 495, 50)              0         ['tf.__operators__.add_108[0][
 TFOpLambda)                                                        0]',                          
                                                                     'dropout_98[0][0]']          
                                                                                                  
 layer_normalization_92 (La  (None, 495, 50)              100       ['tf.__operators__.add_109[0][
 yerNormalization)                                                  0]']                          
                                                                                                  
 dense_124 (Dense)           (None, 495, 192)             9792      ['layer_normalization_92[0][0]
                                                                    ']                            
                                                                                                  
 dense_125 (Dense)           (None, 495, 50)              9650      ['dense_124[0][0]']           
                                                                                                  
 dropout_99 (Dropout)        (None, 495, 50)              0         ['dense_125[0][0]']           
                                                                                                  
 tf.__operators__.add_110 (  (None, 495, 50)              0         ['layer_normalization_92[0][0]
 TFOpLambda)                                                        ',                            
                                                                     'dropout_99[0][0]']          
                                                                                                  
 layer_normalization_93 (La  (None, 495, 50)              100       ['tf.__operators__.add_110[0][
 yerNormalization)                                                  0]']                          
                                                                                                  
 multi_head_attention_48 (M  (None, 495, 50)              101550    ['layer_normalization_93[0][0]
 ultiHeadAttention)                                                 ',                            
                                                                     'layer_normalization_93[0][0]
                                                                    ']                            
                                                                                                  
 dropout_100 (Dropout)       (None, 495, 50)              0         ['multi_head_attention_48[0][0
                                                                    ]']                           
                                                                                                  
 tf.__operators__.add_111 (  (None, 495, 50)              0         ['layer_normalization_93[0][0]
 TFOpLambda)                                                        ',                            
                                                                     'dropout_100[0][0]']         
                                                                                                  
 layer_normalization_94 (La  (None, 495, 50)              100       ['tf.__operators__.add_111[0][
 yerNormalization)                                                  0]']                          
                                                                                                  
 dense_126 (Dense)           (None, 495, 192)             9792      ['layer_normalization_94[0][0]
                                                                    ']                            
                                                                                                  
 dense_127 (Dense)           (None, 495, 50)              9650      ['dense_126[0][0]']           
                                                                                                  
 dropout_101 (Dropout)       (None, 495, 50)              0         ['dense_127[0][0]']           
                                                                                                  
 tf.__operators__.add_112 (  (None, 495, 50)              0         ['layer_normalization_94[0][0]
 TFOpLambda)                                                        ',                            
                                                                     'dropout_101[0][0]']         
                                                                                                  
 layer_normalization_95 (La  (None, 495, 50)              100       ['tf.__operators__.add_112[0][
 yerNormalization)                                                  0]']                          
                                                                                                  
 multi_head_attention_49 (M  (None, 495, 50)              101550    ['layer_normalization_95[0][0]
 ultiHeadAttention)                                                 ',                            
                                                                     'layer_normalization_95[0][0]
                                                                    ']                            
                                                                                                  
 dropout_102 (Dropout)       (None, 495, 50)              0         ['multi_head_attention_49[0][0
                                                                    ]']                           
                                                                                                  
 tf.__operators__.add_113 (  (None, 495, 50)              0         ['layer_normalization_95[0][0]
 TFOpLambda)                                                        ',                            
                                                                     'dropout_102[0][0]']         
                                                                                                  
 layer_normalization_96 (La  (None, 495, 50)              100       ['tf.__operators__.add_113[0][
 yerNormalization)                                                  0]']                          
                                                                                                  
 dense_128 (Dense)           (None, 495, 192)             9792      ['layer_normalization_96[0][0]
                                                                    ']                            
                                                                                                  
 dense_129 (Dense)           (None, 495, 50)              9650      ['dense_128[0][0]']           
                                                                                                  
 dropout_103 (Dropout)       (None, 495, 50)              0         ['dense_129[0][0]']           
                                                                                                  
 tf.__operators__.add_114 (  (None, 495, 50)              0         ['layer_normalization_96[0][0]
 TFOpLambda)                                                        ',                            
                                                                     'dropout_103[0][0]']         
                                                                                                  
 layer_normalization_97 (La  (None, 495, 50)              100       ['tf.__operators__.add_114[0][
 yerNormalization)                                                  0]']                          
                                                                                                  
 global_average_pooling1d_1  (None, 50)                   0         ['layer_normalization_97[0][0]
 5 (GlobalAveragePooling1D)                                         ']                            
                                                                                                  
 dense_130 (Dense)           (None, 8)                    408       ['global_average_pooling1d_15[
                                                                    0][0]']                       
                                                                                                  
==================================================================================================
Total params: 376834 (1.44 MB)
Trainable params: 376834 (1.44 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________

No description has been provided for this image

This multi-head transformer model takes in the audio data and outputs a prediction of each audio sample's emotion. First, the two-dimensional Mel-transformed data ($495$ timesteps, $256$ Mel features) is passed into the model. Gaussian noise with a standard deviation of $0.1$ is then applied. After this, the data is passed through a dense layer with $50$ nodes in order to get $495$ embeddings (one for each timestep) with embed_dim $=50$.

Drawing inspiration from the DeBERTa model, a custom relative positional encoder was used rather than a fixed sinusoidal positional encoder, as in traditional transformers. First, the get_relative_positions function is used to create a matrix that stores the distances between each of the $495$ timesteps in the audio data. Then the get_relative_positional_encoding function converts each relative position into a vector of size embed_dim $=50$ so that they can be added to the embeddings.

These embeddings are then passed through $3$ successive transformer layers. In each layer, multi-head attention with $10$ heads is applied; as configured above (key_dim $=$ embed_dim), each head projects the $50$-dimensional embeddings into its own query, key, and value spaces of size $50$. Dropout with a rate of $0.2$ is applied to the attention output, which is then added to its original input (a skip connection) and layer-normalized. The result is passed through a feed-forward network whose first dense layer has $192$ nodes and whose second has $50$ nodes, reshaping it back to the original embedding size of embed_dim $=50$. The feed-forward stage also ends with an add-and-normalize step.

After the embeddings have been transformed by the $3$ transformer layers, a global average pooling layer converts the $495$ x $50$ matrix into a $1$-dimensional vector of length $50$, essentially creating one embedding that represents the entire audio sample. Finally, this is passed into the softmax output layer, which returns a probability distribution across the $8$ emotions, with the highest value indicating the model's prediction. This model is also very light-weight, totaling only $376,834$ parameters.

Train Model¶

In [58]:
early_stopping = EarlyStopping(
    monitor='val_accuracy', 
    patience=20, 
    restore_best_weights=True)

callbacks = [early_stopping]

### Not training here, best model loaded in above
# history = transformer.fit(train_ds, validation_data=val_ds, epochs=200, verbose=1, callbacks=callbacks)

transformer_history

The model was trained for $83$ epochs, with the best validation accuracy of $51.4\%$ occurring at epoch $63$ (training stopped due to early stopping with a patience of $20$). The validation accuracy quickly rose to ~$35\%$ within the first couple of epochs, then climbed slowly from there. The training accuracy showed standard concave growth and was around $90\%$ by the end of training. Conversely, the training loss had a convex decline, going from $2$ to around $0.25$. Interestingly, though, the validation loss reached its minimum of $1.71$ at epoch $8$ and rose steadily thereafter; at epoch $63$, which had the highest validation accuracy, the validation loss was ~$2.5$. This means that, although the validation accuracy was improving, the loss of the output probability distributions was increasing. However, our performance metric was accuracy, so that is what we monitored.
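
The curves described above can be reproduced roughly as follows, assuming transformer_history holds the standard Keras history dictionary (i.e., the .history attribute of the object returned by fit()); this is a sketch, not the exact plotting code we used.

# Sketch of plotting the training curves, assuming transformer_history is a
# dict with the usual Keras keys ('accuracy', 'val_accuracy', 'loss', 'val_loss').
hist = transformer_history

plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(hist['accuracy'], label='train')
plt.plot(hist['val_accuracy'], label='validation')
plt.title('Transformer Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(hist['loss'], label='train')
plt.plot(hist['val_loss'], label='validation')
plt.title('Transformer Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()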

Evaluation & Analysis¶

In [59]:
loss, accuracy = transformer.evaluate(val_ds)
print(f"Validation Loss: {loss:.2f}, Validation Accuracy: {accuracy:.2%}")
9/9 [==============================] - 2s 70ms/step - loss: 2.5667 - accuracy: 0.5139
Validation Loss: 2.57, Validation Accuracy: 51.39%
In [60]:
evaluate_predictions(transformer, "Transformer", X_test=X_val, y_test=y_val)
9/9 [==============================] - 1s 64ms/step
[Figure: Transformer confusion matrix on the validation set]

The validation accuracy for this transformer architecture with its best set of weights was $51.4\%$. This relatively strong performance shows up as the pronounced diagonal in the confusion matrix above. The model performed best on "calm", with an accuracy of $69.2\%$, and worst on "sad", with an accuracy of $36.8\%$. It also over-predicted "surprised" the most, with $35$ false positives and a precision of $40.7\%$; in fact, for $5$ of the $7$ other emotions, "surprised" was the most common incorrect prediction. Thus, the model tends to predict "surprised" somewhat naively. Overall, though, the transformer's errors are fairly evenly distributed, with no off-diagonal cell having a value above $9$.
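
The per-emotion figures quoted above can be read directly off the confusion matrix. A minimal sketch follows, assuming id2label (defined earlier in the notebook) maps the 0-7 class indices to emotion names and glossing over any offset between Emotion_Number and those indices.

# Sketch: per-emotion recall (diagonal / row sums) and precision (diagonal /
# column sums), with rows as true labels and columns as predictions, as in
# sklearn's confusion_matrix. The id2label mapping is assumed to match the
# class indices used here.
cm = confusion_matrix(y_val, np.argmax(transformer.predict(X_val, verbose=0), axis=1))

recall = np.diag(cm) / cm.sum(axis=1)       # per-emotion accuracy, e.g. ~0.69 for "calm"
precision = np.diag(cm) / cm.sum(axis=0)    # e.g. ~0.41 for "surprised"

for i in range(cm.shape[0]):
    print(f"{id2label[i]}: recall {recall[i]:.1%}, precision {precision[i]:.1%}")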

The transformer is also very lightweight, especially considering its performance. Its $377\text{k}$ parameter count is more than $8\text{x}$ smaller than the LSTM's ($3.1\text{M}$) and more than $250\text{x}$ smaller than the SOTA model's ($94.6\text{M}$).

SOTA¶

Here we fine-tune the base version of the second iteration of Facebook’s Wav2Vec model, described in their paper “wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations.” This model has 95 million parameters and was trained on thousands of hours of raw speech audio sampled at 16kHz. Because our own data is sampled at 48kHz, we downsampled each input to 16kHz. To implement the model, we used a Trainer object from the transformers module, which is specialized for fine-tuning pretrained models. We also employed the module’s AutoFeatureExtractor class, which normalizes and processes the audio data into the form the model requires. Our code was heavily inspired by this HuggingFace tutorial on fine-tuning an audio model. Due to the long training time and limited GPU access on Colab (we ran into various PyTorch dependency errors on JupyterHub), we were only able to train for 10 epochs, which took approximately 32 minutes.

Data Preparation¶

In [61]:
audio_dataset = Dataset.from_dict({"audio": wav_paths}).cast_column("audio", AudioCast())
audio_dataset = audio_dataset.add_column('label', df['Emotion_Number']-1)
In [62]:
audio_dataset = audio_dataset.train_test_split(test_size=0.2)
In [63]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]
config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]
In [64]:
audio = audio_dataset.cast_column("audio", AudioCast(sampling_rate=16_000))
In [65]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, padding='longest')
    return inputs
In [66]:
encoded_audio = audio.map(preprocess_function, remove_columns="audio", batched=True)
Map:   0%|          | 0/1152 [00:00<?, ? examples/s]
Map:   0%|          | 0/288 [00:00<?, ? examples/s]

Load Model¶

In [67]:
accuracy = evaluate.load("accuracy")
Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]
In [68]:
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
In [69]:
num_labels = len(id2label)

##Load Wav2Vec Model 
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)
pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [70]:
summary(model)
Out[70]:
================================================================================
Layer (type:depth-idx)                                  Param #
================================================================================
Wav2Vec2ForSequenceClassification                       --
├─Wav2Vec2Model: 1-1                                    768
│    └─Wav2Vec2FeatureEncoder: 2-1                      --
│    │    └─ModuleList: 3-1                             4,200,448
│    └─Wav2Vec2FeatureProjection: 2-2                   --
│    │    └─LayerNorm: 3-2                              1,024
│    │    └─Linear: 3-3                                 393,984
│    │    └─Dropout: 3-4                                --
│    └─Wav2Vec2Encoder: 2-3                             --
│    │    └─Wav2Vec2PositionalConvEmbedding: 3-5        4,719,488
│    │    └─LayerNorm: 3-6                              1,536
│    │    └─Dropout: 3-7                                --
│    │    └─ModuleList: 3-8                             85,054,464
├─Linear: 1-2                                           196,864
├─Linear: 1-3                                           2,056
================================================================================
Total params: 94,570,632
Trainable params: 94,570,632
Non-trainable params: 0
================================================================================

No description has been provided for this image

In [73]:
#Load model that was trained on Google Colab
# model = AutoModelForAudioClassification.from_pretrained('models/wav2vec10epochs')

Configure & Train Model¶

In [101]:
# training_args = TrainingArguments(
#     output_dir="wav2vec2_audio",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=3e-5,
#     per_device_train_batch_size=32,
#     gradient_accumulation_steps=4,
#     per_device_eval_batch_size=32,
#     num_train_epochs=10,
#     warmup_ratio=0.1,
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy"
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=encoded_audio["train"],
#     eval_dataset=encoded_audio["test"],
#     tokenizer=feature_extractor,
#     compute_metrics=compute_metrics,
# )

# trainer.train()
In [74]:
#load saved results from training on colab
metrics=pd.read_csv('TrainingResults.csv')

epochs=metrics['Epoch'].to_list()
train_loss=metrics['Training Loss'].to_list()
val_loss=metrics['Validation Loss'].to_list()
val_accuracy=metrics['Validation Accuracy'].to_list()
In [75]:
plt.figure(figsize=(10, 4))

# Plot validation accuracy
plt.subplot(1, 2, 1)
plt.plot(epochs, val_accuracy, c='darkorange', label='validation')
plt.title('SOTA Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot training and validation loss
plt.subplot(1, 2, 2)
plt.plot(epochs[1:], train_loss[1:], label='train')
plt.plot(epochs, val_loss, label='validation')
plt.title('SOTA Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()  # Adjust layout to prevent overlapping
plt.show()
[Figure: SOTA (Wav2Vec2) validation accuracy and training/validation loss over 10 epochs]

Both training and validation loss decrease steadily with more epochs, and validation accuracy reaches 62.5% by epoch 10. Although the improvement in validation accuracy and the decrease in loss appear to be slowly plateauing, these results indicate that training for more epochs would likely yield further improvement. With the addition of data augmentation techniques such as Gaussian noise, the results could potentially be improved even more. Unfortunately, limited compute resources and time restricted our ability to test different hyperparameters.
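
As one concrete (and untested) example of such augmentation, Gaussian noise could be added to the raw waveforms before feature extraction, mirroring the preprocess_function defined earlier; the noise scale below is an arbitrary assumption and this is a sketch rather than code we ran.

# Sketch of waveform-level Gaussian-noise augmentation applied before the
# Wav2Vec2 feature extractor. The 0.005 noise scale is an arbitrary assumption
# and would need tuning; augmentation should only be applied to the training split.
def preprocess_with_noise(examples, noise_std=0.005):
    audio_arrays = []
    for x in examples["audio"]:
        arr = np.asarray(x["array"], dtype=np.float32)
        audio_arrays.append(arr + np.random.normal(0.0, noise_std, size=arr.shape).astype(np.float32))
    return feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, padding="longest")

# encoded_train_aug = audio["train"].map(
#     preprocess_with_noise, remove_columns="audio", batched=True)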

Evaluation & Analysis¶

In [76]:
### Get model predictions on validation set

# output = trainer.predict(encoded_audio['test'])

# predictions = output.predictions
# label_ids = output.label_ids
# metrics = output.metrics

# preds=[np.argmax(x) for x in predictions]
In [77]:
#Load saved predictions
outputs_df = pd.read_csv('predictions.csv')
In [78]:
sns.heatmap(confusion_matrix(outputs_df['labels'], outputs_df['preds']), annot=True, xticklabels = [id2label[i] for i in range(8)], yticklabels = [id2label[i] for i in range(8)], cmap='Blues');
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Wav2Vec2 Confusion Matrix')
plt.show()
[Figure: Wav2Vec2 confusion matrix on the validation set]

Above is the confusion matrix for predictions made on the validation set. Interestingly, the model never predicts neutral and instead predicts calm for every neutral sample. This is consistent with what we expected, as we previously noted that calm and neutral are redundant emotions with little perceivable distinction. Indeed, many of the errors the model makes are sensible: it frequently mistakes sad, a low-energy emotion, for calm, and it mistakes fearful for surprised. This further indicates that the model is doing quite well at capturing the essential emotional characteristics.

Discussion¶

As the human benchmark reveals, this emotion classification task is a fundamentally difficult problem. There is significant variation even in how voice actors interpret emotions. At around 50% accuracy, both the transformer and LSTM models do quite well considering their parameter counts relative to Facebook’s pretrained Wav2Vec2 model; the LSTM and transformer have roughly 30 times and 250 times fewer parameters, respectively. Nevertheless, despite the training cost that comes with its large parameter count, the fine-tuned Wav2Vec2 robustly outperforms every other model we tried, achieving 83%, 84%, 98%, and 100% accuracy for disgust, fearful, angry, and calm respectively. Moreover, many of its primary misclassifications are understandable in that they align with human error: it frequently misclassified neutral and sad as calm, and surprised as fearful. Humans also had significant difficulty identifying sad and neutral, to the point that neutral had to be removed from the emotions tested in the human benchmark in order to avoid confusion.

One area of future improvement for both the LSTM and transformer models is reducing the overfitting that typically sets in during the early epochs, before the model has reached its best validation accuracy. We began to experiment briefly with solutions to this on the LSTM model, but much more could be done with both models. Our intuition was to make the model learn more slowly while still reaching the same performance, so that the validation accuracy could keep up. One idea we began testing was to go back to the 4-layer LSTM approach but make all of the layers much smaller (512-128 units). Because each layer now learns much less, this model took far longer to train (250 epochs) and reached slightly lower validation accuracies of around 52%, but it achieved a much better validation loss of 1.45. Additionally, when we plotted the curves, the overfitting not only started proportionally later, but the validation accuracies also stayed much closer to the training accuracies throughout training. Using a similar approach, we could test a 3-layer LSTM and tune its layer sizes to see whether this further reduces overfitting; the later the model begins to overfit, the higher we think the validation accuracy can get before flattening out. A rough sketch of this narrower model is given below.
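
Only the 512-128 unit range is stated above, so the intermediate widths, noise level, and dropout rate in this sketch are assumptions.

# Sketch of the narrower 4-layer LSTM described above; intermediate layer
# widths are assumptions within the stated 512-128 range.
slow_lstm = Sequential([
    Input(shape=(495, 256)),
    GaussianNoise(0.1),
    LSTM(512, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dense(64, activation='relu'),
    Dense(8, activation='softmax'),
])
slow_lstm.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])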

Another future improvement would be a more rigorous and structured analysis of which emotions are learned at which point in training for each model. We did this briefly with the LSTM model by plotting the confusion matrix at intermediate points while the model was training. However, if we ran this analysis multiple times and kept track of the exact statistics, instead of relying on a visual eye test, it could be very useful: we could use the data to train the models to better differentiate between emotions that are close together or that tend to be predicted incorrectly.
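
One way to make that analysis systematic for the Keras models would be a callback that records per-emotion recall after every epoch; the sketch below is one possible implementation, not code we actually ran.

# Sketch of a Keras callback that records per-emotion recall after each epoch,
# turning the "eye test" into exact statistics that can be compared across runs.
class PerClassRecall(keras.callbacks.Callback):
    def __init__(self, x_val, y_val, num_classes=8):
        super().__init__()
        self.x_val = x_val
        self.y_val = y_val
        self.num_classes = num_classes
        self.per_epoch_recall = []   # one array of 8 recalls per epoch

    def on_epoch_end(self, epoch, logs=None):
        preds = np.argmax(self.model.predict(self.x_val, verbose=0), axis=1)
        cm = confusion_matrix(self.y_val, preds, labels=list(range(self.num_classes)))
        recall = np.diag(cm) / np.maximum(cm.sum(axis=1), 1)
        self.per_epoch_recall.append(recall)

# Usage (assumed): include PerClassRecall(X_val, y_val) in the callbacks list passed to fit().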

While experimenting with the transformer model, we found a significant 5% improvement in accuracy by increasing the number of pitch frequencies returned by the Mel spectrogram at each time step, and more experimentation in this area could yield better results. One thing we were unable to do, due to computational constraints, was invert the Mel spectrogram back into an audio file to find out how much information is lost in the conversion; it would be helpful to hear how the audio sounds after being transformed to Mel and back again. Another idea we did not try is overlapping windows: our implementation with librosa separated the audio clip into discrete time windows, but others have had these windows overlap to create a more fluid transition between pitches across time. This fluidity might be important for emotion classification, or might let us increase complexity in a more meaningful way.
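
Both ideas map directly onto librosa: librosa.feature.inverse.mel_to_audio approximately inverts a Mel spectrogram via Griffin-Lim, and the hop_length argument of librosa.feature.melspectrogram controls how much consecutive windows overlap. The sketch below uses illustrative parameter values and a placeholder file path, not the settings from our preprocessing.

# Sketch: overlapping analysis windows (hop_length < n_fft) and approximate
# inversion of the Mel spectrogram back to audio to hear what is lost.
y, sr = librosa.load("path/to/some_clip.wav", sr=48000)   # placeholder path

mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)   # 75% window overlap

y_reconstructed = librosa.feature.inverse.mel_to_audio(
    mel, sr=sr, n_fft=2048, hop_length=512)

# Audio(y_reconstructed, rate=sr)  # listen to the reconstruction in the notebook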

Another obvious way to improve the performance of our models is to add more training data. This could include more voice actors saying the same phrases, or labelled audio samples from other forms of media, of which the internet provides a seemingly infinite supply. More data would allow the models to extract better representations from the audio. A large part of the SOTA Wav2Vec2's superior performance stems from the fact that it was pretrained on a massive audio database, whereas our own models only saw the RAVDESS data. Moving beyond the same 24 voice actors and the same 2 phrases would result in cleaner and more complex features being learned, and deeper architectures would then be needed to capture patterns and temporal dependencies with strong predictive power. At the end of the day, no matter how complex a deep learning model is, it must be given enough high-quality data to work. For example, if you fed a new phrase from a new voice into our custom models, they would likely perform much worse than the validation accuracies suggest; if, however, we significantly expanded the dataset, the trained models should learn features that generalize much better to unseen data.

With more time, we would also love to continue exploring CNNs for this task. A convolutional neural network seems like a promising option given the sheer amount of data we take as input: even after processing via the Mel spectrogram, there are still $128 \times 495 = 63360$ values per audio sample. We would like to detect the key features of the data using convolutional filters, much as we do for images. A working theory is that each emotion may be associated with certain patterns of frequencies or amplitudes, and the convolutional layers may be able to detect these patterns; anger, for example, may be associated with higher-amplitude bursts (louder voices) or higher-frequency bursts (sped-up talking). Additionally, the shift-invariance of CNNs should be beneficial: in theory it should not matter whether the patterns associated with anger appear at the beginning or the end of the audio sample, so we do not want to take their position into account. For some reason, however, the CNNs simply were not working despite our best efforts. The loss was astronomical, and we could never exceed an accuracy of around 18% (barely better than a random guess among the 8 options). We believe this could be caused by non-local relationships in the Mel spectrogram output, but we would need more time to explore it.
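
For reference, the kind of CNN we had in mind treats each Mel spectrogram as a one-channel image; a minimal sketch is below, with layer sizes and filter counts as assumptions rather than the exact configurations we tried.

# Minimal sketch of the CNN idea: treat each 128 x 495 Mel spectrogram as a
# one-channel image. Layer sizes and filter counts are illustrative assumptions.
cnn_sketch = Sequential([
    Input(shape=(128, 495, 1)),                        # (Mel bins, timesteps, channels)
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(32, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(8, activation='softmax'),
])
cnn_sketch.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])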

Overall, we were content with our progress on this challenging task, and very much enjoyed exploring the world of audio networks.