# uni/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.py

# -*- coding: utf-8 -*-
"""assignment.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ILT6kccc8NHrY7xonz-ERej4MkGAw1Qv

Set up
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the pre-split training and test sets from the Colab Drive paths.
train_data = pd.read_csv('/content/drive/MyDrive/wildfires_training.csv')
test_data = pd.read_csv('/content/drive/MyDrive/wildfires_test.csv')

# 'fire' is the target column; every remaining column is used as a feature.
X_train, y_train = train_data.drop('fire', axis='columns'), train_data['fire']
X_test, y_test = test_data.drop('fire', axis='columns'), test_data['fire']

"""RandomForestClassifier with default parameters:"""

# Baseline model: a random forest with scikit-learn's default hyperparameters.
# random_state is fixed so repeated runs build the same forest.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

# Evaluate on the data the model was fitted on (training accuracy).
train_predictions = rfc.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_report = classification_report(y_train, train_predictions)

print(f"Training Accuracy: {train_accuracy:.4f}\n")
# This heading previously said "Testing Results" although the report printed
# below it is computed from the training predictions.
print("Classification Report of Training Results:")
print(train_report)

# Evaluate on the separate test set.
test_predictions = rfc.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)

print(f"Testing Accuracy: {test_accuracy:.4f}")
print("Classification Report of Testing Results:")
print(test_report)

# Confusion matrix of the test predictions, drawn as an annotated heatmap.
# Tick labels are the unique values found in y_test.
cm = confusion_matrix(y_test, test_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

"""RandomForestClassifier with tuning:"""

# Hyperparameter grid: forest size (heatmap columns) x maximum tree depth
# (heatmap rows). max_depth=None leaves the tree depth unrestricted.
n_estimators_range = [1, 3, 5, 10, 50, 100, 250, 500, 1000]
max_depth_range = [1, 5, 10, 20, 30, None]

# accuracy_matrix[row, col] holds the test accuracy obtained with
# max_depth_range[row] and n_estimators_range[col].
accuracy_matrix = np.zeros((len(max_depth_range), len(n_estimators_range)))

# Running best. Only a strictly higher accuracy replaces it, so on ties the
# combination that was tried first is kept.
best_accuracy = 0
best_n_estimators = None
best_max_depth = None

# NOTE(review): every combination is scored on the test set, so the "best"
# hyperparameters are selected with the same data that is used to report
# their accuracy.
for row, max_depth in enumerate(max_depth_range):
    for col, n_estimators in enumerate(n_estimators_range):
        # Same seed for every forest so only the hyperparameters vary.
        rfc = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=0,
        ).fit(X_train, y_train)

        y_pred = rfc.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_matrix[row, col] = accuracy

        if accuracy > best_accuracy:
            best_accuracy, best_n_estimators, best_max_depth = (
                accuracy, n_estimators, max_depth)

# Annotated heatmap of the accuracy grid; str() turns the None depth into the
# tick label "None".
depth_labels = [str(depth) for depth in max_depth_range]
plt.figure(figsize=(10, 6))
sns.heatmap(accuracy_matrix, cmap="YlGnBu", annot=True, fmt=".3f",
            xticklabels=n_estimators_range,
            yticklabels=depth_labels)
plt.xlabel('n_estimators')
plt.ylabel('max_depth')
plt.title('Accuracy for different n_estimators and max_depth values')
plt.show()

print(
    f"Best Accuracy: {best_accuracy:.3f}",
    f"Best n_estimators: {best_n_estimators}",
    f"Best max_depth: {best_max_depth}",
    sep="\n",
)

"""SVC classifier with default params on the unprocessed data"""

# Support vector classifier with default hyperparameters, fitted on the
# features exactly as they were loaded.
svc = SVC()
svc.fit(X_train, y_train)

# Predict both splits up front, then report each in turn.
svc_train_predictions = svc.predict(X_train)
svc_test_predictions = svc.predict(X_test)

# Training-set metrics.
svc_train_accuracy = accuracy_score(y_train, svc_train_predictions)
svc_train_report = classification_report(y_train, svc_train_predictions)

print(f"Training Accuracy: {svc_train_accuracy:.4f}\n")
print("Classification Report of Training Results:")
print(svc_train_report)

# Test-set metrics.
svc_test_accuracy = accuracy_score(y_test, svc_test_predictions)
svc_test_report = classification_report(y_test, svc_test_predictions)

print(f"Testing Accuracy: {svc_test_accuracy:.4f}")
print("Classification Report of Testing Results:")
print(svc_test_report)

# Confusion matrix of the test predictions.
# NOTE(review): the tick labels are hard-coded; this assumes exactly two
# classes whose sorted order corresponds to 'No' then 'Yes' - confirm
# against the values in the 'fire' column.
class_names = ['No', 'Yes']
cm = confusion_matrix(y_test, svc_test_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix for Testing Results')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

"""SVC with hyperparameter tuning"""

# Hyperparameter grid: regularisation strength C (heatmap columns) x kernel
# type (heatmap rows).
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
kernel_types = ['linear', 'poly', 'rbf', 'sigmoid']

# accuracy_matrix[row, col] holds the test accuracy obtained with
# kernel_types[row] and C_values[col].
accuracy_matrix = np.zeros((len(kernel_types), len(C_values)))

# Running best. Only a strictly higher accuracy replaces it, so on ties the
# combination that was tried first is kept.
best_accuracy = 0
best_C = None
best_kernel = None

# NOTE(review): every combination is scored on the test set, so the "best"
# hyperparameters are selected with the same data that is used to report
# their accuracy.
for row, kernel in enumerate(kernel_types):
    for col, C in enumerate(C_values):
        svc = SVC(kernel=kernel, C=C).fit(X_train, y_train)

        svc_test_predictions = svc.predict(X_test)
        accuracy = accuracy_score(y_test, svc_test_predictions)
        accuracy_matrix[row, col] = accuracy

        if accuracy > best_accuracy:
            best_accuracy, best_C, best_kernel = accuracy, C, kernel

# Annotated heatmap of the accuracy grid; C tick labels are shown with three
# decimal places.
c_labels = [f"{c_value:.3f}" for c_value in C_values]
plt.figure(figsize=(10, 6))
sns.heatmap(accuracy_matrix, cmap='Blues', annot=True, fmt=".2f",
            xticklabels=c_labels,
            yticklabels=kernel_types)
plt.xlabel('C Value')
plt.ylabel('Kernel Type')
plt.title('Accuracy for different C values and kernel types')
plt.show()

print(
    f"Best Accuracy: {best_accuracy:.3f}",
    f"Best C: {best_C:.3f}",
    f"Best Kernel: {best_kernel}",
    sep="\n",
)