[CT4101]: Add Assignment 1 code
This commit is contained in:
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,187 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""assignment.ipynb
|
||||
|
||||
Automatically generated by Colab.
|
||||
|
||||
Original file is located at
|
||||
https://colab.research.google.com/drive/1ILT6kccc8NHrY7xonz-ERej4MkGAw1Qv
|
||||
|
||||
Set up
|
||||
"""
|
||||
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.metrics import accuracy_score, classification_report
|
||||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
||||
from sklearn.svm import SVC
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
|
||||
# Load the training split and separate the 'fire' target column from the
# remaining columns, which are used as the model inputs.
train_data = pd.read_csv('/content/drive/MyDrive/wildfires_training.csv')
y_train = train_data['fire']
X_train = train_data.drop('fire', axis=1)

# Same treatment for the test split.
test_data = pd.read_csv('/content/drive/MyDrive/wildfires_test.csv')
y_test = test_data['fire']
X_test = test_data.drop('fire', axis=1)
|
||||
|
||||
"""RandomForestClassifier with default parameters:"""
|
||||
|
||||
# Baseline: a random forest with scikit-learn's default hyperparameters.
# random_state is pinned so repeated runs build the same forest.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

# Score the model on the data it was fitted on.
train_predictions = rfc.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_report = classification_report(y_train, train_predictions)

print(f"Training Accuracy: {train_accuracy:.4f}\n")
# This report is built from the training predictions, so it is labelled as
# training output (it was previously mislabelled as testing results).
print("Classification Report of Training Results:")
print(train_report)

# Score the same model on the test split.
test_predictions = rfc.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)

print(f"Testing Accuracy: {test_accuracy:.4f}")
print("Classification Report of Testing Results:")
print(test_report)

# Confusion matrix of the test predictions, drawn as a heatmap.
# The label list is the sorted union of true and predicted classes and is
# passed to confusion_matrix explicitly, so the tick labels always line up
# with the matrix rows/columns even if a class shows up only in the
# predictions.
class_labels = np.union1d(y_test, test_predictions)
cm = confusion_matrix(y_test, test_predictions, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()
|
||||
|
||||
"""RandomForestClassifier with tuning:"""
|
||||
|
||||
# Candidate values for the two random forest hyperparameters being explored.
n_estimators_range = [1, 3, 5, 10, 50, 100, 250, 500, 1000]
max_depth_range = [1, 5, 10, 20, 30, None]

# accuracy_matrix[row, col] holds the test accuracy obtained with
# max_depth_range[row] and n_estimators_range[col].
accuracy_matrix = np.zeros((len(max_depth_range), len(n_estimators_range)))

# Running record of the best-scoring combination seen so far.
best_accuracy = 0
best_n_estimators = None
best_max_depth = None

# NOTE(review): each candidate is scored on the test split, so the "best"
# accuracy reported below is selected on the same data it is measured on and
# is likely optimistic. Consider selecting on a validation split or with
# cross-validation on the training data instead.
# np.ndindex walks the grid row by row, i.e. every n_estimators value for the
# first max_depth, then the next max_depth, and so on.
for row, col in np.ndindex(accuracy_matrix.shape):
    max_depth = max_depth_range[row]
    n_estimators = n_estimators_range[col]

    rfc = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
    rfc.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, rfc.predict(X_test))
    accuracy_matrix[row, col] = accuracy

    # Strict '>' keeps the earliest combination when several tie.
    if accuracy > best_accuracy:
        best_accuracy, best_n_estimators, best_max_depth = accuracy, n_estimators, max_depth

# Heatmap of the accuracy grid; str(None) renders the unlimited-depth row
# label as "None".
plt.figure(figsize=(10, 6))
ax = sns.heatmap(
    accuracy_matrix,
    annot=True,
    fmt=".3f",
    cmap="YlGnBu",
    xticklabels=n_estimators_range,
    yticklabels=[str(depth) for depth in max_depth_range],
)
ax.set_title('Accuracy for different n_estimators and max_depth values')
ax.set_xlabel('n_estimators')
ax.set_ylabel('max_depth')
plt.show()

print(f"Best Accuracy: {best_accuracy:.3f}")
print(f"Best n_estimators: {best_n_estimators}")
print(f"Best max_depth: {best_max_depth}")
|
||||
|
||||
"""SVC classifier with default params on the unprocessed data"""
|
||||
|
||||
# Baseline SVC with scikit-learn's default hyperparameters, fitted on the
# features exactly as loaded (no scaling or encoding is applied here).
# fit() returns the estimator itself, so construction and fitting are chained.
svc = SVC().fit(X_train, y_train)

# Score the model on the data it was fitted on.
svc_train_predictions = svc.predict(X_train)
svc_train_accuracy = accuracy_score(y_train, svc_train_predictions)
svc_train_report = classification_report(y_train, svc_train_predictions)

print(f"Training Accuracy: {svc_train_accuracy:.4f}\n")
print("Classification Report of Training Results:")
print(svc_train_report)

# Score the same model on the test split.
svc_test_predictions = svc.predict(X_test)
svc_test_accuracy = accuracy_score(y_test, svc_test_predictions)
svc_test_report = classification_report(y_test, svc_test_predictions)

print(f"Testing Accuracy: {svc_test_accuracy:.4f}")
print("Classification Report of Testing Results:")
print(svc_test_report)

# Confusion matrix of the test predictions, drawn as a heatmap.
# NOTE(review): the 'No'/'Yes' tick labels are hard-coded and assume the two
# classes sort in that order -- confirm against the values in the 'fire' column.
cm = confusion_matrix(y_test, svc_test_predictions)
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No', 'Yes'],
    yticklabels=['No', 'Yes'],
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Testing Results')
plt.show()
|
||||
|
||||
"""SVC with hyperparameter tuning"""
|
||||
|
||||
# Candidate values for the two SVC hyperparameters being explored.
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
kernel_types = ['linear', 'poly', 'rbf', 'sigmoid']

# accuracy_matrix[row, col] holds the test accuracy obtained with
# kernel_types[row] and C_values[col].
accuracy_matrix = np.zeros((len(kernel_types), len(C_values)))

# Running record of the best-scoring combination seen so far.
best_accuracy = 0
best_C = None
best_kernel = None

# NOTE(review): each candidate is scored on the test split, so the "best"
# accuracy reported below is selected on the same data it is measured on and
# is likely optimistic. Consider selecting on a validation split or with
# cross-validation on the training data instead.
# np.ndindex walks the grid row by row, i.e. every C value for the first
# kernel, then the next kernel, and so on.
for row, col in np.ndindex(accuracy_matrix.shape):
    kernel = kernel_types[row]
    C = C_values[col]

    svc = SVC(C=C, kernel=kernel)
    svc.fit(X_train, y_train)

    svc_test_predictions = svc.predict(X_test)
    accuracy = accuracy_score(y_test, svc_test_predictions)
    accuracy_matrix[row, col] = accuracy

    # Strict '>' keeps the earliest combination when several tie.
    if accuracy > best_accuracy:
        best_accuracy, best_C, best_kernel = accuracy, C, kernel

# Heatmap of the accuracy grid, one row per kernel and one column per C.
plt.figure(figsize=(10, 6))
ax = sns.heatmap(
    accuracy_matrix,
    annot=True,
    fmt=".2f",
    cmap='Blues',
    xticklabels=[f"{value:.3f}" for value in C_values],
    yticklabels=kernel_types,
)
ax.set_title('Accuracy for different C values and kernel types')
ax.set_xlabel('C Value')
ax.set_ylabel('Kernel Type')
plt.show()

print(f"Best Accuracy: {best_accuracy:.3f}")
print(f"Best C: {best_C:.3f}")
print(f"Best Kernel: {best_kernel}")
|
Reference in New Issue
Block a user