187 lines
6.1 KiB
Python
187 lines
6.1 KiB
Python
# -*- coding: utf-8 -*-
"""assignment.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ILT6kccc8NHrY7xonz-ERej4MkGAw1Qv

Set up
"""
|
|
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.metrics import accuracy_score
|
|
from sklearn.metrics import accuracy_score, classification_report
|
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
|
from sklearn.metrics import confusion_matrix
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
from sklearn.svm import SVC
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import pandas as pd
|
|
import seaborn as sns
|
|
|
|
# Read the pre-split wildfire CSVs (paths point at /content/drive/MyDrive).
train_data = pd.read_csv('/content/drive/MyDrive/wildfires_training.csv')
test_data = pd.read_csv('/content/drive/MyDrive/wildfires_test.csv')

# For each split, the 'fire' column is the target and every other column
# is used as a predictor.
y_train = train_data['fire']
X_train = train_data.drop(columns=['fire'])

y_test = test_data['fire']
X_test = test_data.drop(columns=['fire'])
|
|
|
|
"""RandomForestClassifier with default parameters:"""
|
|
|
|
# Initialise the random forest with a fixed random seed and fit it on the
# training split.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

# Evaluate on the training split (the same data the model was fitted on).
train_predictions = rfc.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_report = classification_report(y_train, train_predictions)

print(f"Training Accuracy: {train_accuracy:.4f}\n")
# The report printed here is built from the training predictions, so the
# header must say "Training", not "Testing".
print("Classification Report of Training Results:")
print(train_report)

# Evaluate on the test split.
test_predictions = rfc.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)

print(f"Testing Accuracy: {test_accuracy:.4f}")
print("Classification Report of Testing Results:")
print(test_report)

# Confusion matrix of the test predictions.
# confusion_matrix orders rows/columns by the sorted labels that occur in
# either the true or the predicted values. Computing that union once and
# passing it explicitly keeps the tick labels aligned with the matrix even
# when a class is absent from y_test or from the predictions.
class_labels = np.union1d(y_test, test_predictions)
cm = confusion_matrix(y_test, test_predictions, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()
|
|
|
|
"""RandomForestClassifier with tuning:"""
|
|
|
|
# Hyperparameter grid: forest size (heatmap columns) and tree depth limit
# (heatmap rows).
n_estimators_range = [1, 3, 5, 10, 50, 100, 250, 500, 1000]
max_depth_range = [1, 5, 10, 20, 30, None]

# accuracy_matrix[row, col] holds the test accuracy obtained with
# max_depth_range[row] and n_estimators_range[col].
accuracy_matrix = np.zeros((len(max_depth_range), len(n_estimators_range)))

# Best combination seen so far; only a strictly higher accuracy replaces
# it, so the first combination reaching the maximum is the one reported.
best_accuracy = 0
best_n_estimators = None
best_max_depth = None

# NOTE(review): every combination is scored on X_test / y_test, so the
# "best" accuracy below was selected using the test split itself.
for row, max_depth in enumerate(max_depth_range):
    for col, n_estimators in enumerate(n_estimators_range):
        rfc = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=0,
        ).fit(X_train, y_train)

        y_pred = rfc.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_matrix[row, col] = accuracy

        if accuracy > best_accuracy:
            best_accuracy, best_n_estimators, best_max_depth = (
                accuracy, n_estimators, max_depth)

# Heatmap of the accuracy grid; str() renders the unlimited-depth setting
# (None) as the label "None".
depth_tick_labels = [str(depth) for depth in max_depth_range]

plt.figure(figsize=(10, 6))
sns.heatmap(accuracy_matrix, annot=True, fmt=".3f", cmap="YlGnBu",
            xticklabels=n_estimators_range,
            yticklabels=depth_tick_labels)
plt.title('Accuracy for different n_estimators and max_depth values')
plt.xlabel('n_estimators')
plt.ylabel('max_depth')
plt.show()

print(f"Best Accuracy: {best_accuracy:.3f}")
print(f"Best n_estimators: {best_n_estimators}")
print(f"Best max_depth: {best_max_depth}")
|
|
|
|
"""SVC classifier with default params on the unprocessed data"""
|
|
|
|
# Support vector classifier with all-default hyperparameters, fitted on the
# feature columns exactly as loaded (no scaling or encoding step is applied
# in this block).
svc = SVC().fit(X_train, y_train)

# Metrics on the training split.
svc_train_predictions = svc.predict(X_train)
svc_train_accuracy = accuracy_score(y_train, svc_train_predictions)
svc_train_report = classification_report(y_train, svc_train_predictions)

print(f"Training Accuracy: {svc_train_accuracy:.4f}\n")
print("Classification Report of Training Results:", svc_train_report, sep="\n")

# Metrics on the test split.
svc_test_predictions = svc.predict(X_test)
svc_test_accuracy = accuracy_score(y_test, svc_test_predictions)
svc_test_report = classification_report(y_test, svc_test_predictions)

print(f"Testing Accuracy: {svc_test_accuracy:.4f}")
print("Classification Report of Testing Results:", svc_test_report, sep="\n")

# Confusion matrix of the test predictions.
# NOTE(review): the tick labels are hard-coded; this assumes the target has
# exactly two classes whose sorted order is (no, yes) -- confirm against the
# values in the 'fire' column.
class_names = ['No', 'Yes']
cm = confusion_matrix(y_test, svc_test_predictions)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Testing Results')
plt.show()
|
|
|
|
"""SVC with hyperparameter tuning"""
|
|
|
|
# Hyperparameter grid: SVC's C parameter (heatmap columns) and kernel type
# (heatmap rows).
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
kernel_types = ['linear', 'poly', 'rbf', 'sigmoid']

# accuracy_matrix[row, col] holds the test accuracy obtained with
# kernel_types[row] and C_values[col].
accuracy_matrix = np.zeros((len(kernel_types), len(C_values)))

# Best combination seen so far; only a strictly higher accuracy replaces
# it, so the first combination reaching the maximum is the one reported.
best_accuracy = 0
best_C = None
best_kernel = None

# NOTE(review): every combination is scored on X_test / y_test, so the
# "best" accuracy below was selected using the test split itself.
for row, kernel in enumerate(kernel_types):
    for col, C in enumerate(C_values):
        svc = SVC(C=C, kernel=kernel).fit(X_train, y_train)

        svc_test_predictions = svc.predict(X_test)
        accuracy = accuracy_score(y_test, svc_test_predictions)
        accuracy_matrix[row, col] = accuracy

        if accuracy > best_accuracy:
            best_accuracy, best_C, best_kernel = accuracy, C, kernel

# Heatmap of the accuracy grid, with C rendered to three decimal places on
# the x axis.
c_tick_labels = [format(value, ".3f") for value in C_values]

plt.figure(figsize=(10, 6))
sns.heatmap(accuracy_matrix, annot=True, fmt=".2f", cmap='Blues',
            xticklabels=c_tick_labels,
            yticklabels=kernel_types)
plt.title('Accuracy for different C values and kernel types')
plt.xlabel('C Value')
plt.ylabel('Kernel Type')
plt.show()

print(f"Best Accuracy: {best_accuracy:.3f}")
print(f"Best C: {best_C:.3f}")
print(f"Best Kernel: {best_kernel}")