diff --git a/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/CT4101_A1_Hayes_Andrew_code.zip b/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/CT4101_A1_Hayes_Andrew_code.zip new file mode 100644 index 00000000..63873f0b Binary files /dev/null and b/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/CT4101_A1_Hayes_Andrew_code.zip differ diff --git a/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.ipynb b/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.ipynb new file mode 100644 index 00000000..afbda25e --- /dev/null +++ b/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.ipynb @@ -0,0 +1,464 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Set up" + ], + "metadata": { + "id": "N51l2h_JXXSK" + } + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "CbtWRrEj43kP" + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", + "from sklearn.svm import SVC\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "\n", + "train_data = 
pd.read_csv('/content/drive/MyDrive/wildfires_training.csv')\n", + "test_data = pd.read_csv('/content/drive/MyDrive/wildfires_test.csv')\n", + "\n", + "X_train = train_data.drop(columns=['fire'])\n", + "y_train = train_data['fire']\n", + "\n", + "X_test = test_data.drop(columns=['fire'])\n", + "y_test = test_data['fire']" + ] + }, + { + "cell_type": "markdown", + "source": [ + "RandomForestClassifier with default parameters:" + ], + "metadata": { + "id": "BZzXcEDVXHUB" + } + }, + { + "cell_type": "code", + "source": [ + "# intialise the randomforestclassifier with a set random seed\n", + "rfc = RandomForestClassifier(random_state=0)\n", + "rfc.fit(X_train, y_train)\n", + "\n", + "# train and get accuracy\n", + "train_predictions = rfc.predict(X_train)\n", + "train_accuracy = accuracy_score(y_train, train_predictions)\n", + "train_report = classification_report(y_train, train_predictions)\n", + "\n", + "print(f\"Training Accuracy: {train_accuracy:.4f}\\n\")\n", + "print(\"Classification Report of Testing Results:\")\n", + "print(train_report)\n", + "\n", + "# test and get accuracy\n", + "test_predictions = rfc.predict(X_test)\n", + "test_accuracy = accuracy_score(y_test, test_predictions)\n", + "test_report = classification_report(y_test, test_predictions)\n", + "\n", + "print(f\"Testing Accuracy: {test_accuracy:.4f}\")\n", + "print(\"Classification Report of Testing Results:\")\n", + "print(test_report)\n", + "\n", + "# create a confusion matrix to visualise the data\n", + "cm = confusion_matrix(y_test, test_predictions)\n", + "plt.figure(figsize=(8, 6))\n", + "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n", + " xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))\n", + "plt.ylabel('Actual')\n", + "plt.xlabel('Predicted')\n", + "plt.title('RandomForestClassifier with Default Hyperparametersk')\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 963 + }, + "id": "1DbUwCudVrzZ", + "outputId": 
"aa496dbc-0fd2-4244-ab1f-01a2887cea15" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Training Accuracy: 1.0000\n", + "\n", + "Classification Report of Testing Results:\n", + " precision recall f1-score support\n", + "\n", + " no 1.00 1.00 1.00 75\n", + " yes 1.00 1.00 1.00 79\n", + "\n", + " accuracy 1.00 154\n", + " macro avg 1.00 1.00 1.00 154\n", + "weighted avg 1.00 1.00 1.00 154\n", + "\n", + "Testing Accuracy: 0.8200\n", + "Classification Report of Testing Results:\n", + " precision recall f1-score support\n", + "\n", + " no 0.76 0.86 0.81 22\n", + " yes 0.88 0.79 0.83 28\n", + "\n", + " accuracy 0.82 50\n", + " macro avg 0.82 0.82 0.82 50\n", + "weighted avg 0.83 0.82 0.82 50\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "RandomForestClassifier with tuning:" + ], + "metadata": { + "id": "UMdZFkU_Ag6L" + } + }, + { + "cell_type": "code", + "source": [ + "# initialise a range of hyperparameters to loop over\n", + "n_estimators_range = [1, 3, 5, 10, 50, 100, 250, 500, 1000]\n", + "max_depth_range = [1, 5, 10, 20, 30, None]\n", + "\n", + "# matrix to store the accuracy of each hyperparameter pair\n", + "accuracy_matrix = np.zeros((len(max_depth_range), len(n_estimators_range)))\n", + "\n", + "# variable to track\n", + "best_accuracy = 0\n", + "best_n_estimators = None\n", + "best_max_depth = None\n", + "\n", + "# looping over each hyperparam value\n", + "for i, max_depth in enumerate(max_depth_range):\n", + " for j, n_estimators in enumerate(n_estimators_range):\n", + " rfc = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)\n", + " rfc.fit(X_train, y_train)\n", + "\n", + " y_pred = rfc.predict(X_test)\n", + " accuracy = accuracy_score(y_test, y_pred)\n", + " accuracy_matrix[i, j] = accuracy\n", + "\n", + " if accuracy > best_accuracy:\n", + " best_accuracy = accuracy\n", + " best_n_estimators = n_estimators\n", + " best_max_depth = max_depth\n", + "\n", + "# heatmap of accuracies\n", + "plt.figure(figsize=(10, 5))\n", + "sns.heatmap(accuracy_matrix, annot=True, fmt=\".3f\", cmap=\"YlGnBu\",\n", + " xticklabels=n_estimators_range,\n", + " yticklabels=[str(depth) if depth is not None else \"None\" for depth in max_depth_range])\n", + "plt.title('Accuracy for different n_estimators and max_depth values')\n", + "plt.xlabel('n_estimators')\n", + "plt.ylabel('max_depth')\n", + "plt.show()\n", + "\n", + "print(f\"Best Accuracy: {best_accuracy:.3f}\")\n", + "print(f\"Best n_estimators: {best_n_estimators}\")\n", + "print(f\"Best max_depth: {best_max_depth}\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 
539 + }, + "id": "xXRRktXOBJVE", + "outputId": "2411cb09-4bd8-48a6-8853-b07593bcf9b9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Best Accuracy: 0.860\n", + "Best n_estimators: 3\n", + "Best max_depth: 5\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Pre-process: encode categorical variables" + ], + "metadata": { + "id": "3QRK2tBj_EUI" + } + }, + { + "cell_type": "code", + "source": [ + "encoder = OneHotEncoder(drop='first', sparse_output=False)\n", + "\n", + "y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1)).ravel()\n", + "y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).ravel()\n", + "\n", + "# rename y_train and y_test for convenience\n", + "y_train = y_train_encoded\n", + "y_test = y_test_encoded" + ], + "metadata": { + "id": "oRu7kR48BnF4" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "SVC classifier with default params" + ], + "metadata": { + "id": "mUYkdx6o4edT" + } + }, + { + "cell_type": "code", + "source": [ + "# svc with default params\n", + "svc = SVC()\n", + "svc.fit(X_train, y_train)\n", + "\n", + "# get training accuracy\n", + "svc_train_predictions = svc.predict(X_train)\n", + "svc_train_accuracy = accuracy_score(y_train, svc_train_predictions)\n", + "svc_train_report = classification_report(y_train, svc_train_predictions)\n", + "\n", + "print(f\"Training Accuracy: {svc_train_accuracy:.4f}\\n\")\n", + "print(\"Classification Report of Training Results:\")\n", + "print(svc_train_report)\n", + "\n", + "# get testing accuracy\n", + "svc_test_predictions = svc.predict(X_test)\n", + "svc_test_accuracy = accuracy_score(y_test, svc_test_predictions)\n", + "svc_test_report = classification_report(y_test, svc_test_predictions)\n", + "\n", + "print(f\"Testing Accuracy: {svc_test_accuracy:.4f}\")\n", + "print(\"Classification Report of Testing Results:\")\n", + "print(svc_test_report)\n", + "\n", + "# confusino matrix of the testing accuracy\n", + "cm = 
confusion_matrix(y_test, svc_test_predictions)\n", + "plt.figure(figsize=(8, 6))\n", + "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n", + " xticklabels=['No', 'Yes'],\n", + " yticklabels=['No', 'Yes'])\n", + "plt.xlabel('Predicted Label')\n", + "plt.ylabel('True Label')\n", + "plt.title('Confusion Matrix for Testing Results')\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "aDKZXhKd4mAj", + "outputId": "98e68bd9-1b61-4137-935e-6e00b61b22a3" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Training Accuracy: 0.5130\n", + "\n", + "Classification Report of Training Results:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.00 0.00 0.00 75\n", + " 1.0 0.51 1.00 0.68 79\n", + "\n", + " accuracy 0.51 154\n", + " macro avg 0.26 0.50 0.34 154\n", + "weighted avg 0.26 0.51 0.35 154\n", + "\n", + "Testing Accuracy: 0.5600\n", + "Classification Report of Testing Results:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.00 0.00 0.00 22\n", + " 1.0 0.56 1.00 0.72 28\n", + "\n", + " accuracy 0.56 50\n", + " macro avg 0.28 0.50 0.36 50\n", + "weighted avg 0.31 0.56 0.40 50\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "SVC with hyperparameter tuning" + ], + "metadata": { + "id": "KQ8Qd6px7YCl" + } + }, + { + "cell_type": "code", + "source": [ + "# initialise a range of hyperparameters to loop over\n", + "C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]\n", + "kernel_types = ['linear', 'poly', 'rbf', 'sigmoid']\n", + "\n", + "# matrix to store the accuracy of each hyperparameter pair\n", + "accuracy_matrix = np.zeros((len(kernel_types), len(C_values)))\n", + "\n", + "# variables to track the best accuracy and corresponding hyperparameters\n", + "best_accuracy = 0\n", + "best_C = None\n", + "best_kernel = None\n", + "\n", + "# looping over each hyperparameter value\n", + "for i, kernel in enumerate(kernel_types):\n", + " for j, C in enumerate(C_values):\n", + " svc = SVC(C=C, kernel=kernel)\n", + " svc.fit(X_train, y_train)\n", + "\n", + " svc_test_predictions = svc.predict(X_test)\n", + " accuracy = accuracy_score(y_test, svc_test_predictions)\n", + " accuracy_matrix[i, j] = accuracy\n", + "\n", + " if accuracy > best_accuracy:\n", + " best_accuracy = accuracy\n", + " best_C = C\n", + " best_kernel = kernel\n", + "\n", + "# heatmap of accuracies\n", + "plt.figure(figsize=(10, 6))\n", + "sns.heatmap(accuracy_matrix, annot=True, fmt=\".2f\", cmap='Blues',\n", + " xticklabels=[f\"{C:.3f}\" for C in C_values],\n", + " yticklabels=kernel_types)\n", + "plt.title('Accuracy for different C values and kernel types')\n", + "plt.xlabel('C Value')\n", + "plt.ylabel('Kernel Type')\n", + "plt.show()\n", + "\n", + "print(f\"Best Accuracy: {best_accuracy:.3f}\")\n", + "print(f\"Best C: {best_C:.3f}\")\n", + "print(f\"Best Kernel: {best_kernel}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 616 + }, + "id": "Go2iJS6045OA", + "outputId": "000ce619-442a-435f-c906-0e509f702e34" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": 
"display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Best Accuracy: 0.920\n", + "Best C: 10.000\n", + "Best Kernel: linear\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.py b/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.py new file mode 100644 index 00000000..fe2bf9a5 --- /dev/null +++ b/year4/semester1/CT4101: Machine Learning/assignments/assignment1/code/assignment.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +"""assignment.ipynb + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1ILT6kccc8NHrY7xonz-ERej4MkGAw1Qv + +Set up +""" + +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, classification_report +from sklearn.metrics import accuracy_score, classification_report, confusion_matrix +from sklearn.metrics import confusion_matrix +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler, LabelEncoder +from sklearn.svm import SVC +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +train_data = pd.read_csv('/content/drive/MyDrive/wildfires_training.csv') +test_data = pd.read_csv('/content/drive/MyDrive/wildfires_test.csv') + +X_train = train_data.drop(columns=['fire']) +y_train = train_data['fire'] + +X_test = test_data.drop(columns=['fire']) +y_test = test_data['fire'] + +"""RandomForestClassifier with default parameters:""" + +# intialise the randomforestclassifier with a set random seed +rfc = RandomForestClassifier(random_state=0) +rfc.fit(X_train, y_train) + +# train and get accuracy +train_predictions = rfc.predict(X_train) +train_accuracy = accuracy_score(y_train, train_predictions) +train_report = 
classification_report(y_train, train_predictions) + +print(f"Training Accuracy: {train_accuracy:.4f}\n") +print("Classification Report of Testing Results:") +print(train_report) + +# test and get accuracy +test_predictions = rfc.predict(X_test) +test_accuracy = accuracy_score(y_test, test_predictions) +test_report = classification_report(y_test, test_predictions) + +print(f"Testing Accuracy: {test_accuracy:.4f}") +print("Classification Report of Testing Results:") +print(test_report) + +# create a confusion matrix to visualise the data +cm = confusion_matrix(y_test, test_predictions) +plt.figure(figsize=(8, 6)) +sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', + xticklabels=np.unique(y_test), yticklabels=np.unique(y_test)) +plt.ylabel('Actual') +plt.xlabel('Predicted') +plt.title('Confusion Matrix') +plt.show() + +"""RandomForestClassifier with tuning:""" + +# initialise a range of hyperparameters to loop over +n_estimators_range = [1, 3, 5, 10, 50, 100, 250, 500, 1000] +max_depth_range = [1, 5, 10, 20, 30, None] + +# matrix to store the accuracy of each hyperparameter pair +accuracy_matrix = np.zeros((len(max_depth_range), len(n_estimators_range))) + +# variables to track the best accuracy and corresponding hyperparameters +best_accuracy = 0 +best_n_estimators = None +best_max_depth = None + +# looping over each hyperparam value +for i, max_depth in enumerate(max_depth_range): + for j, n_estimators in enumerate(n_estimators_range): + rfc = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0) + rfc.fit(X_train, y_train) + + y_pred = rfc.predict(X_test) + accuracy = accuracy_score(y_test, y_pred) + accuracy_matrix[i, j] = accuracy + + if accuracy > best_accuracy: + best_accuracy = accuracy + best_n_estimators = n_estimators + best_max_depth = max_depth + +# heatmap of accuracies +plt.figure(figsize=(10, 6)) +sns.heatmap(accuracy_matrix, annot=True, fmt=".3f", cmap="YlGnBu", + xticklabels=n_estimators_range, + yticklabels=[str(depth) if depth is not None else "None" for depth in 
max_depth_range]) +plt.title('Accuracy for different n_estimators and max_depth values') +plt.xlabel('n_estimators') +plt.ylabel('max_depth') +plt.show() + +print(f"Best Accuracy: {best_accuracy:.3f}") +print(f"Best n_estimators: {best_n_estimators}") +print(f"Best max_depth: {best_max_depth}") + +"""SVC classifier with default params on the unprocessed data""" + +# svc with default params +svc = SVC() +svc.fit(X_train, y_train) + +# get training accuracy +svc_train_predictions = svc.predict(X_train) +svc_train_accuracy = accuracy_score(y_train, svc_train_predictions) +svc_train_report = classification_report(y_train, svc_train_predictions) + +print(f"Training Accuracy: {svc_train_accuracy:.4f}\n") +print("Classification Report of Training Results:") +print(svc_train_report) + +# get testing accuracy +svc_test_predictions = svc.predict(X_test) +svc_test_accuracy = accuracy_score(y_test, svc_test_predictions) +svc_test_report = classification_report(y_test, svc_test_predictions) + +print(f"Testing Accuracy: {svc_test_accuracy:.4f}") +print("Classification Report of Testing Results:") +print(svc_test_report) + +# confusion matrix of the testing results +cm = confusion_matrix(y_test, svc_test_predictions) +plt.figure(figsize=(8, 6)) +sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', + xticklabels=['No', 'Yes'], + yticklabels=['No', 'Yes']) +plt.xlabel('Predicted Label') +plt.ylabel('True Label') +plt.title('Confusion Matrix for Testing Results') +plt.show() + +"""SVC with hyperparameter tuning""" + +# initialise a range of hyperparameters to loop over +C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000] +kernel_types = ['linear', 'poly', 'rbf', 'sigmoid'] + +# matrix to store the accuracy of each hyperparameter pair +accuracy_matrix = np.zeros((len(kernel_types), len(C_values))) + +# variables to track the best accuracy and corresponding hyperparameters +best_accuracy = 0 +best_C = None +best_kernel = None + +# looping over each hyperparameter value +for i, 
kernel in enumerate(kernel_types): + for j, C in enumerate(C_values): + svc = SVC(C=C, kernel=kernel) + svc.fit(X_train, y_train) + + svc_test_predictions = svc.predict(X_test) + accuracy = accuracy_score(y_test, svc_test_predictions) + accuracy_matrix[i, j] = accuracy + + if accuracy > best_accuracy: + best_accuracy = accuracy + best_C = C + best_kernel = kernel + +# heatmap of accuracies +plt.figure(figsize=(10, 6)) +sns.heatmap(accuracy_matrix, annot=True, fmt=".2f", cmap='Blues', + xticklabels=[f"{C:.3f}" for C in C_values], + yticklabels=kernel_types) +plt.title('Accuracy for different C values and kernel types') +plt.xlabel('C Value') +plt.ylabel('Kernel Type') +plt.show() + +print(f"Best Accuracy: {best_accuracy:.3f}") +print(f"Best C: {best_C:.3f}") +print(f"Best Kernel: {best_kernel}") \ No newline at end of file