[CT414]: Assignment 2 results
@@ -1,187 +1,80 @@
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.io.IOException;
-import java.io.File;
-import java.io.FileReader;
-import java.io.BufferedReader;
-import java.util.Scanner;
+import java.util.*;
+import java.io.*;

 public class MapReduceFiles {

-  private static final int LINES_PER_MAP_THREAD = 2000;
+  private static final String CSV_FILE = "performance_results.csv";

   public static void main(String[] args) {

-    if (args.length < 3) {
-      System.err.println("usage: java MapReduceFiles file1.txt file2.txt file3.txt");
+    if (args.length < 1) {
+      System.err.println("Usage: java MapReduceFiles file1.txt file2.txt ... fileN.txt");
       return;
     }

-    Map<String, String> input = new HashMap<String, String>();
+    Map<String, String> input = new HashMap<>();
     try {
-      input.put(args[0], readFile(args[0]));
-      input.put(args[1], readFile(args[1]));
-      input.put(args[2], readFile(args[2]));
+      for (String filename : args) {
+        input.put(filename, readFile(filename));
+      }
-    }
-    catch (IOException ex)
-    {
-      System.err.println("Error reading files...\n" + ex.getMessage());
+    } catch (IOException ex) {
+      System.err.println("Error reading files: " + ex.getMessage());
+      ex.printStackTrace();
-      System.exit(0);
+      return;
     }

-    // APPROACH #1: Brute force
-    {
     long startTime = System.currentTimeMillis();
+    int[] mapSizes = {1000, 2000, 5000, 10000};
+    int[] reduceSizes = {100, 200, 500, 1000};

-      Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
+    System.out.println("===== Starting Grid Search =====");

-      Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
-      while(inputIter.hasNext()) {
-        Map.Entry<String, String> entry = inputIter.next();
-        String file = entry.getKey();
-        String contents = entry.getValue();
+    try (PrintWriter writer = new PrintWriter(new FileWriter(CSV_FILE))) {
+      writer.println("MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime");

-        String[] words = contents.trim().split("\\s+");
-        for(String word : words) {
-          Map<String, Integer> files = output.get(word);
-          if (files == null) {
-            files = new HashMap<String, Integer>();
-            output.put(word, files);
-          }
-          Integer occurrences = files.remove(file);
-          if (occurrences == null) {
-            files.put(file, 1);
-          } else {
-            files.put(file, occurrences.intValue() + 1);
+      for (int mapSize : mapSizes) {
+        for (int reduceSize : reduceSizes) {
+          runDistributedMapReduce(input, mapSize, reduceSize, writer);
+        }
+      }
+    } catch (IOException e) {
+      System.err.println("Error writing to CSV file: " + e.getMessage());
+    }

     long timeTaken = System.currentTimeMillis() - startTime;
-    System.out.println("Brute Force Results:");
-    System.out.println("\tTotal Time: " + timeTaken + "\n");
+    System.out.println("===== Grid Search Complete =====");
+    System.out.println("Results saved to: " + CSV_FILE);
   }

+  public static void runDistributedMapReduce(Map<String, String> input, int linesPerMapThread, int wordsPerReduceThread, PrintWriter csvWriter) {
+    final Map<String, Map<String, Integer>> output = new HashMap<>();

-    // APPROACH #2: MapReduce
-    {
-      long startTime = System.currentTimeMillis();
-      Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();

-      // MAP:
+    // MAP Phase
     long mapStartTime = System.currentTimeMillis();
-      List<MappedItem> mappedItems = new ArrayList<MappedItem>();
+    List<MappedItem> mappedItems = Collections.synchronizedList(new ArrayList<>());

-      Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
-      while(inputIter.hasNext()) {
-        Map.Entry<String, String> entry = inputIter.next();
-        String file = entry.getKey();
-        String contents = entry.getValue();
-        map(file, contents, mappedItems);
-      }
-      long mapTotalTime = System.currentTimeMillis() - mapStartTime;

-      // GROUP:
-      long groupStartTime = System.currentTimeMillis();
-      Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
-      Iterator<MappedItem> mappedIter = mappedItems.iterator();
-      while(mappedIter.hasNext()) {
-        MappedItem item = mappedIter.next();
-        String word = item.getWord();
-        String file = item.getFile();
-        List<String> list = groupedItems.get(word);
-        if (list == null) {
-          list = new ArrayList<String>();
-          groupedItems.put(word, list);
-        }
-        list.add(file);
-      }
-      long groupTotalTime = System.currentTimeMillis() - groupStartTime;

-      // REDUCE:
-      long reduceStartTime = System.currentTimeMillis();
-      Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
-      while(groupedIter.hasNext()) {
-        Map.Entry<String, List<String>> entry = groupedIter.next();
-        String word = entry.getKey();
-        List<String> list = entry.getValue();
-        reduce(word, list, output);
-      }

-      long endTime = System.currentTimeMillis();
-      long reduceTotalTime = endTime - reduceStartTime;
-      long totalTime = endTime - startTime;

-      System.out.println("MapReduce Results:");
-      System.out.println("\tMap Time: " + mapTotalTime);
-      System.out.println("\tGroup Time: " + groupTotalTime);
-      System.out.println("\tReduce Time: " + reduceTotalTime);
-      System.out.println("\tTotal Time: " + totalTime + "\n");
-    }

-    // APPROACH #3: Distributed MapReduce
-    {
-      long startTime = System.currentTimeMillis();
-      final Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();

-      // MAP:
-      long mapStartTime = System.currentTimeMillis();
-      List<MappedItem> mappedItems = new ArrayList<MappedItem>();

-    final MapCallback<String, MappedItem> mapCallback = new MapCallback<String, MappedItem>() {
-      @Override
+    final MapCallback<String, MappedItem> mapCallback = new MapCallback<>() {
       public synchronized void mapDone(String file, List<MappedItem> results) {
         mappedItems.addAll(results);
       }
     };

-    List<Thread> mapCluster = new ArrayList<Thread>();
+    List<Thread> mapCluster = new ArrayList<>();
     for (Map.Entry<String, String> entry : input.entrySet()) {
       final String file = entry.getKey();
-      final String contents = entry.getValue();
-      final String[] lines = contents.split("\\r?\\n");
+      final String[] lines = entry.getValue().split("\\r?\\n");

-      for (int i = 0; i < lines.length; i += LINES_PER_MAP_THREAD) {
-        int end = Math.min(i + LINES_PER_MAP_THREAD, lines.length);
+      for (int i = 0; i < lines.length; i += linesPerMapThread) {
+        int end = Math.min(i + linesPerMapThread, lines.length);
         final List<String> chunk = new ArrayList<>();
         for (int j = i; j < end; j++) {
           chunk.addAll(splitLongLine(lines[j]));
         }

-        Thread t = new Thread(new Runnable() {
-          @Override
-          public void run() {
-            map(file, chunk, mapCallback);
-          }
-        });
+        Thread t = new Thread(() -> map(file, chunk, mapCallback));
         mapCluster.add(t);
         t.start();
       }
     }

     // wait for mapping phase to be over:
     for (Thread t : mapCluster) {
       try {
         t.join();
@@ -192,72 +85,48 @@ public class MapReduceFiles {

     long mapTotalTime = System.currentTimeMillis() - mapStartTime;

-    // GROUP:
+    // GROUP Phase
     long groupStartTime = System.currentTimeMillis();
-    Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
-    Iterator<MappedItem> mappedIter = mappedItems.iterator();
-    while(mappedIter.hasNext()) {
-      MappedItem item = mappedIter.next();
-      String word = item.getWord();
-      String file = item.getFile();
-      List<String> list = groupedItems.get(word);
-      if (list == null) {
-        list = new ArrayList<String>();
-        groupedItems.put(word, list);
+    Map<String, List<String>> groupedItems = new HashMap<>();
+    for (MappedItem item : mappedItems) {
+      groupedItems.computeIfAbsent(item.getWord(), k -> new ArrayList<>()).add(item.getFile());
     }
-      list.add(file);
-    }

     long groupTotalTime = System.currentTimeMillis() - groupStartTime;

-    // REDUCE:
+    // REDUCE Phase
     long reduceStartTime = System.currentTimeMillis();

-    final ReduceCallback<String, String, Integer> reduceCallback = new ReduceCallback<String, String, Integer>() {
-      @Override
-      public synchronized void reduceDone(String k, Map<String, Integer> v) {
-        output.put(k, v);
+    final ReduceCallback<String, String, Integer> reduceCallback = (word, result) -> {
+      synchronized (output) {
+        output.put(word, result);
       }
     };

-    List<Thread> reduceCluster = new ArrayList<Thread>(groupedItems.size());

-    // Replace this constant if you want to try different values for performance tests
-    final int WORDS_PER_REDUCE_THREAD = 500; // Between 100 and 1000

+    List<Thread> reduceCluster = new ArrayList<>();
     List<Map<String, List<String>>> reduceChunks = new ArrayList<>();
     Map<String, List<String>> currentChunk = new HashMap<>();
     int count = 0;

     // Build chunks of words (100-1000 per thread)
     for (Map.Entry<String, List<String>> entry : groupedItems.entrySet()) {
       currentChunk.put(entry.getKey(), entry.getValue());
       count++;
-      if (count >= WORDS_PER_REDUCE_THREAD) {
+      if (count >= wordsPerReduceThread) {
         reduceChunks.add(currentChunk);
         currentChunk = new HashMap<>();
         count = 0;
       }
     }
-    if (!currentChunk.isEmpty()) {
-      reduceChunks.add(currentChunk);
-    }
+    if (!currentChunk.isEmpty()) reduceChunks.add(currentChunk);

     for (final Map<String, List<String>> chunk : reduceChunks) {
-      Thread t = new Thread(new Runnable() {
-        @Override
-        public void run() {
+      Thread t = new Thread(() -> {
         for (Map.Entry<String, List<String>> entry : chunk.entrySet()) {
           reduce(entry.getKey(), entry.getValue(), reduceCallback);
         }
-        }
       });
       reduceCluster.add(t);
       t.start();
     }

     // wait for reducing phase to be over:
     for (Thread t : reduceCluster) {
       try {
         t.join();
@@ -266,23 +135,27 @@ public class MapReduceFiles {
       }
     }

-    long endTime = System.currentTimeMillis();
-    long reduceTotalTime = endTime - startTime;
-    long totalTime = endTime - startTime;
+    long reduceTotalTime = System.currentTimeMillis() - reduceStartTime;
+    long totalTime = mapTotalTime + groupTotalTime + reduceTotalTime;

+    // Print & Log
+    System.out.println("MapLines: " + linesPerMapThread + ", ReduceWords: " + wordsPerReduceThread);
+    System.out.println("\tMap Time: " + mapTotalTime + " ms");
+    System.out.println("\tGroup Time: " + groupTotalTime + " ms");
+    System.out.println("\tReduce Time: " + reduceTotalTime + " ms");
+    System.out.println("\tTotal Time: " + totalTime + " ms");
+    System.out.println("----------------------------------------------------");

-    System.out.println("Distributed MapReduce Results:");
-    System.out.println("\tMap Time: " + mapTotalTime);
-    System.out.println("\tGroup Time: " + groupTotalTime);
-    System.out.println("\tReduce Time: " + reduceTotalTime);
-    System.out.println("\tTotal Time: " + totalTime + "\n");
-    }

+    csvWriter.printf("%d,%d,%d,%d,%d,%d%n",
+        linesPerMapThread, wordsPerReduceThread,
+        mapTotalTime, groupTotalTime, reduceTotalTime, totalTime);
+    csvWriter.flush();
   }

public static void map(String file, List<String> lines, MapCallback<String, MappedItem> callback) {
-    List<MappedItem> results = new ArrayList<MappedItem>();
+    List<MappedItem> results = new ArrayList<>();
     for (String line : lines) {
-      String[] words = line.trim().split("\s+");
+      String[] words = line.trim().split("\\s+");
       for (String word : words) {
         word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
         if (!word.isEmpty()) {
@@ -293,53 +166,23 @@ public class MapReduceFiles {
     callback.mapDone(file, results);
   }

-  public static void map(String file, String contents, List<MappedItem> mappedItems) {
-    String[] words = contents.trim().split("\s+");
-    for(String word: words) {
-      word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
-      if (!word.isEmpty()) {
-        mappedItems.add(new MappedItem(word, file));
-      }
-    }
-  }

-  public static void reduce(String word, List<String> list, Map<String, Map<String, Integer>> output) {
-    Map<String, Integer> reducedList = new HashMap<String, Integer>();
-    for(String file: list) {
-      Integer occurrences = reducedList.get(file);
-      if (occurrences == null) {
-        reducedList.put(file, 1);
-      } else {
-        reducedList.put(file, occurrences.intValue() + 1);
-      }
-    }
-    output.put(word, reducedList);
-  }

   public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
-    Map<String, Integer> reducedList = new HashMap<String, Integer>();
+    Map<String, Integer> reducedList = new HashMap<>();
     for (String file : list) {
-      Integer occurrences = reducedList.get(file);
-      if (occurrences == null) {
-        reducedList.put(file, 1);
-      } else {
-        reducedList.put(file, occurrences.intValue() + 1);
-      }
+      reducedList.put(file, reducedList.getOrDefault(file, 0) + 1);
     }
     callback.reduceDone(word, reducedList);
   }

-  public static interface MapCallback<E, V> {
-    public void mapDone(E key, List<V> values);
+  public interface MapCallback<E, V> {
+    void mapDone(E key, List<V> values);
   }

-  public static interface ReduceCallback<E, K, V> {
-    public void reduceDone(E e, Map<K,V> results);
+  public interface ReduceCallback<E, K, V> {
+    void reduceDone(E e, Map<K, V> results);
   }

   private static class MappedItem {

     private final String word;
     private final String file;

@@ -369,11 +212,8 @@ public class MapReduceFiles {
     String lineSeparator = System.getProperty("line.separator");

     try {
-      if (scanner.hasNextLine()) {
-        fileContents.append(scanner.nextLine());
-      }
       while (scanner.hasNextLine()) {
-        fileContents.append(lineSeparator + scanner.nextLine());
+        fileContents.append(scanner.nextLine()).append(lineSeparator);
       }
       return fileContents.toString();
     } finally {
@@ -0,0 +1,17 @@
MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime
1000,100,2099,406,335,2840
1000,200,1610,454,198,2262
1000,500,1388,452,46,1886
1000,1000,1538,302,48,1888
2000,100,1726,314,263,2303
2000,200,1512,323,62,1897
2000,500,1669,334,46,2049
2000,1000,1762,279,113,2154
5000,100,1291,331,92,1714
5000,200,1877,368,67,2312
5000,500,1640,396,41,2077
5000,1000,1439,365,193,1997
10000,100,1285,359,94,1738
10000,200,1598,359,98,2055
10000,500,1489,314,68,1871
10000,1000,1460,332,47,1839
year4/semester2/CT414/assignments/assignment2/code/plots.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('performance_results.csv')

def save_heatmap(metric, title, filename):
    pivot = df.pivot(index='MapLines', columns='ReduceWords', values=metric)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        pivot,
        annot=True,
        fmt="d",
        cmap="RdYlGn_r",
        cbar_kws={'label': 'Time (ms)'}
    )
    plt.title(title)
    plt.ylabel("Lines per Map Thread")
    plt.xlabel("Words per Reduce Thread")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved: {filename}")

save_heatmap('TotalTime', 'Total Time (ms)', '../latex/images/total_time_heatmap.png')
save_heatmap('MapTime', 'Map Time (ms)', '../latex/images/map_time_heatmap.png')
save_heatmap('ReduceTime', 'Reduce Time (ms)', '../latex/images/reduce_time_heatmap.png')
Binary file not shown.
@@ -0,0 +1,204 @@
%! TeX program = lualatex
\documentclass[a4paper]{article}

% packages
\usepackage{microtype} % Slightly tweak font spacing for aesthetics
\usepackage[english]{babel} % Language hyphenation and typographical rules
\usepackage{changepage} % adjust margins on the fly
\usepackage{booktabs} % For better-looking tables
\usepackage{pgfplotstable} % For reading and displaying CSV/TSV files
\usepackage{appendix}

\usepackage[final, colorlinks = true, urlcolor = black, linkcolor = black, citecolor = black]{hyperref}
\usepackage{fontspec}
% \setmainfont{EB Garamond}
% \setmonofont[Scale=MatchLowercase]{Deja Vu Sans Mono}

\setmainfont{EB Garamond}[
    Ligatures=TeX,
    Numbers=OldStyle
]

% Fallback font (for missing characters)
\setmainfont{EB Garamond}[
    % Ligatures=TeX,
    % Numbers=OldStyle
]

\newfontfamily{\emojifont}{Noto Color Emoji}[Renderer=Harfbuzz]

% Monospace font configuration
\setmonofont[Scale=MatchLowercase]{DejaVu Sans Mono}

\usepackage[backend=biber, style=numeric, date=iso, urldate=iso]{biblatex}
\addbibresource{references.bib}
\DeclareFieldFormat{urldate}{Accessed on: #1}

\usepackage{minted}
\usemintedstyle{algol_nu}
\usepackage{xcolor}

\usepackage{pgfplots}
\pgfplotsset{width=\textwidth,compat=1.9}

\usepackage{caption}
\newenvironment{code}{\captionsetup{type=listing}}{}
\captionsetup[listing]{skip=0pt}
\setlength{\abovecaptionskip}{5pt}
\setlength{\belowcaptionskip}{5pt}

\usepackage[yyyymmdd]{datetime}
\renewcommand{\dateseparator}{--}

\usepackage{titlesec}
% \titleformat{\section}{\LARGE\bfseries}{}{}{}[\titlerule]
% \titleformat{\subsection}{\Large\bfseries}{}{0em}{}
% \titlespacing{\subsection}{0em}{-0.7em}{0em}
%
% \titleformat{\subsubsection}{\large\bfseries}{}{0em}{$\bullet$ }
% \titlespacing{\subsubsection}{1em}{-0.7em}{0em}

% margins
\addtolength{\hoffset}{-2.25cm}
\addtolength{\textwidth}{4.5cm}
\addtolength{\voffset}{-3.25cm}
\addtolength{\textheight}{5cm}
\setlength{\parskip}{0pt}
\setlength{\parindent}{0in}
% \setcounter{secnumdepth}{0}

\begin{document}
\hrule \medskip
\begin{minipage}{0.295\textwidth}
    \raggedright
    \footnotesize
    \begin{tabular}{@{}l l}
        Name: & Andrew Hayes \\
        Student ID: & 21321503 \\
        E-mail: & \href{mailto://a.hayes18@universityofgalway.ie}{\texttt{a.hayes18@universityofgalway.ie}} \\
    \end{tabular}
\end{minipage}
\begin{minipage}{0.4\textwidth}
    \centering
    \vspace{0.4em}
    \LARGE
    \textsc{ct414} \\
\end{minipage}
\begin{minipage}{0.295\textwidth}
    \raggedleft
    \today
\end{minipage}
\medskip\hrule
\begin{center}
    \normalsize
    Assignment 2: MapReduce
\end{center}
\hrule
\medskip

\section{Set-Up}
To obtain large text files to test the program with, I downloaded the first 10 long books I could think of from \url{archive.org} in \verb|txt| file form.
These were:
\begin{enumerate}
    \item The Bible;
    \item \textit{War \& Peace} by Leo Tolstoy;
    \item Plutarch's \textit{Lives};
    \item Herodotus' \textit{Histories};
    \item \textit{City of God} by Augustine of Hippo;
    \item \textit{Faust} by Goethe;
    \item \textit{Wealth of Nations} by Adam Smith;
    \item \textit{Capital} by Karl Marx;
    \item The complete works of William Shakespeare;
    \item \textit{Structure \& Interpretation of Computer Programs} by Harold Abelson \& Gerald Jay Sussman.
\end{enumerate}

\section{Baseline Results}
I modified the code to measure \& output the time taken by each approach, in milliseconds.
I also added per-phase timing to the two MapReduce implementations, measuring the map, group, \& reduce times separately.

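Each phase uses the same simple wall-clock pattern; a minimal sketch of it is below (the full instrumented source is reproduced in the appendix):

\begin{minted}{java}
// Sketch: wall-clock timing of a single phase, e.g. the map phase
long mapStartTime = System.currentTimeMillis();
// ... run the map phase ...
long mapTotalTime = System.currentTimeMillis() - mapStartTime;
System.out.println("\tMap Time: " + mapTotalTime + " ms");
\end{minted}
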
\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{./images/baseline.png}
    \caption{Baseline results for my list of files (in milliseconds)}
\end{figure}

As can be seen from the above terminal screenshot, the brute force approach performed best with no modifications, followed by the non-distributed MapReduce, followed by the distributed MapReduce;
this is to be expected, as the brute force approach is the simplest \& requires the fewest iterations over the data and no complex data structures.
The non-distributed MapReduce requires more intermediate data structures and more iterations over the data.
Finally, the non-optimised version of the distributed MapReduce is the slowest because it spawns a thread for each word in the dataset, placing massive stress on the CPU and memory.
\\\\
I also updated the code to use \mintinline{java}{ArrayList}s rather than \mintinline{java}{LinkedList}s, to reduce memory overhead and allow faster traversal; a sketch of the change is shown below.

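The shape of the substitution (a simplified sketch, not the exact lines from the source):

\begin{minted}{java}
// Before: linked list; per-node allocations and pointer-chasing on iteration
// List<MappedItem> mappedItems = new LinkedList<>();

// After: array-backed list; contiguous storage and cheaper iteration
List<MappedItem> mappedItems = new ArrayList<>();
\end{minted}
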
\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{./images/arraylist.png}
    \caption{Baseline results with \mintinline{java}{ArrayList} update (in milliseconds)}
\end{figure}

As can be seen from the above terminal screenshot, this has no effect on the brute force results (besides slight variance due to background processes running on my laptop), as this approach did not use \mintinline{java}{LinkedList}s anyway.
The non-distributed MapReduce approach was significantly faster due to the faster iteration and lower memory overhead.
The distributed MapReduce saw significant improvements in the map \& group phases, but these were dwarfed by the still greatly inefficient reduce phase.

\section{Testing the Updated Code}
After implementing the requested changes in steps 2--6 of the assignment specification, I implemented a grid-search function that tested a range of values for the number of lines of text per map thread and the number of words per reduce thread; a simplified sketch of the driver loop is shown below.
The results of this grid-search were exported to a CSV file for analysis.
I then wrote a Python script to visualise the parameter combinations using heatmaps.

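The driver is just a pair of nested loops over the two parameter ranges, writing one CSV row per combination (a sketch condensed from the full source in the appendix):

\begin{minted}{java}
int[] mapSizes = {1000, 2000, 5000, 10000};
int[] reduceSizes = {100, 200, 500, 1000};

try (PrintWriter writer = new PrintWriter(new FileWriter("performance_results.csv"))) {
    writer.println("MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime");
    for (int mapSize : mapSizes) {
        for (int reduceSize : reduceSizes) {
            // times each phase and appends a CSV row via the writer
            runDistributedMapReduce(input, mapSize, reduceSize, writer);
        }
    }
}
\end{minted}
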
\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{./images/gridsearch.png}
    \caption{Running the grid-search and plotting the results}
\end{figure}

\begin{table}[H]
    \centering
    \pgfplotstabletypeset[
        col sep=comma,
        string type,
        header=true,
        columns/MapLines/.style={column name=Map Lines},
        columns/ReduceWords/.style={column name=Reduce Words},
        columns/MapTime/.style={column name=Map Time (ms)},
        columns/GroupTime/.style={column name=Group Time (ms)},
        columns/ReduceTime/.style={column name=Reduce Time (ms)},
        columns/TotalTime/.style={column name=Total Time (ms)},
        every head row/.style={before row=\toprule, after row=\midrule},
        every last row/.style={after row=\bottomrule}
    ]{../code/performance_results.csv}
    \caption{Results written to \texttt{performance\_results.csv}}
\end{table}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./images/total_time_heatmap.png}
    \caption{Heatmap of total time taken by each parameter combination}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./images/map_time_heatmap.png}
    \caption{Heatmap of time taken during the map phase by each parameter combination}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{./images/reduce_time_heatmap.png}
    \caption{Heatmap of time taken during the reduce phase by each parameter combination}
\end{figure}

\section{Appendix: Source Code}
\begin{code}
\inputminted[linenos, breaklines, frame=single]{java}{../code/MapReduceFiles.java}
\caption{\texttt{MapReduceFiles.java}}
\end{code}

\begin{code}
\inputminted[linenos, breaklines, frame=single]{python}{../code/plots.py}
\caption{\texttt{plots.py}}
\end{code}

\end{document}
Binary file not shown. (After: 237 KiB)
Binary file not shown. (After: 243 KiB)
Binary file not shown. (After: 702 KiB)
Binary file not shown. (After: 39 KiB)
Binary file not shown. (After: 33 KiB)
Binary file not shown. (After: 38 KiB)