[CT414]: Assignment 2 results

2025-03-24 00:47:26 +00:00
parent a5bb33e012
commit be4f004bcd
11 changed files with 378 additions and 290 deletions

View File

@@ -1,289 +1,162 @@
import java.util.*;
import java.io.*;

public class MapReduceFiles {

  private static final int LINES_PER_MAP_THREAD = 2000;
  private static final String CSV_FILE = "performance_results.csv";

  public static void main(String[] args) {

    if (args.length < 1) {
      System.err.println("Usage: java MapReduceFiles file1.txt file2.txt ... fileN.txt");
      return;
    }

    Map<String, String> input = new HashMap<>();
    try {
      for (String filename : args) {
        input.put(filename, readFile(filename));
      }
    } catch (IOException ex) {
      System.err.println("Error reading files: " + ex.getMessage());
      ex.printStackTrace();
      return;
    }
    int[] mapSizes = {1000, 2000, 5000, 10000};
    int[] reduceSizes = {100, 200, 500, 1000};

    System.out.println("===== Starting Grid Search =====");

    try (PrintWriter writer = new PrintWriter(new FileWriter(CSV_FILE))) {
      writer.println("MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime");

      for (int mapSize : mapSizes) {
        for (int reduceSize : reduceSizes) {
          runDistributedMapReduce(input, mapSize, reduceSize, writer);
        }
      }
    } catch (IOException e) {
      System.err.println("Error writing to CSV file: " + e.getMessage());
    }

    System.out.println("===== Grid Search Complete =====");
    System.out.println("Results saved to: " + CSV_FILE);
  }
  public static void runDistributedMapReduce(Map<String, String> input, int linesPerMapThread, int wordsPerReduceThread, PrintWriter csvWriter) {
    final Map<String, Map<String, Integer>> output = new HashMap<>();

    // MAP Phase
    long mapStartTime = System.currentTimeMillis();
    List<MappedItem> mappedItems = Collections.synchronizedList(new ArrayList<>());

    final MapCallback<String, MappedItem> mapCallback = new MapCallback<>() {
      public synchronized void mapDone(String file, List<MappedItem> results) {
        mappedItems.addAll(results);
      }
    };

    List<Thread> mapCluster = new ArrayList<>();
    for (Map.Entry<String, String> entry : input.entrySet()) {
      final String file = entry.getKey();
      final String[] lines = entry.getValue().split("\\r?\\n");

      for (int i = 0; i < lines.length; i += linesPerMapThread) {
        int end = Math.min(i + linesPerMapThread, lines.length);
        final List<String> chunk = new ArrayList<>();
        for (int j = i; j < end; j++) {
          chunk.addAll(splitLongLine(lines[j]));
        }
        Thread t = new Thread(() -> map(file, chunk, mapCallback));
        mapCluster.add(t);
        t.start();
      }
    }

    // wait for mapping phase to be over:
    for (Thread t : mapCluster) {
      try {
        t.join();
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    long mapTotalTime = System.currentTimeMillis() - mapStartTime;

    // GROUP Phase
    long groupStartTime = System.currentTimeMillis();
    Map<String, List<String>> groupedItems = new HashMap<>();
    for (MappedItem item : mappedItems) {
      groupedItems.computeIfAbsent(item.getWord(), k -> new ArrayList<>()).add(item.getFile());
    }
    long groupTotalTime = System.currentTimeMillis() - groupStartTime;

    // REDUCE Phase
    long reduceStartTime = System.currentTimeMillis();
    final ReduceCallback<String, String, Integer> reduceCallback = (word, result) -> {
      synchronized (output) {
        output.put(word, result);
      }
    };

    List<Thread> reduceCluster = new ArrayList<>();
    List<Map<String, List<String>>> reduceChunks = new ArrayList<>();
    Map<String, List<String>> currentChunk = new HashMap<>();
    int count = 0;
    for (Map.Entry<String, List<String>> entry : groupedItems.entrySet()) {
      currentChunk.put(entry.getKey(), entry.getValue());
      count++;
      if (count >= wordsPerReduceThread) {
        reduceChunks.add(currentChunk);
        currentChunk = new HashMap<>();
        count = 0;
      }
    }
    if (!currentChunk.isEmpty()) reduceChunks.add(currentChunk);

    for (final Map<String, List<String>> chunk : reduceChunks) {
      Thread t = new Thread(() -> {
        for (Map.Entry<String, List<String>> entry : chunk.entrySet()) {
          reduce(entry.getKey(), entry.getValue(), reduceCallback);
        }
      });
      reduceCluster.add(t);
      t.start();
    }

    // wait for reducing phase to be over:
    for (Thread t : reduceCluster) {
      try {
        t.join();
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    long reduceTotalTime = System.currentTimeMillis() - reduceStartTime;
    long totalTime = mapTotalTime + groupTotalTime + reduceTotalTime;

    // Print & Log
    System.out.println("MapLines: " + linesPerMapThread + ", ReduceWords: " + wordsPerReduceThread);
    System.out.println("\tMap Time: " + mapTotalTime + " ms");
    System.out.println("\tGroup Time: " + groupTotalTime + " ms");
    System.out.println("\tReduce Time: " + reduceTotalTime + " ms");
    System.out.println("\tTotal Time: " + totalTime + " ms");
    System.out.println("----------------------------------------------------");

    csvWriter.printf("%d,%d,%d,%d,%d,%d%n",
        linesPerMapThread, wordsPerReduceThread,
        mapTotalTime, groupTotalTime, reduceTotalTime, totalTime);
    csvWriter.flush();
  }
  public static void map(String file, List<String> lines, MapCallback<String, MappedItem> callback) {
    List<MappedItem> results = new ArrayList<>();
    for (String line : lines) {
      String[] words = line.trim().split("\\s+");
      for (String word : words) {
        word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
        if (!word.isEmpty()) {
          results.add(new MappedItem(word, file));
@@ -293,53 +166,23 @@ public class MapReduceFiles {
    callback.mapDone(file, results);
  }

  public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
    Map<String, Integer> reducedList = new HashMap<>();
    for (String file : list) {
      reducedList.put(file, reducedList.getOrDefault(file, 0) + 1);
    }
    callback.reduceDone(word, reducedList);
  }
  public interface MapCallback<E, V> {
    void mapDone(E key, List<V> values);
  }

  public interface ReduceCallback<E, K, V> {
    void reduceDone(E e, Map<K, V> results);
  }

  private static class MappedItem {

    private final String word;
    private final String file;
@@ -369,11 +212,8 @@ public class MapReduceFiles {
    String lineSeparator = System.getProperty("line.separator");
    try {
      while (scanner.hasNextLine()) {
        fileContents.append(scanner.nextLine()).append(lineSeparator);
      }
      return fileContents.toString();
    } finally {

View File

@@ -0,0 +1,17 @@
MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime
1000,100,2099,406,335,2840
1000,200,1610,454,198,2262
1000,500,1388,452,46,1886
1000,1000,1538,302,48,1888
2000,100,1726,314,263,2303
2000,200,1512,323,62,1897
2000,500,1669,334,46,2049
2000,1000,1762,279,113,2154
5000,100,1291,331,92,1714
5000,200,1877,368,67,2312
5000,500,1640,396,41,2077
5000,1000,1439,365,193,1997
10000,100,1285,359,94,1738
10000,200,1598,359,98,2055
10000,500,1489,314,68,1871
10000,1000,1460,332,47,1839
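Reading the table above, the smallest total runtime in this run is 1714 ms, at 5000 lines per map thread and 100 words per reduce thread. A minimal sketch for picking that row out programmatically, assuming pandas is installed and the grid-search output is saved as performance_results.csv in the working directory (this snippet is illustrative and not part of the repository):

```python
import pandas as pd

# Load the grid-search results written by MapReduceFiles
df = pd.read_csv('performance_results.csv')

# Row with the smallest total runtime
best = df.loc[df['TotalTime'].idxmin()]
print(f"Fastest run: MapLines={best.MapLines}, ReduceWords={best.ReduceWords}, "
      f"TotalTime={best.TotalTime} ms")
```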

View File

@@ -0,0 +1,27 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the grid-search results produced by MapReduceFiles
df = pd.read_csv('performance_results.csv')

def save_heatmap(metric, title, filename):
    # Rows: lines per map thread, columns: words per reduce thread
    pivot = df.pivot(index='MapLines', columns='ReduceWords', values=metric)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        pivot,
        annot=True,
        fmt="d",
        cmap="RdYlGn_r",
        cbar_kws={'label': 'Time (ms)'}
    )
    plt.title(title)
    plt.ylabel("Lines per Map Thread")
    plt.xlabel("Words per Reduce Thread")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved: {filename}")

save_heatmap('TotalTime', 'Total Time (ms)', '../latex/images/total_time_heatmap.png')
save_heatmap('MapTime', 'Map Time (ms)', '../latex/images/map_time_heatmap.png')
save_heatmap('ReduceTime', 'Reduce Time (ms)', '../latex/images/reduce_time_heatmap.png')
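The script saves its figures into ../latex/images/, which it assumes already exists. If that may not be the case, a small guard (not part of the script above, shown only as a sketch) can be placed before the save_heatmap calls:

```python
import os

# Optional guard: create the output directory used by the save_heatmap calls
os.makedirs('../latex/images', exist_ok=True)
```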