[CT414]: Assignment 2 code steps 1-5

This commit is contained in:
2025-03-24 00:06:53 +00:00
parent 045d6ce949
commit a5bb33e012
2 changed files with 132 additions and 52 deletions

View File

@ -1,7 +1,6 @@
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.io.IOException; import java.io.IOException;
@ -12,6 +11,8 @@ import java.util.Scanner;
public class MapReduceFiles { public class MapReduceFiles {
private static final int LINES_PER_MAP_THREAD = 2000;
public static void main(String[] args) { public static void main(String[] args) {
if (args.length < 3) { if (args.length < 3) {
@ -32,13 +33,15 @@ public class MapReduceFiles {
} }
catch (IOException ex) catch (IOException ex)
{ {
System.err.println("Error reading files...\n" + ex.getMessage()); System.err.println("Error reading files...\n" + ex.getMessage());
ex.printStackTrace(); ex.printStackTrace();
System.exit(0); System.exit(0);
} }
// APPROACH #1: Brute force // APPROACH #1: Brute force
{ {
long startTime = System.currentTimeMillis();
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>(); Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator(); Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
@ -66,18 +69,21 @@ public class MapReduceFiles {
} }
} }
// show me: long timeTaken = System.currentTimeMillis() - startTime;
System.out.println(output); System.out.println("Brute Force Results:");
System.out.println("\tTotal Time: " + timeTaken + "\n");
} }
// APPROACH #2: MapReduce // APPROACH #2: MapReduce
{ {
long startTime = System.currentTimeMillis();
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>(); Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
// MAP: // MAP:
long mapStartTime = System.currentTimeMillis();
List<MappedItem> mappedItems = new LinkedList<MappedItem>(); List<MappedItem> mappedItems = new ArrayList<MappedItem>();
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator(); Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
while(inputIter.hasNext()) { while(inputIter.hasNext()) {
@ -87,8 +93,10 @@ public class MapReduceFiles {
map(file, contents, mappedItems); map(file, contents, mappedItems);
} }
long mapTotalTime = System.currentTimeMillis() - mapStartTime;
// GROUP: // GROUP:
long groupStartTime = System.currentTimeMillis();
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>(); Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
@ -99,13 +107,16 @@ public class MapReduceFiles {
String file = item.getFile(); String file = item.getFile();
List<String> list = groupedItems.get(word); List<String> list = groupedItems.get(word);
if (list == null) { if (list == null) {
list = new LinkedList<String>(); list = new ArrayList<String>();
groupedItems.put(word, list); groupedItems.put(word, list);
} }
list.add(file); list.add(file);
} }
long groupTotalTime = System.currentTimeMillis() - groupStartTime;
// REDUCE: // REDUCE:
long reduceStartTime = System.currentTimeMillis();
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator(); Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
while(groupedIter.hasNext()) { while(groupedIter.hasNext()) {
@ -116,17 +127,27 @@ public class MapReduceFiles {
reduce(word, list, output); reduce(word, list, output);
} }
System.out.println(output); long endTime = System.currentTimeMillis();
long reduceTotalTime = endTime - reduceStartTime;
long totalTime = endTime - startTime;
System.out.println("MapReduce Results:");
System.out.println("\tMap Time: " + mapTotalTime);
System.out.println("\tGroup Time: " + groupTotalTime);
System.out.println("\tReduce Time: " + reduceTotalTime);
System.out.println("\tTotal Time: " + totalTime + "\n");
} }
// APPROACH #3: Distributed MapReduce // APPROACH #3: Distributed MapReduce
{ {
long startTime = System.currentTimeMillis();
final Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>(); final Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
// MAP: // MAP:
long mapStartTime = System.currentTimeMillis();
final List<MappedItem> mappedItems = new LinkedList<MappedItem>(); List<MappedItem> mappedItems = new ArrayList<MappedItem>();
final MapCallback<String, MappedItem> mapCallback = new MapCallback<String, MappedItem>() { final MapCallback<String, MappedItem> mapCallback = new MapCallback<String, MappedItem>() {
@Override @Override
@ -135,22 +156,29 @@ public class MapReduceFiles {
} }
}; };
List<Thread> mapCluster = new ArrayList<Thread>(input.size()); List<Thread> mapCluster = new ArrayList<Thread>();
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator(); for (Map.Entry<String, String> entry : input.entrySet()) {
while(inputIter.hasNext()) {
Map.Entry<String, String> entry = inputIter.next();
final String file = entry.getKey(); final String file = entry.getKey();
final String contents = entry.getValue(); final String contents = entry.getValue();
final String[] lines = contents.split("\\r?\\n");
Thread t = new Thread(new Runnable() { for (int i = 0; i < lines.length; i += LINES_PER_MAP_THREAD) {
@Override int end = Math.min(i + LINES_PER_MAP_THREAD, lines.length);
public void run() { final List<String> chunk = new ArrayList<>();
map(file, contents, mapCallback); for (int j = i; j < end; j++) {
chunk.addAll(splitLongLine(lines[j]));
} }
});
mapCluster.add(t); Thread t = new Thread(new Runnable() {
t.start(); @Override
public void run() {
map(file, chunk, mapCallback);
}
});
mapCluster.add(t);
t.start();
}
} }
// wait for mapping phase to be over: // wait for mapping phase to be over:
@ -162,8 +190,10 @@ public class MapReduceFiles {
} }
} }
// GROUP: long mapTotalTime = System.currentTimeMillis() - mapStartTime;
// GROUP:
long groupStartTime = System.currentTimeMillis();
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>(); Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
Iterator<MappedItem> mappedIter = mappedItems.iterator(); Iterator<MappedItem> mappedIter = mappedItems.iterator();
@ -173,13 +203,16 @@ public class MapReduceFiles {
String file = item.getFile(); String file = item.getFile();
List<String> list = groupedItems.get(word); List<String> list = groupedItems.get(word);
if (list == null) { if (list == null) {
list = new LinkedList<String>(); list = new ArrayList<String>();
groupedItems.put(word, list); groupedItems.put(word, list);
} }
list.add(file); list.add(file);
} }
long groupTotalTime = System.currentTimeMillis() - groupStartTime;
// REDUCE: // REDUCE:
long reduceStartTime = System.currentTimeMillis();
final ReduceCallback<String, String, Integer> reduceCallback = new ReduceCallback<String, String, Integer>() { final ReduceCallback<String, String, Integer> reduceCallback = new ReduceCallback<String, String, Integer>() {
@Override @Override
@ -190,16 +223,34 @@ public class MapReduceFiles {
List<Thread> reduceCluster = new ArrayList<Thread>(groupedItems.size()); List<Thread> reduceCluster = new ArrayList<Thread>(groupedItems.size());
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator(); // Replace this constant if you want to try different values for performance tests
while(groupedIter.hasNext()) { final int WORDS_PER_REDUCE_THREAD = 500; // Between 100 and 1000
Map.Entry<String, List<String>> entry = groupedIter.next();
final String word = entry.getKey();
final List<String> list = entry.getValue();
List<Map<String, List<String>>> reduceChunks = new ArrayList<>();
Map<String, List<String>> currentChunk = new HashMap<>();
int count = 0;
// Build chunks of words (100-1000 per thread)
for (Map.Entry<String, List<String>> entry : groupedItems.entrySet()) {
currentChunk.put(entry.getKey(), entry.getValue());
count++;
if (count >= WORDS_PER_REDUCE_THREAD) {
reduceChunks.add(currentChunk);
currentChunk = new HashMap<>();
count = 0;
}
}
if (!currentChunk.isEmpty()) {
reduceChunks.add(currentChunk);
}
for (final Map<String, List<String>> chunk : reduceChunks) {
Thread t = new Thread(new Runnable() { Thread t = new Thread(new Runnable() {
@Override @Override
public void run() { public void run() {
reduce(word, list, reduceCallback); for (Map.Entry<String, List<String>> entry : chunk.entrySet()) {
reduce(entry.getKey(), entry.getValue(), reduceCallback);
}
} }
}); });
reduceCluster.add(t); reduceCluster.add(t);
@ -215,14 +266,40 @@ public class MapReduceFiles {
} }
} }
System.out.println(output); long endTime = System.currentTimeMillis();
long reduceTotalTime = endTime - startTime;
long totalTime = endTime - startTime;
System.out.println("Distributed MapReduce Results:");
System.out.println("\tMap Time: " + mapTotalTime);
System.out.println("\tGroup Time: " + groupTotalTime);
System.out.println("\tReduce Time: " + reduceTotalTime);
System.out.println("\tTotal Time: " + totalTime + "\n");
} }
} }
public static void map(String file, List<String> lines, MapCallback<String, MappedItem> callback) {
List<MappedItem> results = new ArrayList<MappedItem>();
for (String line : lines) {
String[] words = line.trim().split("\s+");
for(String word: words) {
word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
if (!word.isEmpty()) {
results.add(new MappedItem(word, file));
}
}
}
callback.mapDone(file, results);
}
public static void map(String file, String contents, List<MappedItem> mappedItems) { public static void map(String file, String contents, List<MappedItem> mappedItems) {
String[] words = contents.trim().split("\\s+"); String[] words = contents.trim().split("\s+");
for(String word: words) { for(String word: words) {
mappedItems.add(new MappedItem(word, file)); word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
if (!word.isEmpty()) {
mappedItems.add(new MappedItem(word, file));
}
} }
} }
@ -239,25 +316,6 @@ public class MapReduceFiles {
output.put(word, reducedList); output.put(word, reducedList);
} }
public static interface MapCallback<E, V> {
public void mapDone(E key, List<V> values);
}
public static void map(String file, String contents, MapCallback<String, MappedItem> callback) {
String[] words = contents.trim().split("\\s+");
List<MappedItem> results = new ArrayList<MappedItem>(words.length);
for(String word: words) {
results.add(new MappedItem(word, file));
}
callback.mapDone(file, results);
}
public static interface ReduceCallback<E, K, V> {
public void reduceDone(E e, Map<K,V> results);
}
public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) { public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
Map<String, Integer> reducedList = new HashMap<String, Integer>(); Map<String, Integer> reducedList = new HashMap<String, Integer>();
@ -272,6 +330,14 @@ public class MapReduceFiles {
callback.reduceDone(word, reducedList); callback.reduceDone(word, reducedList);
} }
public static interface MapCallback<E, V> {
public void mapDone(E key, List<V> values);
}
public static interface ReduceCallback<E, K, V> {
public void reduceDone(E e, Map<K,V> results);
}
private static class MappedItem { private static class MappedItem {
private final String word; private final String word;
@ -315,4 +381,15 @@ public class MapReduceFiles {
} }
} }
private static List<String> splitLongLine(String line) {
List<String> result = new ArrayList<>();
while (line.length() > 80) {
int splitAt = line.lastIndexOf(' ', 80);
if (splitAt <= 0) splitAt = 80;
result.add(line.substring(0, splitAt));
line = line.substring(splitAt).trim();
}
if (!line.isEmpty()) result.add(line);
return result;
}
} }

File diff suppressed because one or more lines are too long