[CT414]: Assignment 2 code steps 1-5
This commit is contained in:
@ -1,7 +1,6 @@
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
@ -12,6 +11,8 @@ import java.util.Scanner;
|
||||
|
||||
public class MapReduceFiles {
|
||||
|
||||
private static final int LINES_PER_MAP_THREAD = 2000;
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
||||
if (args.length < 3) {
|
||||
@ -39,6 +40,8 @@ public class MapReduceFiles {
|
||||
|
||||
// APPROACH #1: Brute force
|
||||
{
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
||||
|
||||
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
||||
@ -66,18 +69,21 @@ public class MapReduceFiles {
|
||||
}
|
||||
}
|
||||
|
||||
// show me:
|
||||
System.out.println(output);
|
||||
long timeTaken = System.currentTimeMillis() - startTime;
|
||||
System.out.println("Brute Force Results:");
|
||||
System.out.println("\tTotal Time: " + timeTaken + "\n");
|
||||
}
|
||||
|
||||
|
||||
// APPROACH #2: MapReduce
|
||||
{
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
||||
|
||||
// MAP:
|
||||
|
||||
List<MappedItem> mappedItems = new LinkedList<MappedItem>();
|
||||
long mapStartTime = System.currentTimeMillis();
|
||||
List<MappedItem> mappedItems = new ArrayList<MappedItem>();
|
||||
|
||||
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
||||
while(inputIter.hasNext()) {
|
||||
@ -87,8 +93,10 @@ public class MapReduceFiles {
|
||||
|
||||
map(file, contents, mappedItems);
|
||||
}
|
||||
long mapTotalTime = System.currentTimeMillis() - mapStartTime;
|
||||
|
||||
// GROUP:
|
||||
long groupStartTime = System.currentTimeMillis();
|
||||
|
||||
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
|
||||
|
||||
@ -99,13 +107,16 @@ public class MapReduceFiles {
|
||||
String file = item.getFile();
|
||||
List<String> list = groupedItems.get(word);
|
||||
if (list == null) {
|
||||
list = new LinkedList<String>();
|
||||
list = new ArrayList<String>();
|
||||
groupedItems.put(word, list);
|
||||
}
|
||||
list.add(file);
|
||||
}
|
||||
|
||||
long groupTotalTime = System.currentTimeMillis() - groupStartTime;
|
||||
|
||||
// REDUCE:
|
||||
long reduceStartTime = System.currentTimeMillis();
|
||||
|
||||
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
|
||||
while(groupedIter.hasNext()) {
|
||||
@ -116,17 +127,27 @@ public class MapReduceFiles {
|
||||
reduce(word, list, output);
|
||||
}
|
||||
|
||||
System.out.println(output);
|
||||
long endTime = System.currentTimeMillis();
|
||||
long reduceTotalTime = endTime - reduceStartTime;
|
||||
long totalTime = endTime - startTime;
|
||||
|
||||
System.out.println("MapReduce Results:");
|
||||
System.out.println("\tMap Time: " + mapTotalTime);
|
||||
System.out.println("\tGroup Time: " + groupTotalTime);
|
||||
System.out.println("\tReduce Time: " + reduceTotalTime);
|
||||
System.out.println("\tTotal Time: " + totalTime + "\n");
|
||||
}
|
||||
|
||||
|
||||
// APPROACH #3: Distributed MapReduce
|
||||
{
|
||||
long startTime = System.currentTimeMillis();
|
||||
final Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
||||
|
||||
// MAP:
|
||||
long mapStartTime = System.currentTimeMillis();
|
||||
|
||||
final List<MappedItem> mappedItems = new LinkedList<MappedItem>();
|
||||
List<MappedItem> mappedItems = new ArrayList<MappedItem>();
|
||||
|
||||
final MapCallback<String, MappedItem> mapCallback = new MapCallback<String, MappedItem>() {
|
||||
@Override
|
||||
@ -135,23 +156,30 @@ public class MapReduceFiles {
|
||||
}
|
||||
};
|
||||
|
||||
List<Thread> mapCluster = new ArrayList<Thread>(input.size());
|
||||
List<Thread> mapCluster = new ArrayList<Thread>();
|
||||
|
||||
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
||||
while(inputIter.hasNext()) {
|
||||
Map.Entry<String, String> entry = inputIter.next();
|
||||
for (Map.Entry<String, String> entry : input.entrySet()) {
|
||||
final String file = entry.getKey();
|
||||
final String contents = entry.getValue();
|
||||
final String[] lines = contents.split("\\r?\\n");
|
||||
|
||||
for (int i = 0; i < lines.length; i += LINES_PER_MAP_THREAD) {
|
||||
int end = Math.min(i + LINES_PER_MAP_THREAD, lines.length);
|
||||
final List<String> chunk = new ArrayList<>();
|
||||
for (int j = i; j < end; j++) {
|
||||
chunk.addAll(splitLongLine(lines[j]));
|
||||
}
|
||||
|
||||
Thread t = new Thread(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
map(file, contents, mapCallback);
|
||||
map(file, chunk, mapCallback);
|
||||
}
|
||||
});
|
||||
mapCluster.add(t);
|
||||
t.start();
|
||||
}
|
||||
}
|
||||
|
||||
// wait for mapping phase to be over:
|
||||
for(Thread t : mapCluster) {
|
||||
@ -162,8 +190,10 @@ public class MapReduceFiles {
|
||||
}
|
||||
}
|
||||
|
||||
// GROUP:
|
||||
long mapTotalTime = System.currentTimeMillis() - mapStartTime;
|
||||
|
||||
// GROUP:
|
||||
long groupStartTime = System.currentTimeMillis();
|
||||
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
|
||||
|
||||
Iterator<MappedItem> mappedIter = mappedItems.iterator();
|
||||
@ -173,13 +203,16 @@ public class MapReduceFiles {
|
||||
String file = item.getFile();
|
||||
List<String> list = groupedItems.get(word);
|
||||
if (list == null) {
|
||||
list = new LinkedList<String>();
|
||||
list = new ArrayList<String>();
|
||||
groupedItems.put(word, list);
|
||||
}
|
||||
list.add(file);
|
||||
}
|
||||
|
||||
long groupTotalTime = System.currentTimeMillis() - groupStartTime;
|
||||
|
||||
// REDUCE:
|
||||
long reduceStartTime = System.currentTimeMillis();
|
||||
|
||||
final ReduceCallback<String, String, Integer> reduceCallback = new ReduceCallback<String, String, Integer>() {
|
||||
@Override
|
||||
@ -190,16 +223,34 @@ public class MapReduceFiles {
|
||||
|
||||
List<Thread> reduceCluster = new ArrayList<Thread>(groupedItems.size());
|
||||
|
||||
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
|
||||
while(groupedIter.hasNext()) {
|
||||
Map.Entry<String, List<String>> entry = groupedIter.next();
|
||||
final String word = entry.getKey();
|
||||
final List<String> list = entry.getValue();
|
||||
// Replace this constant if you want to try different values for performance tests
|
||||
final int WORDS_PER_REDUCE_THREAD = 500; // Between 100 and 1000
|
||||
|
||||
List<Map<String, List<String>>> reduceChunks = new ArrayList<>();
|
||||
Map<String, List<String>> currentChunk = new HashMap<>();
|
||||
int count = 0;
|
||||
|
||||
// Build chunks of words (100-1000 per thread)
|
||||
for (Map.Entry<String, List<String>> entry : groupedItems.entrySet()) {
|
||||
currentChunk.put(entry.getKey(), entry.getValue());
|
||||
count++;
|
||||
if (count >= WORDS_PER_REDUCE_THREAD) {
|
||||
reduceChunks.add(currentChunk);
|
||||
currentChunk = new HashMap<>();
|
||||
count = 0;
|
||||
}
|
||||
}
|
||||
if (!currentChunk.isEmpty()) {
|
||||
reduceChunks.add(currentChunk);
|
||||
}
|
||||
|
||||
for (final Map<String, List<String>> chunk : reduceChunks) {
|
||||
Thread t = new Thread(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
reduce(word, list, reduceCallback);
|
||||
for (Map.Entry<String, List<String>> entry : chunk.entrySet()) {
|
||||
reduce(entry.getKey(), entry.getValue(), reduceCallback);
|
||||
}
|
||||
}
|
||||
});
|
||||
reduceCluster.add(t);
|
||||
@ -215,16 +266,42 @@ public class MapReduceFiles {
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println(output);
|
||||
long endTime = System.currentTimeMillis();
|
||||
long reduceTotalTime = endTime - startTime;
|
||||
long totalTime = endTime - startTime;
|
||||
|
||||
|
||||
System.out.println("Distributed MapReduce Results:");
|
||||
System.out.println("\tMap Time: " + mapTotalTime);
|
||||
System.out.println("\tGroup Time: " + groupTotalTime);
|
||||
System.out.println("\tReduce Time: " + reduceTotalTime);
|
||||
System.out.println("\tTotal Time: " + totalTime + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
public static void map(String file, String contents, List<MappedItem> mappedItems) {
|
||||
String[] words = contents.trim().split("\\s+");
|
||||
public static void map(String file, List<String> lines, MapCallback<String, MappedItem> callback) {
|
||||
List<MappedItem> results = new ArrayList<MappedItem>();
|
||||
for (String line : lines) {
|
||||
String[] words = line.trim().split("\s+");
|
||||
for(String word: words) {
|
||||
word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
|
||||
if (!word.isEmpty()) {
|
||||
results.add(new MappedItem(word, file));
|
||||
}
|
||||
}
|
||||
}
|
||||
callback.mapDone(file, results);
|
||||
}
|
||||
|
||||
public static void map(String file, String contents, List<MappedItem> mappedItems) {
|
||||
String[] words = contents.trim().split("\s+");
|
||||
for(String word: words) {
|
||||
word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
|
||||
if (!word.isEmpty()) {
|
||||
mappedItems.add(new MappedItem(word, file));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void reduce(String word, List<String> list, Map<String, Map<String, Integer>> output) {
|
||||
Map<String, Integer> reducedList = new HashMap<String, Integer>();
|
||||
@ -239,25 +316,6 @@ public class MapReduceFiles {
|
||||
output.put(word, reducedList);
|
||||
}
|
||||
|
||||
public static interface MapCallback<E, V> {
|
||||
|
||||
public void mapDone(E key, List<V> values);
|
||||
}
|
||||
|
||||
public static void map(String file, String contents, MapCallback<String, MappedItem> callback) {
|
||||
String[] words = contents.trim().split("\\s+");
|
||||
List<MappedItem> results = new ArrayList<MappedItem>(words.length);
|
||||
for(String word: words) {
|
||||
results.add(new MappedItem(word, file));
|
||||
}
|
||||
callback.mapDone(file, results);
|
||||
}
|
||||
|
||||
public static interface ReduceCallback<E, K, V> {
|
||||
|
||||
public void reduceDone(E e, Map<K,V> results);
|
||||
}
|
||||
|
||||
public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
|
||||
|
||||
Map<String, Integer> reducedList = new HashMap<String, Integer>();
|
||||
@ -272,6 +330,14 @@ public class MapReduceFiles {
|
||||
callback.reduceDone(word, reducedList);
|
||||
}
|
||||
|
||||
public static interface MapCallback<E, V> {
|
||||
public void mapDone(E key, List<V> values);
|
||||
}
|
||||
|
||||
public static interface ReduceCallback<E, K, V> {
|
||||
public void reduceDone(E e, Map<K,V> results);
|
||||
}
|
||||
|
||||
private static class MappedItem {
|
||||
|
||||
private final String word;
|
||||
@ -315,4 +381,15 @@ public class MapReduceFiles {
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> splitLongLine(String line) {
|
||||
List<String> result = new ArrayList<>();
|
||||
while (line.length() > 80) {
|
||||
int splitAt = line.lastIndexOf(' ', 80);
|
||||
if (splitAt <= 0) splitAt = 80;
|
||||
result.add(line.substring(0, splitAt));
|
||||
line = line.substring(splitAt).trim();
|
||||
}
|
||||
if (!line.isEmpty()) result.add(line);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user