[CT414]: Assignment 2 code steps 1-5
This commit is contained in:
@ -1,7 +1,6 @@
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -12,6 +11,8 @@ import java.util.Scanner;
|
|||||||
|
|
||||||
public class MapReduceFiles {
|
public class MapReduceFiles {
|
||||||
|
|
||||||
|
private static final int LINES_PER_MAP_THREAD = 2000;
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
|
|
||||||
if (args.length < 3) {
|
if (args.length < 3) {
|
||||||
@ -32,13 +33,15 @@ public class MapReduceFiles {
|
|||||||
}
|
}
|
||||||
catch (IOException ex)
|
catch (IOException ex)
|
||||||
{
|
{
|
||||||
System.err.println("Error reading files...\n" + ex.getMessage());
|
System.err.println("Error reading files...\n" + ex.getMessage());
|
||||||
ex.printStackTrace();
|
ex.printStackTrace();
|
||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// APPROACH #1: Brute force
|
// APPROACH #1: Brute force
|
||||||
{
|
{
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
||||||
|
|
||||||
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
||||||
@ -66,18 +69,21 @@ public class MapReduceFiles {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// show me:
|
long timeTaken = System.currentTimeMillis() - startTime;
|
||||||
System.out.println(output);
|
System.out.println("Brute Force Results:");
|
||||||
|
System.out.println("\tTotal Time: " + timeTaken + "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// APPROACH #2: MapReduce
|
// APPROACH #2: MapReduce
|
||||||
{
|
{
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
||||||
|
|
||||||
// MAP:
|
// MAP:
|
||||||
|
long mapStartTime = System.currentTimeMillis();
|
||||||
List<MappedItem> mappedItems = new LinkedList<MappedItem>();
|
List<MappedItem> mappedItems = new ArrayList<MappedItem>();
|
||||||
|
|
||||||
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
||||||
while(inputIter.hasNext()) {
|
while(inputIter.hasNext()) {
|
||||||
@ -87,8 +93,10 @@ public class MapReduceFiles {
|
|||||||
|
|
||||||
map(file, contents, mappedItems);
|
map(file, contents, mappedItems);
|
||||||
}
|
}
|
||||||
|
long mapTotalTime = System.currentTimeMillis() - mapStartTime;
|
||||||
|
|
||||||
// GROUP:
|
// GROUP:
|
||||||
|
long groupStartTime = System.currentTimeMillis();
|
||||||
|
|
||||||
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
|
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
|
||||||
|
|
||||||
@ -99,13 +107,16 @@ public class MapReduceFiles {
|
|||||||
String file = item.getFile();
|
String file = item.getFile();
|
||||||
List<String> list = groupedItems.get(word);
|
List<String> list = groupedItems.get(word);
|
||||||
if (list == null) {
|
if (list == null) {
|
||||||
list = new LinkedList<String>();
|
list = new ArrayList<String>();
|
||||||
groupedItems.put(word, list);
|
groupedItems.put(word, list);
|
||||||
}
|
}
|
||||||
list.add(file);
|
list.add(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long groupTotalTime = System.currentTimeMillis() - groupStartTime;
|
||||||
|
|
||||||
// REDUCE:
|
// REDUCE:
|
||||||
|
long reduceStartTime = System.currentTimeMillis();
|
||||||
|
|
||||||
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
|
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
|
||||||
while(groupedIter.hasNext()) {
|
while(groupedIter.hasNext()) {
|
||||||
@ -116,17 +127,27 @@ public class MapReduceFiles {
|
|||||||
reduce(word, list, output);
|
reduce(word, list, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.println(output);
|
long endTime = System.currentTimeMillis();
|
||||||
|
long reduceTotalTime = endTime - reduceStartTime;
|
||||||
|
long totalTime = endTime - startTime;
|
||||||
|
|
||||||
|
System.out.println("MapReduce Results:");
|
||||||
|
System.out.println("\tMap Time: " + mapTotalTime);
|
||||||
|
System.out.println("\tGroup Time: " + groupTotalTime);
|
||||||
|
System.out.println("\tReduce Time: " + reduceTotalTime);
|
||||||
|
System.out.println("\tTotal Time: " + totalTime + "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// APPROACH #3: Distributed MapReduce
|
// APPROACH #3: Distributed MapReduce
|
||||||
{
|
{
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
final Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
final Map<String, Map<String, Integer>> output = new HashMap<String, Map<String, Integer>>();
|
||||||
|
|
||||||
// MAP:
|
// MAP:
|
||||||
|
long mapStartTime = System.currentTimeMillis();
|
||||||
|
|
||||||
final List<MappedItem> mappedItems = new LinkedList<MappedItem>();
|
List<MappedItem> mappedItems = new ArrayList<MappedItem>();
|
||||||
|
|
||||||
final MapCallback<String, MappedItem> mapCallback = new MapCallback<String, MappedItem>() {
|
final MapCallback<String, MappedItem> mapCallback = new MapCallback<String, MappedItem>() {
|
||||||
@Override
|
@Override
|
||||||
@ -135,22 +156,29 @@ public class MapReduceFiles {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
List<Thread> mapCluster = new ArrayList<Thread>(input.size());
|
List<Thread> mapCluster = new ArrayList<Thread>();
|
||||||
|
|
||||||
Iterator<Map.Entry<String, String>> inputIter = input.entrySet().iterator();
|
for (Map.Entry<String, String> entry : input.entrySet()) {
|
||||||
while(inputIter.hasNext()) {
|
|
||||||
Map.Entry<String, String> entry = inputIter.next();
|
|
||||||
final String file = entry.getKey();
|
final String file = entry.getKey();
|
||||||
final String contents = entry.getValue();
|
final String contents = entry.getValue();
|
||||||
|
final String[] lines = contents.split("\\r?\\n");
|
||||||
|
|
||||||
Thread t = new Thread(new Runnable() {
|
for (int i = 0; i < lines.length; i += LINES_PER_MAP_THREAD) {
|
||||||
@Override
|
int end = Math.min(i + LINES_PER_MAP_THREAD, lines.length);
|
||||||
public void run() {
|
final List<String> chunk = new ArrayList<>();
|
||||||
map(file, contents, mapCallback);
|
for (int j = i; j < end; j++) {
|
||||||
|
chunk.addAll(splitLongLine(lines[j]));
|
||||||
}
|
}
|
||||||
});
|
|
||||||
mapCluster.add(t);
|
Thread t = new Thread(new Runnable() {
|
||||||
t.start();
|
@Override
|
||||||
|
public void run() {
|
||||||
|
map(file, chunk, mapCallback);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
mapCluster.add(t);
|
||||||
|
t.start();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// wait for mapping phase to be over:
|
// wait for mapping phase to be over:
|
||||||
@ -162,8 +190,10 @@ public class MapReduceFiles {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// GROUP:
|
long mapTotalTime = System.currentTimeMillis() - mapStartTime;
|
||||||
|
|
||||||
|
// GROUP:
|
||||||
|
long groupStartTime = System.currentTimeMillis();
|
||||||
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
|
Map<String, List<String>> groupedItems = new HashMap<String, List<String>>();
|
||||||
|
|
||||||
Iterator<MappedItem> mappedIter = mappedItems.iterator();
|
Iterator<MappedItem> mappedIter = mappedItems.iterator();
|
||||||
@ -173,13 +203,16 @@ public class MapReduceFiles {
|
|||||||
String file = item.getFile();
|
String file = item.getFile();
|
||||||
List<String> list = groupedItems.get(word);
|
List<String> list = groupedItems.get(word);
|
||||||
if (list == null) {
|
if (list == null) {
|
||||||
list = new LinkedList<String>();
|
list = new ArrayList<String>();
|
||||||
groupedItems.put(word, list);
|
groupedItems.put(word, list);
|
||||||
}
|
}
|
||||||
list.add(file);
|
list.add(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
long groupTotalTime = System.currentTimeMillis() - groupStartTime;
|
||||||
|
|
||||||
// REDUCE:
|
// REDUCE:
|
||||||
|
long reduceStartTime = System.currentTimeMillis();
|
||||||
|
|
||||||
final ReduceCallback<String, String, Integer> reduceCallback = new ReduceCallback<String, String, Integer>() {
|
final ReduceCallback<String, String, Integer> reduceCallback = new ReduceCallback<String, String, Integer>() {
|
||||||
@Override
|
@Override
|
||||||
@ -190,16 +223,34 @@ public class MapReduceFiles {
|
|||||||
|
|
||||||
List<Thread> reduceCluster = new ArrayList<Thread>(groupedItems.size());
|
List<Thread> reduceCluster = new ArrayList<Thread>(groupedItems.size());
|
||||||
|
|
||||||
Iterator<Map.Entry<String, List<String>>> groupedIter = groupedItems.entrySet().iterator();
|
// Replace this constant if you want to try different values for performance tests
|
||||||
while(groupedIter.hasNext()) {
|
final int WORDS_PER_REDUCE_THREAD = 500; // Between 100 and 1000
|
||||||
Map.Entry<String, List<String>> entry = groupedIter.next();
|
|
||||||
final String word = entry.getKey();
|
|
||||||
final List<String> list = entry.getValue();
|
|
||||||
|
|
||||||
|
List<Map<String, List<String>>> reduceChunks = new ArrayList<>();
|
||||||
|
Map<String, List<String>> currentChunk = new HashMap<>();
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
// Build chunks of words (100-1000 per thread)
|
||||||
|
for (Map.Entry<String, List<String>> entry : groupedItems.entrySet()) {
|
||||||
|
currentChunk.put(entry.getKey(), entry.getValue());
|
||||||
|
count++;
|
||||||
|
if (count >= WORDS_PER_REDUCE_THREAD) {
|
||||||
|
reduceChunks.add(currentChunk);
|
||||||
|
currentChunk = new HashMap<>();
|
||||||
|
count = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!currentChunk.isEmpty()) {
|
||||||
|
reduceChunks.add(currentChunk);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (final Map<String, List<String>> chunk : reduceChunks) {
|
||||||
Thread t = new Thread(new Runnable() {
|
Thread t = new Thread(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
reduce(word, list, reduceCallback);
|
for (Map.Entry<String, List<String>> entry : chunk.entrySet()) {
|
||||||
|
reduce(entry.getKey(), entry.getValue(), reduceCallback);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
reduceCluster.add(t);
|
reduceCluster.add(t);
|
||||||
@ -215,14 +266,40 @@ public class MapReduceFiles {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.println(output);
|
long endTime = System.currentTimeMillis();
|
||||||
|
long reduceTotalTime = endTime - startTime;
|
||||||
|
long totalTime = endTime - startTime;
|
||||||
|
|
||||||
|
|
||||||
|
System.out.println("Distributed MapReduce Results:");
|
||||||
|
System.out.println("\tMap Time: " + mapTotalTime);
|
||||||
|
System.out.println("\tGroup Time: " + groupTotalTime);
|
||||||
|
System.out.println("\tReduce Time: " + reduceTotalTime);
|
||||||
|
System.out.println("\tTotal Time: " + totalTime + "\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void map(String file, List<String> lines, MapCallback<String, MappedItem> callback) {
|
||||||
|
List<MappedItem> results = new ArrayList<MappedItem>();
|
||||||
|
for (String line : lines) {
|
||||||
|
String[] words = line.trim().split("\s+");
|
||||||
|
for(String word: words) {
|
||||||
|
word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
|
||||||
|
if (!word.isEmpty()) {
|
||||||
|
results.add(new MappedItem(word, file));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
callback.mapDone(file, results);
|
||||||
|
}
|
||||||
|
|
||||||
public static void map(String file, String contents, List<MappedItem> mappedItems) {
|
public static void map(String file, String contents, List<MappedItem> mappedItems) {
|
||||||
String[] words = contents.trim().split("\\s+");
|
String[] words = contents.trim().split("\s+");
|
||||||
for(String word: words) {
|
for(String word: words) {
|
||||||
mappedItems.add(new MappedItem(word, file));
|
word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
|
||||||
|
if (!word.isEmpty()) {
|
||||||
|
mappedItems.add(new MappedItem(word, file));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -239,25 +316,6 @@ public class MapReduceFiles {
|
|||||||
output.put(word, reducedList);
|
output.put(word, reducedList);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static interface MapCallback<E, V> {
|
|
||||||
|
|
||||||
public void mapDone(E key, List<V> values);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void map(String file, String contents, MapCallback<String, MappedItem> callback) {
|
|
||||||
String[] words = contents.trim().split("\\s+");
|
|
||||||
List<MappedItem> results = new ArrayList<MappedItem>(words.length);
|
|
||||||
for(String word: words) {
|
|
||||||
results.add(new MappedItem(word, file));
|
|
||||||
}
|
|
||||||
callback.mapDone(file, results);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static interface ReduceCallback<E, K, V> {
|
|
||||||
|
|
||||||
public void reduceDone(E e, Map<K,V> results);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
|
public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
|
||||||
|
|
||||||
Map<String, Integer> reducedList = new HashMap<String, Integer>();
|
Map<String, Integer> reducedList = new HashMap<String, Integer>();
|
||||||
@ -272,6 +330,14 @@ public class MapReduceFiles {
|
|||||||
callback.reduceDone(word, reducedList);
|
callback.reduceDone(word, reducedList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Callback through which a map task reports its results.
 *
 * @param <E> type of the key the results belong to
 * @param <V> type of the mapped values
 */
public static interface MapCallback<E, V> {
|
||||||
|
/**
 * Called with the key that was mapped and the list of values produced for it.
 */
public void mapDone(E key, List<V> values);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Callback through which a reduce task reports its results.
 *
 * @param <E> type of the element that was reduced
 * @param <K> key type of the result map
 * @param <V> value type of the result map
 */
public static interface ReduceCallback<E, K, V> {
|
||||||
|
/**
 * Called with the reduced element and the map of results computed for it.
 */
public void reduceDone(E e, Map<K,V> results);
|
||||||
|
}
|
||||||
|
|
||||||
private static class MappedItem {
|
private static class MappedItem {
|
||||||
|
|
||||||
private final String word;
|
private final String word;
|
||||||
@ -315,4 +381,15 @@ public class MapReduceFiles {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static List<String> splitLongLine(String line) {
|
||||||
|
List<String> result = new ArrayList<>();
|
||||||
|
while (line.length() > 80) {
|
||||||
|
int splitAt = line.lastIndexOf(' ', 80);
|
||||||
|
if (splitAt <= 0) splitAt = 80;
|
||||||
|
result.add(line.substring(0, splitAt));
|
||||||
|
line = line.substring(splitAt).trim();
|
||||||
|
}
|
||||||
|
if (!line.isEmpty()) result.add(line);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user