[CT414]: Assignment 2 results

2025-03-24 00:47:26 +00:00
parent a5bb33e012
commit be4f004bcd
11 changed files with 378 additions and 290 deletions

View File

@@ -1,289 +1,162 @@
import java.util.*;
import java.io.*;

public class MapReduceFiles {

  private static final int LINES_PER_MAP_THREAD = 2000;
  private static final String CSV_FILE = "performance_results.csv";

  public static void main(String[] args) {

    if (args.length < 1) {
      System.err.println("Usage: java MapReduceFiles file1.txt file2.txt ... fileN.txt");
      return;
    }

    Map<String, String> input = new HashMap<>();
    try {
      for (String filename : args) {
        input.put(filename, readFile(filename));
      }
    } catch (IOException ex) {
      System.err.println("Error reading files: " + ex.getMessage());
      ex.printStackTrace();
      return;
    }
    int[] mapSizes = {1000, 2000, 5000, 10000};
    int[] reduceSizes = {100, 200, 500, 1000};

    System.out.println("===== Starting Grid Search =====");

    try (PrintWriter writer = new PrintWriter(new FileWriter(CSV_FILE))) {
      writer.println("MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime");

      for (int mapSize : mapSizes) {
        for (int reduceSize : reduceSizes) {
          runDistributedMapReduce(input, mapSize, reduceSize, writer);
        }
      }
    } catch (IOException e) {
      System.err.println("Error writing to CSV file: " + e.getMessage());
    }

    System.out.println("===== Grid Search Complete =====");
    System.out.println("Results saved to: " + CSV_FILE);
  }
  public static void runDistributedMapReduce(Map<String, String> input, int linesPerMapThread, int wordsPerReduceThread, PrintWriter csvWriter) {
    final Map<String, Map<String, Integer>> output = new HashMap<>();

    // MAP Phase
    long mapStartTime = System.currentTimeMillis();
    List<MappedItem> mappedItems = Collections.synchronizedList(new ArrayList<>());

    final MapCallback<String, MappedItem> mapCallback = new MapCallback<>() {
      public synchronized void mapDone(String file, List<MappedItem> results) {
        mappedItems.addAll(results);
      }
    };

    List<Thread> mapCluster = new ArrayList<>();
    for (Map.Entry<String, String> entry : input.entrySet()) {
      final String file = entry.getKey();
      final String[] lines = entry.getValue().split("\\r?\\n");

      for (int i = 0; i < lines.length; i += linesPerMapThread) {
        int end = Math.min(i + linesPerMapThread, lines.length);
        final List<String> chunk = new ArrayList<>();
        for (int j = i; j < end; j++) {
          chunk.addAll(splitLongLine(lines[j]));
        }
        Thread t = new Thread(() -> map(file, chunk, mapCallback));
        mapCluster.add(t);
        t.start();
      }
    }

    // wait for mapping phase to be over:
    for (Thread t : mapCluster) {
      try {
        t.join();
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    long mapTotalTime = System.currentTimeMillis() - mapStartTime;

    // GROUP Phase
    long groupStartTime = System.currentTimeMillis();
    Map<String, List<String>> groupedItems = new HashMap<>();
    for (MappedItem item : mappedItems) {
      groupedItems.computeIfAbsent(item.getWord(), k -> new ArrayList<>()).add(item.getFile());
    }
    long groupTotalTime = System.currentTimeMillis() - groupStartTime;

    // REDUCE Phase
    long reduceStartTime = System.currentTimeMillis();
    final ReduceCallback<String, String, Integer> reduceCallback = (word, result) -> {
      synchronized (output) {
        output.put(word, result);
      }
    };

    List<Thread> reduceCluster = new ArrayList<>();
    List<Map<String, List<String>>> reduceChunks = new ArrayList<>();
    Map<String, List<String>> currentChunk = new HashMap<>();
    int count = 0;
    for (Map.Entry<String, List<String>> entry : groupedItems.entrySet()) {
      currentChunk.put(entry.getKey(), entry.getValue());
      count++;
      if (count >= wordsPerReduceThread) {
        reduceChunks.add(currentChunk);
        currentChunk = new HashMap<>();
        count = 0;
      }
    }
    if (!currentChunk.isEmpty()) reduceChunks.add(currentChunk);

    for (final Map<String, List<String>> chunk : reduceChunks) {
      Thread t = new Thread(() -> {
        for (Map.Entry<String, List<String>> entry : chunk.entrySet()) {
          reduce(entry.getKey(), entry.getValue(), reduceCallback);
        }
      });
      reduceCluster.add(t);
      t.start();
    }

    // wait for reducing phase to be over:
    for (Thread t : reduceCluster) {
      try {
        t.join();
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    long reduceTotalTime = System.currentTimeMillis() - reduceStartTime;
    long totalTime = mapTotalTime + groupTotalTime + reduceTotalTime;

    // Print & Log
    System.out.println("MapLines: " + linesPerMapThread + ", ReduceWords: " + wordsPerReduceThread);
    System.out.println("\tMap Time: " + mapTotalTime + " ms");
    System.out.println("\tGroup Time: " + groupTotalTime + " ms");
    System.out.println("\tReduce Time: " + reduceTotalTime + " ms");
    System.out.println("\tTotal Time: " + totalTime + " ms");
    System.out.println("----------------------------------------------------");

    csvWriter.printf("%d,%d,%d,%d,%d,%d%n",
        linesPerMapThread, wordsPerReduceThread,
        mapTotalTime, groupTotalTime, reduceTotalTime, totalTime);
    csvWriter.flush();
  }
  public static void map(String file, List<String> lines, MapCallback<String, MappedItem> callback) {
    List<MappedItem> results = new ArrayList<>();
    for (String line : lines) {
      String[] words = line.trim().split("\\s+");
      for (String word : words) {
        word = word.replaceAll("[^a-zA-Z]", "").toLowerCase();
        if (!word.isEmpty()) {
          results.add(new MappedItem(word, file));
@@ -293,53 +166,23 @@ public class MapReduceFiles {
    callback.mapDone(file, results);
  }

  public static void reduce(String word, List<String> list, ReduceCallback<String, String, Integer> callback) {
    Map<String, Integer> reducedList = new HashMap<>();
    for (String file : list) {
      reducedList.put(file, reducedList.getOrDefault(file, 0) + 1);
    }
    callback.reduceDone(word, reducedList);
  }
  public interface MapCallback<E, V> {
    void mapDone(E key, List<V> values);
  }

  public interface ReduceCallback<E, K, V> {
    void reduceDone(E e, Map<K, V> results);
  }

  private static class MappedItem {

    private final String word;
    private final String file;
@@ -369,11 +212,8 @@ public class MapReduceFiles {
    String lineSeparator = System.getProperty("line.separator");
    try {
      while (scanner.hasNextLine()) {
        fileContents.append(scanner.nextLine()).append(lineSeparator);
      }
      return fileContents.toString();
    } finally {

View File

@@ -0,0 +1,17 @@
MapLines,ReduceWords,MapTime,GroupTime,ReduceTime,TotalTime
1000,100,2099,406,335,2840
1000,200,1610,454,198,2262
1000,500,1388,452,46,1886
1000,1000,1538,302,48,1888
2000,100,1726,314,263,2303
2000,200,1512,323,62,1897
2000,500,1669,334,46,2049
2000,1000,1762,279,113,2154
5000,100,1291,331,92,1714
5000,200,1877,368,67,2312
5000,500,1640,396,41,2077
5000,1000,1439,365,193,1997
10000,100,1285,359,94,1738
10000,200,1598,359,98,2055
10000,500,1489,314,68,1871
10000,1000,1460,332,47,1839
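Reading the table above, the smallest total runtime in this run is 1714 ms, at 5000 lines per map thread and 100 words per reduce thread. A minimal sketch for picking that row out programmatically, assuming pandas is installed and the grid-search output is saved as performance_results.csv in the working directory (this snippet is illustrative and not part of the repository):

```python
import pandas as pd

# Load the grid-search results written by MapReduceFiles
df = pd.read_csv('performance_results.csv')

# Row with the smallest total runtime
best = df.loc[df['TotalTime'].idxmin()]
print(f"Fastest run: MapLines={best.MapLines}, ReduceWords={best.ReduceWords}, "
      f"TotalTime={best.TotalTime} ms")
```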

View File

@@ -0,0 +1,27 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the grid-search results produced by MapReduceFiles
df = pd.read_csv('performance_results.csv')

def save_heatmap(metric, title, filename):
    # Rows: lines per map thread, columns: words per reduce thread
    pivot = df.pivot(index='MapLines', columns='ReduceWords', values=metric)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        pivot,
        annot=True,
        fmt="d",
        cmap="RdYlGn_r",
        cbar_kws={'label': 'Time (ms)'}
    )
    plt.title(title)
    plt.ylabel("Lines per Map Thread")
    plt.xlabel("Words per Reduce Thread")
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    print(f"Saved: {filename}")

save_heatmap('TotalTime', 'Total Time (ms)', '../latex/images/total_time_heatmap.png')
save_heatmap('MapTime', 'Map Time (ms)', '../latex/images/map_time_heatmap.png')
save_heatmap('ReduceTime', 'Reduce Time (ms)', '../latex/images/reduce_time_heatmap.png')
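The script saves its figures into ../latex/images/, which it assumes already exists. If that may not be the case, a small guard (not part of the script above, shown only as a sketch) can be placed before the save_heatmap calls:

```python
import os

# Optional guard: create the output directory used by the save_heatmap calls
os.makedirs('../latex/images', exist_ok=True)
```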